linux/arch/s390/mm/fault.c
   1/*
   2 *  S390 version
   3 *    Copyright IBM Corp. 1999
   4 *    Author(s): Hartmut Penner (hp@de.ibm.com)
   5 *               Ulrich Weigand (uweigand@de.ibm.com)
   6 *
   7 *  Derived from "arch/i386/mm/fault.c"
   8 *    Copyright (C) 1995  Linus Torvalds
   9 */
  10
  11#include <linux/kernel_stat.h>
  12#include <linux/perf_event.h>
  13#include <linux/signal.h>
  14#include <linux/sched.h>
  15#include <linux/kernel.h>
  16#include <linux/errno.h>
  17#include <linux/string.h>
  18#include <linux/types.h>
  19#include <linux/ptrace.h>
  20#include <linux/mman.h>
  21#include <linux/mm.h>
  22#include <linux/compat.h>
  23#include <linux/smp.h>
  24#include <linux/kdebug.h>
  25#include <linux/init.h>
  26#include <linux/console.h>
  27#include <linux/module.h>
  28#include <linux/hardirq.h>
  29#include <linux/kprobes.h>
  30#include <linux/uaccess.h>
  31#include <linux/hugetlb.h>
  32#include <asm/asm-offsets.h>
  33#include <asm/diag.h>
  34#include <asm/pgtable.h>
  35#include <asm/irq.h>
  36#include <asm/mmu_context.h>
  37#include <asm/facility.h>
  38#include "../kernel/entry.h"
  39
  40#define __FAIL_ADDR_MASK -4096L
  41#define __SUBCODE_MASK 0x0600
  42#define __PF_RES_FIELD 0x8000000000000000ULL
  43
  44#define VM_FAULT_BADCONTEXT     0x010000
  45#define VM_FAULT_BADMAP         0x020000
  46#define VM_FAULT_BADACCESS      0x040000
  47#define VM_FAULT_SIGNAL         0x080000
  48#define VM_FAULT_PFAULT         0x100000
  49
  50static unsigned long store_indication __read_mostly;
  51
  52static int __init fault_init(void)
  53{
  54        if (test_facility(75))
  55                store_indication = 0xc00;
  56        return 0;
  57}
  58early_initcall(fault_init);
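
fault_init() arms store_indication with 0xc00 when facility 75 is installed; do_exception() later tests (trans_exc_code & store_indication) == 0x400 to treat a fault as a write access even when the exception type alone would not say so. Below is a minimal userspace sketch of that predicate, assuming illustrative TEID values of 0x400 (store) and 0x800 (fetch); the constants are local to the example, not taken from kernel headers.

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: model how the store_indication mask classifies a TEID. */
static int teid_is_store(uint64_t teid, uint64_t store_indication)
{
	/* With facility 75, the 0xc00 bits of the TEID encode the access
	 * type; the value 0x400 is taken here to mean a store access. */
	return (teid & store_indication) == 0x400;
}

int main(void)
{
	uint64_t store_indication = 0xc00;	/* as set by fault_init() */

	printf("teid 0x400 -> store? %d\n", teid_is_store(0x400, store_indication));
	printf("teid 0x800 -> store? %d\n", teid_is_store(0x800, store_indication));
	printf("mask unset -> store? %d\n", teid_is_store(0x400, 0));
	return 0;
}
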
  59
  60static inline int notify_page_fault(struct pt_regs *regs)
  61{
  62        int ret = 0;
  63
  64        /* kprobe_running() needs smp_processor_id() */
  65        if (kprobes_built_in() && !user_mode(regs)) {
  66                preempt_disable();
  67                if (kprobe_running() && kprobe_fault_handler(regs, 14))
  68                        ret = 1;
  69                preempt_enable();
  70        }
  71        return ret;
  72}
  73
  74
  75/*
  76 * Unlock any spinlocks which will prevent us from getting the
  77 * message out.
  78 */
  79void bust_spinlocks(int yes)
  80{
  81        if (yes) {
  82                oops_in_progress = 1;
  83        } else {
  84                int loglevel_save = console_loglevel;
  85                console_unblank();
  86                oops_in_progress = 0;
  87                /*
  88                 * OK, the message is on the console.  Now we call printk()
  89                 * without oops_in_progress set so that printk will give klogd
  90                 * a poke.  Hold onto your hats...
  91                 */
  92                console_loglevel = 15;
  93                printk(" ");
  94                console_loglevel = loglevel_save;
  95        }
  96}
  97
  98/*
  99 * Returns the address space associated with the fault.
 100 * Returns 0 for kernel space and 1 for user space.
 101 */
 102static inline int user_space_fault(struct pt_regs *regs)
 103{
 104        unsigned long trans_exc_code;
 105
 106        /*
 107         * The lowest two bits of the translation exception
 108         * identification indicate which paging table was used.
 109         */
 110        trans_exc_code = regs->int_parm_long & 3;
 111        if (trans_exc_code == 3) /* home space -> kernel */
 112                return 0;
 113        if (user_mode(regs))
 114                return 1;
 115        if (trans_exc_code == 2) /* secondary space -> set_fs */
 116                return current->thread.mm_segment.ar4;
 117        if (current->flags & PF_VCPU)
 118                return 1;
 119        return 0;
 120}
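
The checks above key off the low two bits of the translation-exception identification: 0 primary, 1 access-register mode, 2 secondary, 3 home space. A self-contained decoder of just those two bits follows (a sketch; the set_fs()/ar4 dependence of the secondary-space case is not modelled).

#include <stdio.h>

/* Decode the address-space bits (TEID & 3) the same way dump_fault_info()
 * prints them.  Purely illustrative. */
static const char *teid_space(unsigned long teid)
{
	switch (teid & 3) {
	case 0: return "primary space";
	case 1: return "access register mode";
	case 2: return "secondary space";
	default: return "home space";
	}
}

int main(void)
{
	for (unsigned long teid = 0; teid < 4; teid++)
		printf("TEID low bits %lu -> %s\n", teid, teid_space(teid));
	return 0;
}
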
 121
 122static int bad_address(void *p)
 123{
 124        unsigned long dummy;
 125
 126        return probe_kernel_address((unsigned long *)p, dummy);
 127}
 128
 129static void dump_pagetable(unsigned long asce, unsigned long address)
 130{
 131        unsigned long *table = __va(asce & PAGE_MASK);
 132
 133        pr_alert("AS:%016lx ", asce);
 134        switch (asce & _ASCE_TYPE_MASK) {
 135        case _ASCE_TYPE_REGION1:
 136                table = table + ((address >> 53) & 0x7ff);
 137                if (bad_address(table))
 138                        goto bad;
 139                pr_cont("R1:%016lx ", *table);
 140                if (*table & _REGION_ENTRY_INVALID)
 141                        goto out;
 142                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 143                /* fallthrough */
 144        case _ASCE_TYPE_REGION2:
 145                table = table + ((address >> 42) & 0x7ff);
 146                if (bad_address(table))
 147                        goto bad;
 148                pr_cont("R2:%016lx ", *table);
 149                if (*table & _REGION_ENTRY_INVALID)
 150                        goto out;
 151                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 152                /* fallthrough */
 153        case _ASCE_TYPE_REGION3:
 154                table = table + ((address >> 31) & 0x7ff);
 155                if (bad_address(table))
 156                        goto bad;
 157                pr_cont("R3:%016lx ", *table);
 158                if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
 159                        goto out;
 160                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 161                /* fallthrough */
 162        case _ASCE_TYPE_SEGMENT:
 163                table = table + ((address >> 20) & 0x7ff);
 164                if (bad_address(table))
 165                        goto bad;
 166                pr_cont("S:%016lx ", *table);
 167                if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
 168                        goto out;
 169                table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
 170        }
 171        table = table + ((address >> 12) & 0xff);
 172        if (bad_address(table))
 173                goto bad;
 174        pr_cont("P:%016lx ", *table);
 175out:
 176        pr_cont("\n");
 177        return;
 178bad:
 179        pr_cont("BAD\n");
 180}
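
dump_pagetable() indexes up to five table levels with fixed shifts: 53, 42, 31 and 20 with an 11-bit index for the region and segment tables, and 12 with an 8-bit index for the page table. The userspace sketch below only computes those indices for a given address; it does not walk real tables.

#include <stdint.h>
#include <stdio.h>

/* Split a virtual address into the per-level table indices used by
 * dump_pagetable().  Index widths: 11 bits per region/segment level,
 * 8 bits for the page table. */
static void split_address(uint64_t addr)
{
	printf("addr 0x%016llx: R1=%llu R2=%llu R3=%llu SX=%llu PX=%llu\n",
	       (unsigned long long)addr,
	       (unsigned long long)((addr >> 53) & 0x7ff),
	       (unsigned long long)((addr >> 42) & 0x7ff),
	       (unsigned long long)((addr >> 31) & 0x7ff),
	       (unsigned long long)((addr >> 20) & 0x7ff),
	       (unsigned long long)((addr >> 12) & 0xff));
}

int main(void)
{
	split_address(0x0000000000001000ULL);
	split_address(0x000003ffdeadb000ULL);
	return 0;
}
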
 181
 182static void dump_fault_info(struct pt_regs *regs)
 183{
 184        unsigned long asce;
 185
 186        pr_alert("Fault in ");
 187        switch (regs->int_parm_long & 3) {
 188        case 3:
 189                pr_cont("home space ");
 190                break;
 191        case 2:
 192                pr_cont("secondary space ");
 193                break;
 194        case 1:
 195                pr_cont("access register ");
 196                break;
 197        case 0:
 198                pr_cont("primary space ");
 199                break;
 200        }
 201        pr_cont("mode while using ");
 202        if (!user_space_fault(regs)) {
 203                asce = S390_lowcore.kernel_asce;
 204                pr_cont("kernel ");
 205        }
 206#ifdef CONFIG_PGSTE
 207        else if ((current->flags & PF_VCPU) && S390_lowcore.gmap) {
 208                struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
 209                asce = gmap->asce;
 210                pr_cont("gmap ");
 211        }
 212#endif
 213        else {
 214                asce = S390_lowcore.user_asce;
 215                pr_cont("user ");
 216        }
 217        pr_cont("ASCE.\n");
 218        dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK);
 219}
 220
 221static inline void report_user_fault(struct pt_regs *regs, long signr)
 222{
 223        if ((task_pid_nr(current) > 1) && !show_unhandled_signals)
 224                return;
 225        if (!unhandled_signal(current, signr))
 226                return;
 227        if (!printk_ratelimit())
 228                return;
 229        printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ",
 230               regs->int_code & 0xffff, regs->int_code >> 17);
 231        print_vma_addr(KERN_CONT "in ", regs->psw.addr);
 232        printk(KERN_CONT "\n");
 233        printk(KERN_ALERT "failing address: %016lx TEID: %016lx\n",
 234               regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long);
 235        dump_fault_info(regs);
 236        show_regs(regs);
 237}
 238
 239/*
 240 * Send SIGSEGV to task.  This is an external routine
 241 * to keep the stack usage of do_page_fault small.
 242 */
 243static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
 244{
 245        struct siginfo si;
 246
 247        report_user_fault(regs, SIGSEGV);
 248        si.si_signo = SIGSEGV;
 249        si.si_code = si_code;
 250        si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);
 251        force_sig_info(SIGSEGV, &si, current);
 252}
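
The si_addr filled in here is what a user-space SA_SIGINFO handler receives. The small stand-alone program below installs such a handler and provokes a NULL write, using only standard POSIX calls (printf in the handler is kept purely for the demo).

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static void segv_handler(int sig, siginfo_t *info, void *ctx)
{
	(void)sig;
	(void)ctx;
	/* si_addr is the failing address the kernel filled in,
	 * e.g. in do_sigsegv() above. */
	printf("SIGSEGV at %p\n", info->si_addr);
	_exit(0);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = segv_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGSEGV, &sa, NULL);

	*(volatile int *)0 = 1;		/* deliberate NULL write */
	return 1;
}
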
 253
 254static noinline void do_no_context(struct pt_regs *regs)
 255{
 256        const struct exception_table_entry *fixup;
 257
 258        /* Are we prepared to handle this kernel fault?  */
 259        fixup = search_exception_tables(regs->psw.addr);
 260        if (fixup) {
 261                regs->psw.addr = extable_fixup(fixup);
 262                return;
 263        }
 264
 265        /*
 266         * Oops. The kernel tried to access some bad page. We'll have to
 267         * terminate things with extreme prejudice.
 268         */
 269        if (!user_space_fault(regs))
 270                printk(KERN_ALERT "Unable to handle kernel pointer dereference"
 271                       " in virtual kernel address space\n");
 272        else
 273                printk(KERN_ALERT "Unable to handle kernel paging request"
 274                       " in virtual user address space\n");
 275        printk(KERN_ALERT "failing address: %016lx TEID: %016lx\n",
 276               regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long);
 277        dump_fault_info(regs);
 278        die(regs, "Oops");
 279        do_exit(SIGKILL);
 280}
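
A kernel-mode fault only survives this path if the faulting instruction has an exception-table entry pairing it with a fixup address, which search_exception_tables()/extable_fixup() then install into the PSW. Below is a simplified userspace model of that lookup, using a hand-built table and a linear search; the real kernel table is sorted and binary-searched.

#include <stdio.h>

/* Toy model of an exception table: map a faulting "instruction address"
 * to a fixup address.  Illustrative only. */
struct extable_entry {
	unsigned long insn;	/* address that may fault */
	unsigned long fixup;	/* where to resume if it does */
};

static const struct extable_entry table[] = {
	{ 0x1000, 0x2000 },
	{ 0x1008, 0x2010 },
};

static unsigned long find_fixup(unsigned long fault_addr)
{
	for (unsigned i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if (table[i].insn == fault_addr)
			return table[i].fixup;
	return 0;	/* no fixup: the kernel would oops, as above */
}

int main(void)
{
	printf("fixup for 0x1008: 0x%lx\n", find_fixup(0x1008));
	printf("fixup for 0x3000: 0x%lx\n", find_fixup(0x3000));
	return 0;
}
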
 281
 282static noinline void do_low_address(struct pt_regs *regs)
 283{
 284        /* Low-address protection hit in kernel mode means
 285           NULL pointer write access in kernel mode.  */
 286        if (regs->psw.mask & PSW_MASK_PSTATE) {
 287                /* Low-address protection hit in user mode 'cannot happen'. */
 288                die (regs, "Low-address protection");
 289                do_exit(SIGKILL);
 290        }
 291
 292        do_no_context(regs);
 293}
 294
 295static noinline void do_sigbus(struct pt_regs *regs)
 296{
 297        struct task_struct *tsk = current;
 298        struct siginfo si;
 299
 300        /*
 301         * Send a sigbus, regardless of whether we were in kernel
 302         * or user mode.
 303         */
 304        si.si_signo = SIGBUS;
 305        si.si_errno = 0;
 306        si.si_code = BUS_ADRERR;
 307        si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);
 308        force_sig_info(SIGBUS, &si, tsk);
 309}
 310
 311static noinline void do_fault_error(struct pt_regs *regs, int fault)
 312{
 313        int si_code;
 314
 315        switch (fault) {
 316        case VM_FAULT_BADACCESS:
 317        case VM_FAULT_BADMAP:
  318                /* Bad memory access: user mode gets a SIGSEGV, kernel mode falls through below. */
 319                if (user_mode(regs)) {
 320                        /* User mode accesses just cause a SIGSEGV */
 321                        si_code = (fault == VM_FAULT_BADMAP) ?
 322                                SEGV_MAPERR : SEGV_ACCERR;
 323                        do_sigsegv(regs, si_code);
 324                        return;
 325                }
 326        case VM_FAULT_BADCONTEXT:
 327        case VM_FAULT_PFAULT:
 328                do_no_context(regs);
 329                break;
 330        case VM_FAULT_SIGNAL:
 331                if (!user_mode(regs))
 332                        do_no_context(regs);
 333                break;
 334        default: /* fault & VM_FAULT_ERROR */
 335                if (fault & VM_FAULT_OOM) {
 336                        if (!user_mode(regs))
 337                                do_no_context(regs);
 338                        else
 339                                pagefault_out_of_memory();
 340                } else if (fault & VM_FAULT_SIGSEGV) {
 341                        /* Kernel mode? Handle exceptions or die */
 342                        if (!user_mode(regs))
 343                                do_no_context(regs);
 344                        else
 345                                do_sigsegv(regs, SEGV_MAPERR);
 346                } else if (fault & VM_FAULT_SIGBUS) {
 347                        /* Kernel mode? Handle exceptions or die */
 348                        if (!user_mode(regs))
 349                                do_no_context(regs);
 350                        else
 351                                do_sigbus(regs);
 352                } else
 353                        BUG();
 354                break;
 355        }
 356}
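
The switch above amounts to a small mapping from internal fault code and privilege level to an action. A compact sketch of that mapping follows, reusing the #define values from the top of this file; the helper name and strings are invented for illustration.

#include <stdio.h>

#define VM_FAULT_BADCONTEXT	0x010000
#define VM_FAULT_BADMAP		0x020000
#define VM_FAULT_BADACCESS	0x040000
#define VM_FAULT_SIGNAL		0x080000
#define VM_FAULT_PFAULT		0x100000

/* Rough summary of do_fault_error(): what each code leads to. */
static const char *fault_action(int fault, int user_mode)
{
	switch (fault) {
	case VM_FAULT_BADMAP:
		return user_mode ? "SIGSEGV (SEGV_MAPERR)" : "no-context/oops";
	case VM_FAULT_BADACCESS:
		return user_mode ? "SIGSEGV (SEGV_ACCERR)" : "no-context/oops";
	case VM_FAULT_BADCONTEXT:
	case VM_FAULT_PFAULT:
		return "no-context/oops";
	case VM_FAULT_SIGNAL:
		return user_mode ? "return with signal pending" : "no-context/oops";
	default:
		return "OOM/SIGSEGV/SIGBUS handling";
	}
}

int main(void)
{
	printf("BADMAP, user:   %s\n", fault_action(VM_FAULT_BADMAP, 1));
	printf("BADMAP, kernel: %s\n", fault_action(VM_FAULT_BADMAP, 0));
	printf("PFAULT:         %s\n", fault_action(VM_FAULT_PFAULT, 0));
	return 0;
}
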
 357
 358/*
 359 * This routine handles page faults.  It determines the address,
 360 * and the problem, and then passes it off to one of the appropriate
 361 * routines.
 362 *
 363 * interruption code (int_code):
  364 *   04       Protection           ->  Write-Protection  (suppression)
 365 *   10       Segment translation  ->  Not present       (nullification)
 366 *   11       Page translation     ->  Not present       (nullification)
 367 *   3b       Region third trans.  ->  Not present       (nullification)
 368 */
 369static inline int do_exception(struct pt_regs *regs, int access)
 370{
 371#ifdef CONFIG_PGSTE
 372        struct gmap *gmap;
 373#endif
 374        struct task_struct *tsk;
 375        struct mm_struct *mm;
 376        struct vm_area_struct *vma;
 377        unsigned long trans_exc_code;
 378        unsigned long address;
 379        unsigned int flags;
 380        int fault;
 381
 382        tsk = current;
 383        /*
 384         * The instruction that caused the program check has
 385         * been nullified. Don't signal single step via SIGTRAP.
 386         */
 387        clear_pt_regs_flag(regs, PIF_PER_TRAP);
 388
 389        if (notify_page_fault(regs))
 390                return 0;
 391
 392        mm = tsk->mm;
 393        trans_exc_code = regs->int_parm_long;
 394
 395        /*
 396         * Verify that the fault happened in user space, that
 397         * we are not in an interrupt and that there is a 
 398         * user context.
 399         */
 400        fault = VM_FAULT_BADCONTEXT;
 401        if (unlikely(!user_space_fault(regs) || faulthandler_disabled() || !mm))
 402                goto out;
 403
 404        address = trans_exc_code & __FAIL_ADDR_MASK;
 405        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 406        flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 407        if (user_mode(regs))
 408                flags |= FAULT_FLAG_USER;
 409        if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
 410                flags |= FAULT_FLAG_WRITE;
 411        down_read(&mm->mmap_sem);
 412
 413#ifdef CONFIG_PGSTE
 414        gmap = (current->flags & PF_VCPU) ?
 415                (struct gmap *) S390_lowcore.gmap : NULL;
 416        if (gmap) {
 417                current->thread.gmap_addr = address;
 418                address = __gmap_translate(gmap, address);
 419                if (address == -EFAULT) {
 420                        fault = VM_FAULT_BADMAP;
 421                        goto out_up;
 422                }
 423                if (gmap->pfault_enabled)
 424                        flags |= FAULT_FLAG_RETRY_NOWAIT;
 425        }
 426#endif
 427
 428retry:
 429        fault = VM_FAULT_BADMAP;
 430        vma = find_vma(mm, address);
 431        if (!vma)
 432                goto out_up;
 433
 434        if (unlikely(vma->vm_start > address)) {
 435                if (!(vma->vm_flags & VM_GROWSDOWN))
 436                        goto out_up;
 437                if (expand_stack(vma, address))
 438                        goto out_up;
 439        }
 440
 441        /*
 442         * Ok, we have a good vm_area for this memory access, so
 443         * we can handle it..
 444         */
 445        fault = VM_FAULT_BADACCESS;
 446        if (unlikely(!(vma->vm_flags & access)))
 447                goto out_up;
 448
 449        if (is_vm_hugetlb_page(vma))
 450                address &= HPAGE_MASK;
 451        /*
 452         * If for any reason at all we couldn't handle the fault,
 453         * make sure we exit gracefully rather than endlessly redo
 454         * the fault.
 455         */
 456        fault = handle_mm_fault(mm, vma, address, flags);
 457        /* No reason to continue if interrupted by SIGKILL. */
 458        if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
 459                fault = VM_FAULT_SIGNAL;
 460                goto out;
 461        }
 462        if (unlikely(fault & VM_FAULT_ERROR))
 463                goto out_up;
 464
 465        /*
 466         * Major/minor page fault accounting is only done on the
 467         * initial attempt. If we go through a retry, it is extremely
 468         * likely that the page will be found in page cache at that point.
 469         */
 470        if (flags & FAULT_FLAG_ALLOW_RETRY) {
 471                if (fault & VM_FAULT_MAJOR) {
 472                        tsk->maj_flt++;
 473                        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
 474                                      regs, address);
 475                } else {
 476                        tsk->min_flt++;
 477                        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
 478                                      regs, address);
 479                }
 480                if (fault & VM_FAULT_RETRY) {
 481#ifdef CONFIG_PGSTE
 482                        if (gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) {
 483                                /* FAULT_FLAG_RETRY_NOWAIT has been set,
 484                                 * mmap_sem has not been released */
 485                                current->thread.gmap_pfault = 1;
 486                                fault = VM_FAULT_PFAULT;
 487                                goto out_up;
 488                        }
 489#endif
 490                        /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
 491                         * of starvation. */
 492                        flags &= ~(FAULT_FLAG_ALLOW_RETRY |
 493                                   FAULT_FLAG_RETRY_NOWAIT);
 494                        flags |= FAULT_FLAG_TRIED;
 495                        down_read(&mm->mmap_sem);
 496                        goto retry;
 497                }
 498        }
 499#ifdef CONFIG_PGSTE
 500        if (gmap) {
  501                address = __gmap_link(gmap, current->thread.gmap_addr,
 502                                       address);
 503                if (address == -EFAULT) {
 504                        fault = VM_FAULT_BADMAP;
 505                        goto out_up;
 506                }
 507                if (address == -ENOMEM) {
 508                        fault = VM_FAULT_OOM;
 509                        goto out_up;
 510                }
 511        }
 512#endif
 513        fault = 0;
 514out_up:
 515        up_read(&mm->mmap_sem);
 516out:
 517        return fault;
 518}
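
One detail worth noting in do_exception(): when handle_mm_fault() returns VM_FAULT_RETRY it has already dropped mmap_sem, so the code clears FAULT_FLAG_ALLOW_RETRY (and RETRY_NOWAIT), sets FAULT_FLAG_TRIED, re-takes the semaphore and jumps back, which bounds the loop to a single retry. Below is a stripped-down sketch of that control flow with the fault handler stubbed out; the flag names mirror the kernel's but the values are arbitrary and local to the example.

#include <stdio.h>

#define FAULT_FLAG_ALLOW_RETRY	0x01
#define FAULT_FLAG_TRIED	0x02
#define FAULT_FLAG_RETRY_NOWAIT	0x04
#define VM_FAULT_RETRY		0x10

/* Stub: pretend the first attempt has to drop the lock and retry. */
static int fake_handle_mm_fault(unsigned flags)
{
	return (flags & FAULT_FLAG_ALLOW_RETRY) ? VM_FAULT_RETRY : 0;
}

int main(void)
{
	unsigned flags = FAULT_FLAG_ALLOW_RETRY;
	int attempts = 0, fault;

retry:
	attempts++;
	fault = fake_handle_mm_fault(flags);
	if (fault & VM_FAULT_RETRY) {
		/* mmap_sem was dropped by the fault handler; forbid further
		 * retries before taking it again and retrying once. */
		flags &= ~(FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
		flags |= FAULT_FLAG_TRIED;
		goto retry;
	}
	printf("completed after %d attempt(s)\n", attempts);
	return 0;
}
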
 519
 520void do_protection_exception(struct pt_regs *regs)
 521{
 522        unsigned long trans_exc_code;
 523        int fault;
 524
 525        trans_exc_code = regs->int_parm_long;
 526        /*
 527         * Protection exceptions are suppressing, decrement psw address.
 528         * The exception to this rule are aborted transactions, for these
 529         * the PSW already points to the correct location.
 530         */
 531        if (!(regs->int_code & 0x200))
 532                regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
 533        /*
 534         * Check for low-address protection.  This needs to be treated
 535         * as a special case because the translation exception code
 536         * field is not guaranteed to contain valid data in this case.
 537         */
 538        if (unlikely(!(trans_exc_code & 4))) {
 539                do_low_address(regs);
 540                return;
 541        }
 542        fault = do_exception(regs, VM_WRITE);
 543        if (unlikely(fault))
 544                do_fault_error(regs, fault);
 545}
 546NOKPROBE_SYMBOL(do_protection_exception);
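
Protection exceptions are suppressing, so the PSW already points past the failed instruction and must be rewound by its length; the upper half of int_code carries that length (report_user_fault() above prints int_code >> 17 as the classic 1..3 ILC). Below is a minimal sketch of the rewind arithmetic, assuming the upper halfword is the length in bytes and omitting the 24/31-bit address masking that __rewind_psw() also performs.

#include <stdint.h>
#include <stdio.h>

/* Illustrative rewind: subtract the instruction length from the PSW
 * address.  The real __rewind_psw() also masks the result according to
 * the addressing mode; that part is left out here. */
static uint64_t rewind_addr(uint64_t psw_addr, uint32_t int_code)
{
	uint32_t ilen = int_code >> 16;		/* instruction length, bytes */

	return psw_addr - ilen;
}

int main(void)
{
	/* Example: a 4-byte instruction at 0x1000 took a protection fault. */
	printf("resume at 0x%llx\n",
	       (unsigned long long)rewind_addr(0x1004, 0x00040004));
	return 0;
}
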
 547
 548void do_dat_exception(struct pt_regs *regs)
 549{
 550        int access, fault;
 551
 552        access = VM_READ | VM_EXEC | VM_WRITE;
 553        fault = do_exception(regs, access);
 554        if (unlikely(fault))
 555                do_fault_error(regs, fault);
 556}
 557NOKPROBE_SYMBOL(do_dat_exception);
 558
 559#ifdef CONFIG_PFAULT 
 560/*
 561 * 'pfault' pseudo page faults routines.
 562 */
 563static int pfault_disable;
 564
 565static int __init nopfault(char *str)
 566{
 567        pfault_disable = 1;
 568        return 1;
 569}
 570
 571__setup("nopfault", nopfault);
 572
 573struct pfault_refbk {
 574        u16 refdiagc;
 575        u16 reffcode;
 576        u16 refdwlen;
 577        u16 refversn;
 578        u64 refgaddr;
 579        u64 refselmk;
 580        u64 refcmpmk;
 581        u64 reserved;
 582} __attribute__ ((packed, aligned(8)));
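
refdwlen = 5 in the requests below advertises a parameter block of five doublewords, which matches this layout exactly (four u16 plus four u64 = 40 bytes). A tiny userspace replica that checks the size with a C11 static assertion follows; the type name is chosen for the example.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace replica of struct pfault_refbk, just to check the size. */
struct pfault_refbk_example {
	uint16_t refdiagc;
	uint16_t reffcode;
	uint16_t refdwlen;
	uint16_t refversn;
	uint64_t refgaddr;
	uint64_t refselmk;
	uint64_t refcmpmk;
	uint64_t reserved;
} __attribute__((packed, aligned(8)));

/* 5 doublewords = 40 bytes, as advertised by refdwlen. */
static_assert(sizeof(struct pfault_refbk_example) == 5 * 8,
	      "refbk must be five doublewords");

int main(void)
{
	printf("sizeof(refbk) = %zu bytes\n",
	       sizeof(struct pfault_refbk_example));
	return 0;
}
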
 583
 584int pfault_init(void)
 585{
 586        struct pfault_refbk refbk = {
 587                .refdiagc = 0x258,
 588                .reffcode = 0,
 589                .refdwlen = 5,
 590                .refversn = 2,
 591                .refgaddr = __LC_LPP,
 592                .refselmk = 1ULL << 48,
 593                .refcmpmk = 1ULL << 48,
 594                .reserved = __PF_RES_FIELD };
 595        int rc;
 596
 597        if (pfault_disable)
 598                return -1;
 599        diag_stat_inc(DIAG_STAT_X258);
 600        asm volatile(
 601                "       diag    %1,%0,0x258\n"
 602                "0:     j       2f\n"
 603                "1:     la      %0,8\n"
 604                "2:\n"
 605                EX_TABLE(0b,1b)
 606                : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc");
 607        return rc;
 608}
 609
 610void pfault_fini(void)
 611{
 612        struct pfault_refbk refbk = {
 613                .refdiagc = 0x258,
 614                .reffcode = 1,
 615                .refdwlen = 5,
 616                .refversn = 2,
 617        };
 618
 619        if (pfault_disable)
 620                return;
 621        diag_stat_inc(DIAG_STAT_X258);
 622        asm volatile(
 623                "       diag    %0,0,0x258\n"
 624                "0:\n"
 625                EX_TABLE(0b,0b)
 626                : : "a" (&refbk), "m" (refbk) : "cc");
 627}
 628
 629static DEFINE_SPINLOCK(pfault_lock);
 630static LIST_HEAD(pfault_list);
 631
 632static void pfault_interrupt(struct ext_code ext_code,
 633                             unsigned int param32, unsigned long param64)
 634{
 635        struct task_struct *tsk;
 636        __u16 subcode;
 637        pid_t pid;
 638
 639        /*
 640         * Get the external interruption subcode & pfault
 641         * initial/completion signal bit. VM stores this 
 642         * in the 'cpu address' field associated with the
 643         * external interrupt. 
 644         */
 645        subcode = ext_code.subcode;
 646        if ((subcode & 0xff00) != __SUBCODE_MASK)
 647                return;
 648        inc_irq_stat(IRQEXT_PFL);
 649        /* Get the token (= pid of the affected task). */
 650        pid = param64 & LPP_PFAULT_PID_MASK;
 651        rcu_read_lock();
 652        tsk = find_task_by_pid_ns(pid, &init_pid_ns);
 653        if (tsk)
 654                get_task_struct(tsk);
 655        rcu_read_unlock();
 656        if (!tsk)
 657                return;
 658        spin_lock(&pfault_lock);
 659        if (subcode & 0x0080) {
 660                /* signal bit is set -> a page has been swapped in by VM */
 661                if (tsk->thread.pfault_wait == 1) {
 662                        /* Initial interrupt was faster than the completion
 663                         * interrupt. pfault_wait is valid. Set pfault_wait
 664                         * back to zero and wake up the process. This can
 665                         * safely be done because the task is still sleeping
 666                         * and can't produce new pfaults. */
 667                        tsk->thread.pfault_wait = 0;
 668                        list_del(&tsk->thread.list);
 669                        wake_up_process(tsk);
 670                        put_task_struct(tsk);
 671                } else {
 672                        /* Completion interrupt was faster than initial
 673                         * interrupt. Set pfault_wait to -1 so the initial
 674                         * interrupt doesn't put the task to sleep.
 675                         * If the task is not running, ignore the completion
 676                         * interrupt since it must be a leftover of a PFAULT
 677                         * CANCEL operation which didn't remove all pending
 678                         * completion interrupts. */
 679                        if (tsk->state == TASK_RUNNING)
 680                                tsk->thread.pfault_wait = -1;
 681                }
 682        } else {
 683                /* signal bit not set -> a real page is missing. */
 684                if (WARN_ON_ONCE(tsk != current))
 685                        goto out;
 686                if (tsk->thread.pfault_wait == 1) {
 687                        /* Already on the list with a reference: put to sleep */
 688                        __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 689                        set_tsk_need_resched(tsk);
 690                } else if (tsk->thread.pfault_wait == -1) {
 691                        /* Completion interrupt was faster than the initial
 692                         * interrupt (pfault_wait == -1). Set pfault_wait
 693                         * back to zero and exit. */
 694                        tsk->thread.pfault_wait = 0;
 695                } else {
 696                        /* Initial interrupt arrived before completion
 697                         * interrupt. Let the task sleep.
 698                         * An extra task reference is needed since a different
 699                         * cpu may set the task state to TASK_RUNNING again
 700                         * before the scheduler is reached. */
 701                        get_task_struct(tsk);
 702                        tsk->thread.pfault_wait = 1;
 703                        list_add(&tsk->thread.list, &pfault_list);
 704                        __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 705                        set_tsk_need_resched(tsk);
 706                }
 707        }
 708out:
 709        spin_unlock(&pfault_lock);
 710        put_task_struct(tsk);
 711}
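
pfault_wait acts as a three-state flag per task: 0 (idle), 1 (initial interrupt seen, task parked until completion) and -1 (completion overtook the initial interrupt). The userspace sketch below models those transitions with the sleeping and waking reduced to printouts; all names are local to the example.

#include <stdio.h>

/* Model of the pfault_wait handshake in pfault_interrupt():
 *   0  -> idle
 *   1  -> initial interrupt seen, task is (or will be) sleeping
 *  -1  -> completion interrupt arrived before the initial one
 */
static int pfault_wait;

static void initial_interrupt(void)
{
	if (pfault_wait == -1) {
		pfault_wait = 0;	/* completion came first: keep running */
		printf("initial: completion already seen, keep running\n");
	} else {
		pfault_wait = 1;
		printf("initial: park the task\n");
	}
}

static void completion_interrupt(void)
{
	if (pfault_wait == 1) {
		pfault_wait = 0;
		printf("completion: wake the task\n");
	} else {
		pfault_wait = -1;	/* remember the early completion */
		printf("completion: arrived early\n");
	}
}

int main(void)
{
	/* Normal order. */
	initial_interrupt();
	completion_interrupt();
	/* Reversed order. */
	completion_interrupt();
	initial_interrupt();
	return 0;
}
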
 712
 713static int pfault_cpu_notify(struct notifier_block *self, unsigned long action,
 714                             void *hcpu)
 715{
 716        struct thread_struct *thread, *next;
 717        struct task_struct *tsk;
 718
 719        switch (action & ~CPU_TASKS_FROZEN) {
 720        case CPU_DEAD:
 721                spin_lock_irq(&pfault_lock);
 722                list_for_each_entry_safe(thread, next, &pfault_list, list) {
 723                        thread->pfault_wait = 0;
 724                        list_del(&thread->list);
 725                        tsk = container_of(thread, struct task_struct, thread);
 726                        wake_up_process(tsk);
 727                        put_task_struct(tsk);
 728                }
 729                spin_unlock_irq(&pfault_lock);
 730                break;
 731        default:
 732                break;
 733        }
 734        return NOTIFY_OK;
 735}
 736
 737static int __init pfault_irq_init(void)
 738{
 739        int rc;
 740
 741        rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
 742        if (rc)
 743                goto out_extint;
 744        rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
 745        if (rc)
 746                goto out_pfault;
 747        irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
 748        hotcpu_notifier(pfault_cpu_notify, 0);
 749        return 0;
 750
 751out_pfault:
 752        unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
 753out_extint:
 754        pfault_disable = 1;
 755        return rc;
 756}
 757early_initcall(pfault_irq_init);
 758
 759#endif /* CONFIG_PFAULT */
 760