linux/arch/s390/mm/fault.c
/*
 *  S390 version
 *    Copyright IBM Corp. 1999
 *    Author(s): Hartmut Penner (hp@de.ibm.com)
 *               Ulrich Weigand (uweigand@de.ibm.com)
 *
 *  Derived from "arch/i386/mm/fault.c"
 *    Copyright (C) 1995  Linus Torvalds
 */

#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/compat.h>
#include <linux/smp.h>
#include <linux/kdebug.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/module.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <asm/asm-offsets.h>
#include <asm/pgtable.h>
#include <asm/irq.h>
#include <asm/mmu_context.h>
#include <asm/facility.h>
#include "../kernel/entry.h"

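/*
 * Per-addressing-mode constants: __FAIL_ADDR_MASK extracts the failing
 * address from the translation-exception identification, __SUBCODE_MASK
 * is matched against the subcode of the pfault external interrupt, and
 * __PF_RES_FIELD goes into the reserved field of the pfault refbk below.
 */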
#ifndef CONFIG_64BIT
#define __FAIL_ADDR_MASK 0x7ffff000
#define __SUBCODE_MASK 0x0200
#define __PF_RES_FIELD 0ULL
#else /* CONFIG_64BIT */
#define __FAIL_ADDR_MASK -4096L
#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000ULL
#endif /* CONFIG_64BIT */

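/*
 * Architecture-private fault codes, returned by do_exception() in addition
 * to the generic VM_FAULT_* bits from <linux/mm.h>; do_fault_error() maps
 * them to a signal or a kernel oops.
 */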
#define VM_FAULT_BADCONTEXT     0x010000
#define VM_FAULT_BADMAP         0x020000
#define VM_FAULT_BADACCESS      0x040000
#define VM_FAULT_SIGNAL         0x080000

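/*
 * If the fetch/store-indication facility (facility bit 75) is installed,
 * the translation-exception identification tells whether the faulting
 * access was a fetch or a store. store_indication is the mask checked in
 * do_exception() to decide whether FAULT_FLAG_WRITE has to be set.
 */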
static unsigned long store_indication __read_mostly;

#ifdef CONFIG_64BIT
static int __init fault_init(void)
{
        if (test_facility(75))
                store_indication = 0xc00;
        return 0;
}
early_initcall(fault_init);
#endif

static inline int notify_page_fault(struct pt_regs *regs)
{
        int ret = 0;

        /* kprobe_running() needs smp_processor_id() */
        if (kprobes_built_in() && !user_mode(regs)) {
                preempt_disable();
                if (kprobe_running() && kprobe_fault_handler(regs, 14))
                        ret = 1;
                preempt_enable();
        }
        return ret;
}

/*
 * Unlock any spinlocks which will prevent us from getting the
 * message out.
 */
void bust_spinlocks(int yes)
{
        if (yes) {
                oops_in_progress = 1;
        } else {
                int loglevel_save = console_loglevel;
                console_unblank();
                oops_in_progress = 0;
                /*
                 * OK, the message is on the console.  Now we call printk()
                 * without oops_in_progress set so that printk will give klogd
                 * a poke.  Hold onto your hats...
                 */
                console_loglevel = 15;
                printk(" ");
                console_loglevel = loglevel_save;
        }
}

/*
 * Returns the address space associated with the fault.
 * Returns 0 for kernel space and 1 for user space.
 */
static inline int user_space_fault(unsigned long trans_exc_code)
{
        /*
         * The lowest two bits of the translation exception
         * identification indicate which paging table was used.
         */
        trans_exc_code &= 3;
        if (trans_exc_code == 2)
                /* Access via secondary space, set_fs setting decides */
                return current->thread.mm_segment.ar4;
        /*
         * Access via primary space or access register is from user space
         * and access via home space is from the kernel.
         */
        return trans_exc_code != 3;
}

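/*
 * Emit a rate-limited diagnostic for an unresolved fault in a user
 * process, if show_unhandled_signals asks for it.
 */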
static inline void report_user_fault(struct pt_regs *regs, long signr)
{
        if ((task_pid_nr(current) > 1) && !show_unhandled_signals)
                return;
        if (!unhandled_signal(current, signr))
                return;
        if (!printk_ratelimit())
                return;
        printk(KERN_ALERT "User process fault: interruption code 0x%X ",
               regs->int_code);
        print_vma_addr(KERN_CONT "in ", regs->psw.addr & PSW_ADDR_INSN);
        printk(KERN_CONT "\n");
        printk(KERN_ALERT "failing address: %lX\n",
               regs->int_parm_long & __FAIL_ADDR_MASK);
        show_regs(regs);
}

/*
 * Send SIGSEGV to task.  This is an external routine
 * to keep the stack usage of do_page_fault small.
 */
static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
{
        struct siginfo si;

        report_user_fault(regs, SIGSEGV);
        si.si_signo = SIGSEGV;
        si.si_code = si_code;
        si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);
        force_sig_info(SIGSEGV, &si, current);
}

static noinline void do_no_context(struct pt_regs *regs)
{
        const struct exception_table_entry *fixup;
        unsigned long address;

        /* Are we prepared to handle this kernel fault?  */
        fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN);
        if (fixup) {
                regs->psw.addr = extable_fixup(fixup) | PSW_ADDR_AMODE;
                return;
        }

        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice.
         */
        address = regs->int_parm_long & __FAIL_ADDR_MASK;
        if (!user_space_fault(regs->int_parm_long))
                printk(KERN_ALERT "Unable to handle kernel pointer dereference"
                       " at virtual kernel address %p\n", (void *)address);
        else
                printk(KERN_ALERT "Unable to handle kernel paging request"
                       " at virtual user address %p\n", (void *)address);

        die(regs, "Oops");
        do_exit(SIGKILL);
}

static noinline void do_low_address(struct pt_regs *regs)
{
        /* Low-address protection hit in kernel mode means
           NULL pointer write access in kernel mode.  */
        if (regs->psw.mask & PSW_MASK_PSTATE) {
                /* Low-address protection hit in user mode 'cannot happen'. */
                die (regs, "Low-address protection");
                do_exit(SIGKILL);
        }

        do_no_context(regs);
}

static noinline void do_sigbus(struct pt_regs *regs)
{
        struct task_struct *tsk = current;
        struct siginfo si;

        /*
         * Send a sigbus, regardless of whether we were in kernel
         * or user mode.
         */
        si.si_signo = SIGBUS;
        si.si_errno = 0;
        si.si_code = BUS_ADRERR;
        si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);
        force_sig_info(SIGBUS, &si, tsk);
}

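/*
 * Translate the fault code returned by do_exception() into a signal for
 * user space or, for kernel mode faults, an exception-table fixup or oops
 * via do_no_context().
 */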
static noinline void do_fault_error(struct pt_regs *regs, int fault)
{
        int si_code;

        switch (fault) {
        case VM_FAULT_BADACCESS:
        case VM_FAULT_BADMAP:
                /* Bad memory access. Check if it is kernel or user space. */
                if (user_mode(regs)) {
                        /* User mode accesses just cause a SIGSEGV */
                        si_code = (fault == VM_FAULT_BADMAP) ?
                                SEGV_MAPERR : SEGV_ACCERR;
                        do_sigsegv(regs, si_code);
                        return;
                }
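                /* Kernel mode access: fall through to do_no_context(). */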
        case VM_FAULT_BADCONTEXT:
                do_no_context(regs);
                break;
        case VM_FAULT_SIGNAL:
                if (!user_mode(regs))
                        do_no_context(regs);
                break;
        default: /* fault & VM_FAULT_ERROR */
                if (fault & VM_FAULT_OOM) {
                        if (!user_mode(regs))
                                do_no_context(regs);
                        else
                                pagefault_out_of_memory();
                } else if (fault & VM_FAULT_SIGBUS) {
                        /* Kernel mode? Handle exceptions or die */
                        if (!user_mode(regs))
                                do_no_context(regs);
                        else
                                do_sigbus(regs);
                } else
                        BUG();
                break;
        }
}

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * interruption code (int_code):
 *   04       Protection           ->  Write-Protection  (suppression)
 *   10       Segment translation  ->  Not present       (nullification)
 *   11       Page translation     ->  Not present       (nullification)
 *   3b       Region third trans.  ->  Not present       (nullification)
 */
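/*
 * Returns 0 if the fault was resolved, otherwise one of the private
 * VM_FAULT_* codes defined above or a generic VM_FAULT_* error bit for
 * do_fault_error().
 */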
static inline int do_exception(struct pt_regs *regs, int access)
{
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        unsigned long trans_exc_code;
        unsigned long address;
        unsigned int flags;
        int fault;

        tsk = current;
        /*
         * The instruction that caused the program check has
         * been nullified. Don't signal single step via SIGTRAP.
         */
        clear_tsk_thread_flag(tsk, TIF_PER_TRAP);

        if (notify_page_fault(regs))
                return 0;

        mm = tsk->mm;
        trans_exc_code = regs->int_parm_long;

        /*
         * Verify that the fault happened in user space, that
         * we are not in an interrupt and that there is a
         * user context.
         */
        fault = VM_FAULT_BADCONTEXT;
        if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm))
                goto out;

        address = trans_exc_code & __FAIL_ADDR_MASK;
        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
        flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
        if (user_mode(regs))
                flags |= FAULT_FLAG_USER;
        if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
                flags |= FAULT_FLAG_WRITE;
        down_read(&mm->mmap_sem);

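        /*
         * For faults taken while running a KVM guest (PF_VCPU), the guest
         * address is first translated through the guest mapping (gmap)
         * before the host vma is looked up.
         */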
#ifdef CONFIG_PGSTE
        if ((current->flags & PF_VCPU) && S390_lowcore.gmap) {
                address = __gmap_fault(address,
                                     (struct gmap *) S390_lowcore.gmap);
                if (address == -EFAULT) {
                        fault = VM_FAULT_BADMAP;
                        goto out_up;
                }
                if (address == -ENOMEM) {
                        fault = VM_FAULT_OOM;
                        goto out_up;
                }
        }
#endif

retry:
        fault = VM_FAULT_BADMAP;
        vma = find_vma(mm, address);
        if (!vma)
                goto out_up;

        if (unlikely(vma->vm_start > address)) {
                if (!(vma->vm_flags & VM_GROWSDOWN))
                        goto out_up;
                if (expand_stack(vma, address))
                        goto out_up;
        }

        /*
         * Ok, we have a good vm_area for this memory access, so
         * we can handle it..
         */
        fault = VM_FAULT_BADACCESS;
        if (unlikely(!(vma->vm_flags & access)))
                goto out_up;

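        /* For hugetlb mappings align the address to the huge page boundary. */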
        if (is_vm_hugetlb_page(vma))
                address &= HPAGE_MASK;
        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
        fault = handle_mm_fault(mm, vma, address, flags);
        /* No reason to continue if interrupted by SIGKILL. */
        if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
                fault = VM_FAULT_SIGNAL;
                goto out;
        }
        if (unlikely(fault & VM_FAULT_ERROR))
                goto out_up;

        /*
         * Major/minor page fault accounting is only done on the
         * initial attempt. If we go through a retry, it is extremely
         * likely that the page will be found in page cache at that point.
         */
        if (flags & FAULT_FLAG_ALLOW_RETRY) {
                if (fault & VM_FAULT_MAJOR) {
                        tsk->maj_flt++;
                        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
                                      regs, address);
                } else {
                        tsk->min_flt++;
                        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
                                      regs, address);
                }
                if (fault & VM_FAULT_RETRY) {
                        /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
                         * of starvation. */
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
                        flags |= FAULT_FLAG_TRIED;
                        down_read(&mm->mmap_sem);
                        goto retry;
                }
        }
        fault = 0;
out_up:
        up_read(&mm->mmap_sem);
out:
        return fault;
}

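/*
 * Protection exception handler (program interruption code 0x04): covers
 * low-address protection as well as write-protection faults.
 */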
void __kprobes do_protection_exception(struct pt_regs *regs)
{
        unsigned long trans_exc_code;
        int fault;

        trans_exc_code = regs->int_parm_long;
        /*
         * Protection exceptions are suppressing, decrement psw address.
         * The exceptions to this rule are aborted transactions; for these
         * the PSW already points to the correct location.
         */
        if (!(regs->int_code & 0x200))
                regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
        /*
         * Check for low-address protection.  This needs to be treated
         * as a special case because the translation exception code
         * field is not guaranteed to contain valid data in this case.
         */
        if (unlikely(!(trans_exc_code & 4))) {
                do_low_address(regs);
                return;
        }
        fault = do_exception(regs, VM_WRITE);
        if (unlikely(fault))
                do_fault_error(regs, fault);
}

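/*
 * DAT exception handler: segment, page and region translation exceptions
 * (a page or translation table entry is not present). Any of read, write
 * or execute permission on the vma satisfies the access check.
 */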
void __kprobes do_dat_exception(struct pt_regs *regs)
{
        int access, fault;

        access = VM_READ | VM_EXEC | VM_WRITE;
        fault = do_exception(regs, access);
        if (unlikely(fault))
                do_fault_error(regs, fault);
}

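/*
 * Called by the uaccess functions when a fault occurred while the kernel
 * accessed user memory. A pt_regs structure describing a fault in
 * secondary space (the "| 2" in int_parm_long) is built so that
 * do_exception() resolves the fault against the user address space.
 */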
int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write)
{
        struct pt_regs regs;
        int access, fault;

        /* Emulate a uaccess fault from kernel mode. */
        regs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | PSW_MASK_MCHECK;
        if (!irqs_disabled())
                regs.psw.mask |= PSW_MASK_IO | PSW_MASK_EXT;
        regs.psw.addr = (unsigned long) __builtin_return_address(0);
        regs.psw.addr |= PSW_ADDR_AMODE;
        regs.int_code = pgm_int_code;
        regs.int_parm_long = (uaddr & PAGE_MASK) | 2;
        access = write ? VM_WRITE : VM_READ;
        fault = do_exception(&regs, access);
        /*
         * Since the fault happened in kernel mode while performing a uaccess
         * all we need to do now is emulate a fixup in case "fault" is not
         * zero.
         * For the calling uaccess functions this always results in -EFAULT.
         */
        return fault ? -EFAULT : 0;
}

#ifdef CONFIG_PFAULT
/*
 * 'pfault' pseudo page fault routines.
 */
static int pfault_disable;

static int __init nopfault(char *str)
{
        pfault_disable = 1;
        return 1;
}

__setup("nopfault", nopfault);

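/*
 * When running under z/VM, diagnose 0x258 registers a "pfault token" (here
 * the PID of the current task). If the hypervisor has to page in guest
 * memory it delivers an external interrupt 0x2603 carrying the token, so
 * the guest can put just the affected task to sleep and run something
 * else; a second, completion interrupt arrives once the page is available.
 */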
struct pfault_refbk {
        u16 refdiagc;
        u16 reffcode;
        u16 refdwlen;
        u16 refversn;
        u64 refgaddr;
        u64 refselmk;
        u64 refcmpmk;
        u64 reserved;
} __attribute__ ((packed, aligned(8)));

int pfault_init(void)
{
        struct pfault_refbk refbk = {
                .refdiagc = 0x258,
                .reffcode = 0,
                .refdwlen = 5,
                .refversn = 2,
                .refgaddr = __LC_CURRENT_PID,
                .refselmk = 1ULL << 48,
                .refcmpmk = 1ULL << 48,
                .reserved = __PF_RES_FIELD };
        int rc;

        if (pfault_disable)
                return -1;
        asm volatile(
                "       diag    %1,%0,0x258\n"
                "0:     j       2f\n"
                "1:     la      %0,8\n"
                "2:\n"
                EX_TABLE(0b,1b)
                : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc");
        return rc;
}

void pfault_fini(void)
{
        struct pfault_refbk refbk = {
                .refdiagc = 0x258,
                .reffcode = 1,
                .refdwlen = 5,
                .refversn = 2,
        };

        if (pfault_disable)
                return;
        asm volatile(
                "       diag    %0,0,0x258\n"
                "0:\n"
                EX_TABLE(0b,0b)
                : : "a" (&refbk), "m" (refbk) : "cc");
}

static DEFINE_SPINLOCK(pfault_lock);
static LIST_HEAD(pfault_list);

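/*
 * thread.pfault_wait states used below:
 *    0  no pseudo page fault pending for the task
 *    1  initial interrupt received, task is (about to go) sleeping
 *   -1  completion interrupt arrived before the initial interrupt
 */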
static void pfault_interrupt(struct ext_code ext_code,
                             unsigned int param32, unsigned long param64)
{
        struct task_struct *tsk;
        __u16 subcode;
        pid_t pid;

        /*
         * Get the external interruption subcode & pfault
         * initial/completion signal bit. VM stores this
         * in the 'cpu address' field associated with the
         * external interrupt.
         */
        subcode = ext_code.subcode;
        if ((subcode & 0xff00) != __SUBCODE_MASK)
                return;
        inc_irq_stat(IRQEXT_PFL);
        /* Get the token (= pid of the affected task). */
        pid = sizeof(void *) == 4 ? param32 : param64;
        rcu_read_lock();
        tsk = find_task_by_pid_ns(pid, &init_pid_ns);
        if (tsk)
                get_task_struct(tsk);
        rcu_read_unlock();
        if (!tsk)
                return;
        spin_lock(&pfault_lock);
        if (subcode & 0x0080) {
                /* signal bit is set -> a page has been swapped in by VM */
                if (tsk->thread.pfault_wait == 1) {
                        /* Initial interrupt was faster than the completion
                         * interrupt. pfault_wait is valid. Set pfault_wait
                         * back to zero and wake up the process. This can
                         * safely be done because the task is still sleeping
                         * and can't produce new pfaults. */
                        tsk->thread.pfault_wait = 0;
                        list_del(&tsk->thread.list);
                        wake_up_process(tsk);
                        put_task_struct(tsk);
                } else {
                        /* Completion interrupt was faster than initial
                         * interrupt. Set pfault_wait to -1 so the initial
                         * interrupt doesn't put the task to sleep.
                         * If the task is not running, ignore the completion
                         * interrupt since it must be a leftover of a PFAULT
                         * CANCEL operation which didn't remove all pending
                         * completion interrupts. */
                        if (tsk->state == TASK_RUNNING)
                                tsk->thread.pfault_wait = -1;
                }
        } else {
                /* signal bit not set -> a real page is missing. */
                if (WARN_ON_ONCE(tsk != current))
                        goto out;
                if (tsk->thread.pfault_wait == 1) {
                        /* Already on the list with a reference: put to sleep */
                        __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
                        set_tsk_need_resched(tsk);
                } else if (tsk->thread.pfault_wait == -1) {
                        /* Completion interrupt was faster than the initial
                         * interrupt (pfault_wait == -1). Set pfault_wait
                         * back to zero and exit. */
                        tsk->thread.pfault_wait = 0;
                } else {
                        /* Initial interrupt arrived before completion
                         * interrupt. Let the task sleep.
                         * An extra task reference is needed since a different
                         * cpu may set the task state to TASK_RUNNING again
                         * before the scheduler is reached. */
                        get_task_struct(tsk);
                        tsk->thread.pfault_wait = 1;
                        list_add(&tsk->thread.list, &pfault_list);
                        __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
                        set_tsk_need_resched(tsk);
                }
        }
out:
        spin_unlock(&pfault_lock);
        put_task_struct(tsk);
}

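/*
 * CPU hotplug notifier: when a CPU is taken down, wake up every task that
 * still waits for a pfault completion interrupt so none of them sleeps
 * forever.
 */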
static int pfault_cpu_notify(struct notifier_block *self, unsigned long action,
                             void *hcpu)
{
        struct thread_struct *thread, *next;
        struct task_struct *tsk;

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_DEAD:
                spin_lock_irq(&pfault_lock);
                list_for_each_entry_safe(thread, next, &pfault_list, list) {
                        thread->pfault_wait = 0;
                        list_del(&thread->list);
                        tsk = container_of(thread, struct task_struct, thread);
                        wake_up_process(tsk);
                        put_task_struct(tsk);
                }
                spin_unlock_irq(&pfault_lock);
                break;
        default:
                break;
        }
        return NOTIFY_OK;
}

static int __init pfault_irq_init(void)
{
        int rc;

        rc = register_external_interrupt(0x2603, pfault_interrupt);
        if (rc)
                goto out_extint;
        rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
        if (rc)
                goto out_pfault;
        irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
        hotcpu_notifier(pfault_cpu_notify, 0);
        return 0;

out_pfault:
        unregister_external_interrupt(0x2603, pfault_interrupt);
out_extint:
        pfault_disable = 1;
        return rc;
}
early_initcall(pfault_irq_init);

#endif /* CONFIG_PFAULT */