linux/arch/powerpc/kernel/interrupt.c
// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/context_tracking.h>
#include <linux/err.h>
#include <linux/compat.h>
#include <linux/sched/debug.h> /* for show_regs */

#include <asm/asm-prototypes.h>
#include <asm/kup.h>
#include <asm/cputime.h>
#include <asm/interrupt.h>
#include <asm/hw_irq.h>
#include <asm/kprobes.h>
#include <asm/paca.h>
#include <asm/ptrace.h>
#include <asm/reg.h>
#include <asm/signal.h>
#include <asm/switch_to.h>
#include <asm/syscall.h>
#include <asm/time.h>
#include <asm/unistd.h>

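/*
 * Per-CPU save slot for the DBCR0 value in effect before a thread's debug
 * settings are installed (see booke_load_dbcr0() below); 32-bit BookE only.
 */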
#if defined(CONFIG_PPC_ADV_DEBUG_REGS) && defined(CONFIG_PPC32)
unsigned long global_dbcr0[NR_CPUS];
#endif

typedef long (*syscall_fn)(long, long, long, long, long, long);

#ifdef CONFIG_PPC_BOOK3S_64
DEFINE_STATIC_KEY_FALSE(interrupt_exit_not_reentrant);
static inline bool exit_must_hard_disable(void)
{
        return static_branch_unlikely(&interrupt_exit_not_reentrant);
}
#else
static inline bool exit_must_hard_disable(void)
{
        return true;
}
#endif

/*
 * local irqs must be disabled. Returns false if the caller must re-enable
 * them, check for new work, and try again.
 *
 * This should be called with local irqs disabled, but if they were previously
 * enabled when the interrupt handler returns (indicating a process-context /
 * synchronous interrupt) then irqs_enabled should be true.
 *
 * If restartable is true then EE/RI can be left on because interrupts are
 * handled with a restart sequence.
 */
static notrace __always_inline bool prep_irq_for_enabled_exit(bool restartable)
{
        /* This must be done with RI=1 because tracing may touch vmaps */
        trace_hardirqs_on();

        if (exit_must_hard_disable() || !restartable)
                __hard_EE_RI_disable();

#ifdef CONFIG_PPC64
        /* This pattern matches prep_irq_for_idle */
        if (unlikely(lazy_irq_pending_nocheck())) {
                if (exit_must_hard_disable() || !restartable) {
                        local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
                        __hard_RI_enable();
                }
                trace_hardirqs_off();

                return false;
        }
#endif
        return true;
}

/* Has to run notrace because it is entered not completely "reconciled" */
notrace long system_call_exception(long r3, long r4, long r5,
                                   long r6, long r7, long r8,
                                   unsigned long r0, struct pt_regs *regs)
{
        syscall_fn f;

        kuep_lock();

        regs->orig_gpr3 = r3;

        if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
                BUG_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED);

        trace_hardirqs_off(); /* finish reconciling */

        CT_WARN_ON(ct_state() == CONTEXT_KERNEL);
        user_exit_irqoff();

        if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x))
                BUG_ON(!(regs->msr & MSR_RI));
        BUG_ON(!(regs->msr & MSR_PR));
        BUG_ON(arch_irq_disabled_regs(regs));

#ifdef CONFIG_PPC_PKEY
        if (mmu_has_feature(MMU_FTR_PKEY)) {
                unsigned long amr, iamr;
                bool flush_needed = false;
                /*
                 * When entering from userspace we mostly have the AMR/IAMR
                 * different from kernel default values. Hence don't compare.
                 */
                amr = mfspr(SPRN_AMR);
                iamr = mfspr(SPRN_IAMR);
                regs->amr  = amr;
                regs->iamr = iamr;
                if (mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) {
                        mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
                        flush_needed = true;
                }
                if (mmu_has_feature(MMU_FTR_BOOK3S_KUEP)) {
                        mtspr(SPRN_IAMR, AMR_KUEP_BLOCKED);
                        flush_needed = true;
                }
                if (flush_needed)
                        isync();
        } else
#endif
                kuap_assert_locked();

        booke_restore_dbcr0();

        account_cpu_user_entry();

        account_stolen_time();

        /*
         * This is not required for the syscall exit path, but makes the
         * stack frame look nicer. If this was initialised in the first stack
         * frame, or if the unwinder was taught the first stack frame always
         * returns to user with IRQS_ENABLED, this store could be avoided!
         */
        irq_soft_mask_regs_set_state(regs, IRQS_ENABLED);

        local_irq_enable();

        if (unlikely(current_thread_info()->flags & _TIF_SYSCALL_DOTRACE)) {
                if (unlikely(trap_is_unsupported_scv(regs))) {
                        /* Unsupported scv vector */
                        _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
                        return regs->gpr[3];
                }
                /*
                 * We use the return value of do_syscall_trace_enter() as the
                 * syscall number. If the syscall was rejected for any reason,
                 * do_syscall_trace_enter() returns an invalid syscall number
                 * so that the test against NR_syscalls below fails, and the
                 * return value to be used is already in regs->gpr[3].
                 */
                r0 = do_syscall_trace_enter(regs);
                if (unlikely(r0 >= NR_syscalls))
                        return regs->gpr[3];
                r3 = regs->gpr[3];
                r4 = regs->gpr[4];
                r5 = regs->gpr[5];
                r6 = regs->gpr[6];
                r7 = regs->gpr[7];
                r8 = regs->gpr[8];

        } else if (unlikely(r0 >= NR_syscalls)) {
                if (unlikely(trap_is_unsupported_scv(regs))) {
                        /* Unsupported scv vector */
                        _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
                        return regs->gpr[3];
                }
                return -ENOSYS;
        }

        /* May be faster to do array_index_nospec? */
        barrier_nospec();

        if (unlikely(is_compat_task())) {
                f = (void *)compat_sys_call_table[r0];

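                /* Zero-extend the 32-bit syscall arguments from a compat task. */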
                r3 &= 0x00000000ffffffffULL;
                r4 &= 0x00000000ffffffffULL;
                r5 &= 0x00000000ffffffffULL;
                r6 &= 0x00000000ffffffffULL;
                r7 &= 0x00000000ffffffffULL;
                r8 &= 0x00000000ffffffffULL;

        } else {
                f = (void *)sys_call_table[r0];
        }

        return f(r3, r4, r5, r6, r7, r8);
}

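/*
 * Load the thread's DBCR0 on return to userspace when internal debug mode is
 * enabled, saving the currently installed DBCR0 value (per CPU on 32-bit)
 * first.
 */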
static notrace void booke_load_dbcr0(void)
{
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
        unsigned long dbcr0 = current->thread.debug.dbcr0;

        if (likely(!(dbcr0 & DBCR0_IDM)))
                return;

        /*
         * Check to see if the dbcr0 register is set up to debug.
         * Use the internal debug mode bit to do this.
         */
        mtmsr(mfmsr() & ~MSR_DE);
        if (IS_ENABLED(CONFIG_PPC32)) {
                isync();
                global_dbcr0[smp_processor_id()] = mfspr(SPRN_DBCR0);
        }
        mtspr(SPRN_DBCR0, dbcr0);
        mtspr(SPRN_DBSR, -1);
#endif
}

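/*
 * Warn (once) if the SRR0/SRR1 (or HSRR0/HSRR1) values that will be used for
 * the return do not match regs->nip and regs->msr, which would indicate the
 * saved registers were clobbered without srr_valid/hsrr_valid being cleared.
 */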
static void check_return_regs_valid(struct pt_regs *regs)
{
#ifdef CONFIG_PPC_BOOK3S_64
        unsigned long trap, srr0, srr1;
        static bool warned;
        u8 *validp;
        char *h;

        if (trap_is_scv(regs))
                return;

        trap = regs->trap;
        // EE in HV mode sets HSRRs like 0xea0
        if (cpu_has_feature(CPU_FTR_HVMODE) && trap == INTERRUPT_EXTERNAL)
                trap = 0xea0;

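        /* These vectors are delivered via HSRR0/HSRR1 rather than SRR0/SRR1. */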
        switch (trap) {
        case 0x980:
        case INTERRUPT_H_DATA_STORAGE:
        case 0xe20:
        case 0xe40:
        case INTERRUPT_HMI:
        case 0xe80:
        case 0xea0:
        case INTERRUPT_H_FAC_UNAVAIL:
        case 0x1200:
        case 0x1500:
        case 0x1600:
        case 0x1800:
                validp = &local_paca->hsrr_valid;
                if (!*validp)
                        return;

                srr0 = mfspr(SPRN_HSRR0);
                srr1 = mfspr(SPRN_HSRR1);
                h = "H";

                break;
        default:
                validp = &local_paca->srr_valid;
                if (!*validp)
                        return;

                srr0 = mfspr(SPRN_SRR0);
                srr1 = mfspr(SPRN_SRR1);
                h = "";
                break;
        }

        if (srr0 == regs->nip && srr1 == regs->msr)
                return;

        /*
         * A NMI / soft-NMI interrupt may have come in after we found
         * srr_valid and before the SRRs are loaded. The interrupt then
         * comes in and clobbers SRRs and clears srr_valid. Then we load
         * the SRRs here and test them above and find they don't match.
         *
         * Test validity again after that, to catch such false positives.
         *
         * This test in general will have some window for false negatives
         * and may not catch and fix all such cases if an NMI comes in
         * later and clobbers SRRs without clearing srr_valid, but hopefully
         * such things will get caught most of the time, statistically
         * enough to be able to get a warning out.
         */
        barrier();

        if (!*validp)
                return;

        if (!warned) {
                warned = true;
                printk("%sSRR0 was: %lx should be: %lx\n", h, srr0, regs->nip);
                printk("%sSRR1 was: %lx should be: %lx\n", h, srr1, regs->msr);
                show_regs(regs);
        }

        *validp = 0; /* fixup */
#endif
}

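/*
 * Common return-to-user work: handle pending TIF work (reschedule, signals)
 * with irqs enabled, restore TM/FP/VEC/VSX state as required, then prepare
 * for the final exit with irqs hard-disabled. Returns the (possibly updated)
 * ret flags for the exit assembly.
 */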
static notrace unsigned long
interrupt_exit_user_prepare_main(unsigned long ret, struct pt_regs *regs)
{
        unsigned long ti_flags;

again:
        ti_flags = READ_ONCE(current_thread_info()->flags);
        while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
                local_irq_enable();
                if (ti_flags & _TIF_NEED_RESCHED) {
                        schedule();
                } else {
                        /*
                         * SIGPENDING must restore signal handler function
                         * argument GPRs, and some non-volatiles (e.g., r1).
                         * Restore all for now. This could be made lighter.
                         */
                        if (ti_flags & _TIF_SIGPENDING)
                                ret |= _TIF_RESTOREALL;
                        do_notify_resume(regs, ti_flags);
                }
                local_irq_disable();
                ti_flags = READ_ONCE(current_thread_info()->flags);
        }

        if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && IS_ENABLED(CONFIG_PPC_FPU)) {
                if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
                                unlikely((ti_flags & _TIF_RESTORE_TM))) {
                        restore_tm_state(regs);
                } else {
                        unsigned long mathflags = MSR_FP;

                        if (cpu_has_feature(CPU_FTR_VSX))
                                mathflags |= MSR_VEC | MSR_VSX;
                        else if (cpu_has_feature(CPU_FTR_ALTIVEC))
                                mathflags |= MSR_VEC;

                        /*
                         * If userspace MSR has all available FP bits set,
                         * then they are live and no need to restore. If not,
                         * it means the regs were given up and restore_math
                         * may decide to restore them (to avoid taking an FP
                         * fault).
                         */
                        if ((regs->msr & mathflags) != mathflags)
                                restore_math(regs);
                }
        }

        check_return_regs_valid(regs);

        user_enter_irqoff();
        if (!prep_irq_for_enabled_exit(true)) {
                user_exit_irqoff();
                local_irq_enable();
                local_irq_disable();
                goto again;
        }

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
        local_paca->tm_scratch = regs->msr;
#endif

        booke_load_dbcr0();

        account_cpu_user_exit();

        /* Restore user access locks last */
        kuap_user_restore(regs);
        kuep_unlock();

        return ret;
}

/*
 * This should be called after a syscall returns, with r3 the return value
 * from the syscall. If this function returns non-zero, the system call
 * exit assembly should additionally load all GPR registers and CTR and XER
 * from the interrupt frame.
 *
 * The function graph tracer can not trace the return side of this function,
 * because RI=0 and soft mask state is "unreconciled", so it is marked notrace.
 */
notrace unsigned long syscall_exit_prepare(unsigned long r3,
                                           struct pt_regs *regs,
                                           long scv)
{
        unsigned long ti_flags;
        unsigned long ret = 0;
        bool is_not_scv = !IS_ENABLED(CONFIG_PPC_BOOK3S_64) || !scv;

        CT_WARN_ON(ct_state() == CONTEXT_USER);

        kuap_assert_locked();

        regs->result = r3;

        /* Check whether the syscall is issued inside a restartable sequence */
        rseq_syscall(regs);

        ti_flags = current_thread_info()->flags;

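        /*
         * With the sc ABI, a failed syscall is indicated by setting CR0[SO]
         * and returning the positive errno value; scv indicates failure by
         * returning the negative errno directly, so it is skipped here.
         */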
        if (unlikely(r3 >= (unsigned long)-MAX_ERRNO) && is_not_scv) {
                if (likely(!(ti_flags & (_TIF_NOERROR | _TIF_RESTOREALL)))) {
                        r3 = -r3;
                        regs->ccr |= 0x10000000; /* Set SO bit in CR */
                }
        }

        if (unlikely(ti_flags & _TIF_PERSYSCALL_MASK)) {
                if (ti_flags & _TIF_RESTOREALL)
                        ret = _TIF_RESTOREALL;
                else
                        regs->gpr[3] = r3;
                clear_bits(_TIF_PERSYSCALL_MASK, &current_thread_info()->flags);
        } else {
                regs->gpr[3] = r3;
        }

        if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) {
                do_syscall_trace_leave(regs);
                ret |= _TIF_RESTOREALL;
        }

        local_irq_disable();
        ret = interrupt_exit_user_prepare_main(ret, regs);

#ifdef CONFIG_PPC64
        regs->exit_result = ret;
#endif

        return ret;
}

#ifdef CONFIG_PPC64
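/*
 * Restarted syscall exit: hard-disable interrupts again, re-block KUAP, and
 * redo the common return-to-user preparation (see the comment below).
 */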
notrace unsigned long syscall_exit_restart(unsigned long r3, struct pt_regs *regs)
{
        /*
         * This is called when detecting a soft-pending interrupt as well as
         * an alternate-return interrupt. So we can't just have the alternate
         * return path clear SRR1[MSR] and set PACA_IRQ_HARD_DIS (unless
         * the soft-pending case were to fix things up as well). RI might be
         * disabled, in which case it gets re-enabled by __hard_irq_disable().
         */
        __hard_irq_disable();
        local_paca->irq_happened |= PACA_IRQ_HARD_DIS;

#ifdef CONFIG_PPC_BOOK3S_64
        set_kuap(AMR_KUAP_BLOCKED);
#endif

        trace_hardirqs_off();
        user_exit_irqoff();
        account_cpu_user_entry();

        BUG_ON(!user_mode(regs));

        regs->exit_result = interrupt_exit_user_prepare_main(regs->exit_result, regs);

        return regs->exit_result;
}
#endif

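/*
 * Prepare to return from an interrupt taken in user mode: sanity-check regs,
 * disable local irqs, and run the common return-to-user preparation.
 */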
notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs)
{
        unsigned long ret;

        if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x))
                BUG_ON(!(regs->msr & MSR_RI));
        BUG_ON(!(regs->msr & MSR_PR));
        BUG_ON(arch_irq_disabled_regs(regs));
        CT_WARN_ON(ct_state() == CONTEXT_USER);

        /*
         * We don't need to restore AMR on the way back to userspace for KUAP.
         * AMR can only have been unlocked if we interrupted the kernel.
         */
        kuap_assert_locked();

        local_irq_disable();

        ret = interrupt_exit_user_prepare_main(0, regs);

#ifdef CONFIG_PPC64
        regs->exit_result = ret;
#endif

        return ret;
}

void preempt_schedule_irq(void);

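/*
 * Prepare to return from an interrupt taken in kernel mode. Handles
 * preemption and soft-masked interrupt replay when returning to an
 * irqs-enabled context. Returns non-zero if the exit assembly must emulate
 * a stack store (_TIF_EMULATE_STACK_STORE).
 */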
notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs)
{
        unsigned long flags;
        unsigned long ret = 0;
        unsigned long kuap;
        bool stack_store = current_thread_info()->flags &
                                                _TIF_EMULATE_STACK_STORE;

        if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x) &&
            unlikely(!(regs->msr & MSR_RI)))
                unrecoverable_exception(regs);
        BUG_ON(regs->msr & MSR_PR);
        /*
         * CT_WARN_ON comes here via program_check_exception,
         * so avoid recursion.
         */
        if (TRAP(regs) != INTERRUPT_PROGRAM)
                CT_WARN_ON(ct_state() == CONTEXT_USER);

        kuap = kuap_get_and_assert_locked();

        local_irq_save(flags);

        if (!arch_irq_disabled_regs(regs)) {
                /* Returning to a kernel context with local irqs enabled. */
                WARN_ON_ONCE(!(regs->msr & MSR_EE));
again:
                if (IS_ENABLED(CONFIG_PREEMPT)) {
                        /* Return to preemptible kernel context */
                        if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED)) {
                                if (preempt_count() == 0)
                                        preempt_schedule_irq();
                        }
                }

                check_return_regs_valid(regs);

                /*
                 * Stack store exit can't be restarted because the interrupt
                 * stack frame might have been clobbered.
                 */
                if (!prep_irq_for_enabled_exit(unlikely(stack_store))) {
                        /*
                         * Replay pending soft-masked interrupts now. Don't
                         * just local_irq_enable(); local_irq_disable(); because
                         * if we are returning from an asynchronous interrupt
                         * here, another one might hit after irqs are enabled,
                         * and it would exit via this same path allowing
                         * another to fire, and so on unbounded.
                         */
                        hard_irq_disable();
                        replay_soft_interrupts();
                        /* Took an interrupt, may have more exit work to do. */
                        goto again;
                }
#ifdef CONFIG_PPC64
                /*
                 * An interrupt may clear MSR[EE] and set this concurrently,
                 * but it will be marked pending and the exit will be retried.
                 * This leaves a racy window where MSR[EE]=0 and HARD_DIS is
                 * clear, until interrupt_exit_kernel_restart() calls
                 * hard_irq_disable(), which will set HARD_DIS again.
                 */
                local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;

        } else {
                check_return_regs_valid(regs);

                if (unlikely(stack_store))
                        __hard_EE_RI_disable();
                /*
                 * Returning to a kernel context with local irqs disabled.
                 * Here, if EE was enabled in the interrupted context, enable
                 * it on return as well. A problem exists here where a soft
                 * masked interrupt may have cleared MSR[EE] and set HARD_DIS
                 * here, and it will still exist on return to the caller. This
                 * will be resolved by the masked interrupt firing again.
                 */
                if (regs->msr & MSR_EE)
                        local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
#endif /* CONFIG_PPC64 */
        }

        if (unlikely(stack_store)) {
                clear_bits(_TIF_EMULATE_STACK_STORE, &current_thread_info()->flags);
                ret = 1;
        }

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
        local_paca->tm_scratch = regs->msr;
#endif

        /*
         * 64s does not want to mfspr(SPRN_AMR) here, because this comes after
         * mtmsr, which would cause Read-After-Write stalls. Hence, take the
         * AMR value from the check above.
         */
        kuap_kernel_restore(regs, kuap);

        return ret;
}

#ifdef CONFIG_PPC64
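/*
 * Restarted return-to-user exit: hard-disable interrupts, re-block KUAP, and
 * redo interrupt_exit_user_prepare(), accumulating the exit_result flags.
 */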
notrace unsigned long interrupt_exit_user_restart(struct pt_regs *regs)
{
        __hard_irq_disable();
        local_paca->irq_happened |= PACA_IRQ_HARD_DIS;

#ifdef CONFIG_PPC_BOOK3S_64
        set_kuap(AMR_KUAP_BLOCKED);
#endif

        trace_hardirqs_off();
        user_exit_irqoff();
        account_cpu_user_entry();

        BUG_ON(!user_mode(regs));

        regs->exit_result |= interrupt_exit_user_prepare(regs);

        return regs->exit_result;
}

/*
 * No real need to return a value here because the stack store case does not
 * get restarted.
 */
notrace unsigned long interrupt_exit_kernel_restart(struct pt_regs *regs)
{
        __hard_irq_disable();
        local_paca->irq_happened |= PACA_IRQ_HARD_DIS;

#ifdef CONFIG_PPC_BOOK3S_64
        set_kuap(AMR_KUAP_BLOCKED);
#endif

        if (regs->softe == IRQS_ENABLED)
                trace_hardirqs_off();

        BUG_ON(user_mode(regs));

        return interrupt_exit_kernel_prepare(regs);
}
#endif