linux/kernel/entry/common.c
// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/highmem.h>
#include <linux/livepatch.h>
#include <linux/audit.h>

#include "common.h"

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/* See comment for enter_from_user_mode() in entry-common.h */
static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
        arch_check_user_regs(regs);
        lockdep_hardirqs_off(CALLER_ADDR0);

        CT_WARN_ON(ct_state() != CONTEXT_USER);
        user_exit_irqoff();

        instrumentation_begin();
        trace_hardirqs_off_finish();
        instrumentation_end();
}

void noinstr enter_from_user_mode(struct pt_regs *regs)
{
        __enter_from_user_mode(regs);
}

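/*
 * Report the syscall and its first four arguments to the audit subsystem
 * when an audit context is active for the current task.
 */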
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
        if (unlikely(audit_context())) {
                unsigned long args[6];

                syscall_get_arguments(current, regs, args);
                audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
        }
}

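/*
 * Handle the SYSCALL_WORK_ENTER work before the syscall is invoked:
 * syscall user dispatch, ptrace/emulation, seccomp, the sys_enter
 * tracepoint and audit. Returns the (possibly changed) syscall number,
 * or -1 when the syscall should be skipped.
 */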
static long syscall_trace_enter(struct pt_regs *regs, long syscall,
                                unsigned long work)
{
        long ret = 0;

        /*
         * Handle Syscall User Dispatch. This must come first, since
         * the ABI here can be something that doesn't make sense for
         * other syscall_work features.
         */
        if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
                if (syscall_user_dispatch(regs))
                        return -1L;
        }

        /* Handle ptrace */
        if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
                ret = arch_syscall_enter_tracehook(regs);
                if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
                        return -1L;
        }

        /* Do seccomp after ptrace, to catch any tracer changes. */
        if (work & SYSCALL_WORK_SECCOMP) {
                ret = __secure_computing(NULL);
                if (ret == -1L)
                        return ret;
        }

        /* Either of the above might have changed the syscall number */
        syscall = syscall_get_nr(current, regs);

        if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
                trace_sys_enter(regs, syscall);

        syscall_enter_audit(regs, syscall);

        return ret ? : syscall;
}

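/* Common syscall entry work, shared by the _work and noinstr variants below. */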
static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
        unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

        if (work & SYSCALL_WORK_ENTER)
                syscall = syscall_trace_enter(regs, syscall, work);

        return syscall;
}

long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
        return __syscall_enter_from_user_work(regs, syscall);
}

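/*
 * Combined variant: establish kernel context, enable interrupts and run
 * the syscall entry work in one call.
 */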
noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
        long ret;

        __enter_from_user_mode(regs);

        instrumentation_begin();
        local_irq_enable();
        ret = __syscall_enter_from_user_work(regs, syscall);
        instrumentation_end();

        return ret;
}

noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
        __enter_from_user_mode(regs);
        instrumentation_begin();
        local_irq_enable();
        instrumentation_end();
}

/* See comment for exit_to_user_mode() in entry-common.h */
static __always_inline void __exit_to_user_mode(void)
{
        instrumentation_begin();
        trace_hardirqs_on_prepare();
        lockdep_hardirqs_on_prepare(CALLER_ADDR0);
        instrumentation_end();

        user_enter_irqoff();
        arch_exit_to_user_mode();
        lockdep_hardirqs_on(CALLER_ADDR0);
}

void noinstr exit_to_user_mode(void)
{
        __exit_to_user_mode();
}

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }

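/* Process TIF_NOTIFY_SIGNAL and deliver pending signals via the arch hook. */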
static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
{
        if (ti_work & _TIF_NOTIFY_SIGNAL)
                tracehook_notify_signal();

        arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
}

static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
                                            unsigned long ti_work)
{
        /*
         * Before returning to user space ensure that all pending work
         * items have been completed.
         */
        while (ti_work & EXIT_TO_USER_MODE_WORK) {

                local_irq_enable_exit_to_user(ti_work);

                if (ti_work & _TIF_NEED_RESCHED)
                        schedule();

                if (ti_work & _TIF_UPROBE)
                        uprobe_notify_resume(regs);

                if (ti_work & _TIF_PATCH_PENDING)
                        klp_update_patch_state(current);

                if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
                        handle_signal_work(regs, ti_work);

                if (ti_work & _TIF_NOTIFY_RESUME) {
                        tracehook_notify_resume(regs);
                        rseq_handle_notify_resume(NULL, regs);
                }

                /* Architecture specific TIF work */
                arch_exit_to_user_mode_work(regs, ti_work);

                /*
                 * Disable interrupts and reevaluate the work flags as they
                 * might have changed while interrupts and preemption were
                 * enabled above.
                 */
                local_irq_disable_exit_to_user();

                /* Check if any of the above work has queued a deferred wakeup */
                rcu_nocb_flush_deferred_wakeup();

                ti_work = READ_ONCE(current_thread_info()->flags);
        }

        /* Return the latest work state for arch_exit_to_user_mode() */
        return ti_work;
}

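/*
 * Final work before returning to user space: run the exit-to-user work
 * loop if any work is pending, let the architecture do its own
 * preparation, then run the exit time sanity checks.
 */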
static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
        unsigned long ti_work = READ_ONCE(current_thread_info()->flags);

        lockdep_assert_irqs_disabled();

        /* Flush pending rcuog wakeup before the last need_resched() check */
        rcu_nocb_flush_deferred_wakeup();

        if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
                ti_work = exit_to_user_mode_loop(regs, ti_work);

        arch_exit_to_user_mode_prepare(regs, ti_work);

        /* Ensure that the address limit is intact and no locks are held */
        addr_limit_user_check();
        kmap_assert_nomap();
        lockdep_assert_irqs_disabled();
        lockdep_sys_exit();
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). In that case the
 * syscall instruction has already been reported in
 * syscall_enter_from_user_mode().
 */
static inline bool report_single_step(unsigned long work)
{
        if (work & SYSCALL_WORK_SYSCALL_EMU)
                return false;

        return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}

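/* Run the SYSCALL_WORK_EXIT work: audit, tracepoint and ptrace reporting. */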
static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
        bool step;

        /*
         * If the syscall was rolled back due to syscall user dispatching,
         * then the tracers below are not invoked for the same reason as
         * the entry side was not invoked in syscall_trace_enter(): The ABI
         * of these syscalls is unknown.
         */
        if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
                if (unlikely(current->syscall_dispatch.on_dispatch)) {
                        current->syscall_dispatch.on_dispatch = false;
                        return;
                }
        }

        audit_syscall_exit(regs);

        if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
                trace_sys_exit(regs, syscall_get_return_value(current, regs));

        step = report_single_step(work);
        if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
                arch_syscall_exit_tracehook(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
        unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
        unsigned long nr = syscall_get_nr(current, regs);

        CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

        if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
                if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
                        local_irq_enable();
        }

        rseq_syscall(regs);

        /*
         * Do one-time syscall specific work. If these work items are
         * enabled, we want to run them exactly once per syscall exit with
         * interrupts enabled.
         */
        if (unlikely(work & SYSCALL_WORK_EXIT))
                syscall_exit_work(regs, work);
}

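/*
 * Combine the syscall specific and the generic exit work. Interrupts are
 * enabled on entry and disabled before exit_to_user_mode_prepare().
 */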
static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
        syscall_exit_to_user_mode_prepare(regs);
        local_irq_disable_exit_to_user();
        exit_to_user_mode_prepare(regs);
}

void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
        __syscall_exit_to_user_mode_work(regs);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        __syscall_exit_to_user_mode_work(regs);
        instrumentation_end();
        __exit_to_user_mode();
}

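/*
 * Interrupt entry/exit counterparts of enter_from_user_mode() and
 * exit_to_user_mode(); the exit variant also runs the exit-to-user work.
 */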
noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
        __enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        exit_to_user_mode_prepare(regs);
        instrumentation_end();
        __exit_to_user_mode();
}

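/*
 * Establish the proper lockdep, RCU and tracing state for an interrupt
 * or exception entry from kernel or user mode. The returned state must
 * be passed to irqentry_exit().
 */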
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
        irqentry_state_t ret = {
                .exit_rcu = false,
        };

        if (user_mode(regs)) {
                irqentry_enter_from_user_mode(regs);
                return ret;
        }

        /*
         * If this entry hit the idle task, invoke rcu_irq_enter() whether
         * RCU is watching or not.
         *
         * Interrupts can nest when the first interrupt invokes softirq
         * processing on return which enables interrupts.
         *
         * Scheduler ticks in the idle task can mark quiescent state and
         * terminate a grace period, if and only if the timer interrupt is
         * not nested into another interrupt.
         *
         * Checking for rcu_is_watching() here would prevent the nesting
         * interrupt from invoking rcu_irq_enter(). If that nested interrupt
         * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
         * assume that it is the first interrupt and eventually claim
         * quiescent state and end grace periods prematurely.
         *
         * Unconditionally invoke rcu_irq_enter() so RCU state stays
         * consistent.
         *
         * TINY_RCU does not support EQS, so let the compiler eliminate
         * this part when it is enabled.
         */
        if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
                /*
                 * If RCU is not watching then the same careful
                 * sequence vs. lockdep and tracing is required
                 * as in irqentry_enter_from_user_mode().
                 */
                lockdep_hardirqs_off(CALLER_ADDR0);
                rcu_irq_enter();
                instrumentation_begin();
                trace_hardirqs_off_finish();
                instrumentation_end();

                ret.exit_rcu = true;
                return ret;
        }

        /*
         * If RCU is watching then RCU only wants to check whether it needs
         * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
         * already contains a warning when RCU is not watching, so no point
         * in having another one here.
         */
        lockdep_hardirqs_off(CALLER_ADDR0);
        instrumentation_begin();
        rcu_irq_enter_check_tick();
        trace_hardirqs_off_finish();
        instrumentation_end();

        return ret;
}

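/*
 * Reschedule on interrupt exit to the kernel if preemption is possible
 * and a reschedule is pending.
 */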
void irqentry_exit_cond_resched(void)
{
        if (!preempt_count()) {
                /* Sanity check RCU and thread stack */
                rcu_irq_exit_check_preempt();
                if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
                        WARN_ON_ONCE(!on_thread_stack());
                if (need_resched())
                        preempt_schedule_irq();
        }
}
#ifdef CONFIG_PREEMPT_DYNAMIC
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
#endif

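/*
 * Undo irqentry_enter(): handle the return to user mode, restore the IRQ
 * tracing state, optionally reschedule, and exit RCU if irqentry_enter()
 * entered it.
 */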
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
        lockdep_assert_irqs_disabled();

        /* Check whether this returns to user mode */
        if (user_mode(regs)) {
                irqentry_exit_to_user_mode(regs);
        } else if (!regs_irqs_disabled(regs)) {
                /*
                 * If RCU was not watching on entry this needs to be done
                 * carefully and needs the same ordering of lockdep/tracing
                 * and RCU as the return to user mode path.
                 */
                if (state.exit_rcu) {
                        instrumentation_begin();
                        /* Tell the tracer that IRET will enable interrupts */
                        trace_hardirqs_on_prepare();
                        lockdep_hardirqs_on_prepare(CALLER_ADDR0);
                        instrumentation_end();
                        rcu_irq_exit();
                        lockdep_hardirqs_on(CALLER_ADDR0);
                        return;
                }

                instrumentation_begin();
                if (IS_ENABLED(CONFIG_PREEMPTION)) {
#ifdef CONFIG_PREEMPT_DYNAMIC
                        static_call(irqentry_exit_cond_resched)();
#else
                        irqentry_exit_cond_resched();
#endif
                }
                /* Covers both tracing and lockdep */
                trace_hardirqs_on();
                instrumentation_end();
        } else {
                /*
                 * IRQ flags state is correct already. Just tell RCU if it
                 * was not watching on entry.
                 */
                if (state.exit_rcu)
                        rcu_irq_exit();
        }
}

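/*
 * NMI entry: record the lockdep hardirq state, establish NMI nesting and
 * notify RCU, lockdep and ftrace. The returned state must be passed to
 * irqentry_nmi_exit().
 */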
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
        irqentry_state_t irq_state;

        irq_state.lockdep = lockdep_hardirqs_enabled();

        __nmi_enter();
        lockdep_hardirqs_off(CALLER_ADDR0);
        lockdep_hardirq_enter();
        rcu_nmi_enter();

        instrumentation_begin();
        trace_hardirqs_off_finish();
        ftrace_nmi_enter();
        instrumentation_end();

        return irq_state;
}

void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
        instrumentation_begin();
        ftrace_nmi_exit();
        if (irq_state.lockdep) {
                trace_hardirqs_on_prepare();
                lockdep_hardirqs_on_prepare(CALLER_ADDR0);
        }
        instrumentation_end();

        rcu_nmi_exit();
        lockdep_hardirq_exit();
        if (irq_state.lockdep)
                lockdep_hardirqs_on(CALLER_ADDR0);
        __nmi_exit();
}