linux/arch/x86/kernel/process.c
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/tick.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;

#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif
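/*
 * The idle notifier chain above lets other kernel code (e.g. drivers) get a
 * callback when a CPU enters (IDLE_START) or leaves (IDLE_END) the idle
 * loop.  A minimal sketch of a user of this interface; the callback and
 * notifier_block names here are hypothetical:
 *
 *      static int my_idle_notify(struct notifier_block *nb,
 *                                unsigned long action, void *data)
 *      {
 *              if (action == IDLE_START)
 *                      pr_debug("CPU about to idle\n");
 *              return NOTIFY_OK;
 *      }
 *
 *      static struct notifier_block my_idle_nb = {
 *              .notifier_call = my_idle_notify,
 *      };
 *
 *      idle_notifier_register(&my_idle_nb);
 */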

struct kmem_cache *task_xstate_cachep;
EXPORT_SYMBOL_GPL(task_xstate_cachep);

/*
 * This gets called so that we can store the lazy FPU state into memory and
 * copy the current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
        int ret;

        *dst = *src;
        if (fpu_allocated(&src->thread.fpu)) {
                memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
                ret = fpu_alloc(&dst->thread.fpu);
                if (ret)
                        return ret;
                fpu_copy(dst, src);
        }
        return 0;
}

void free_thread_xstate(struct task_struct *tsk)
{
        fpu_free(&tsk->thread.fpu);
}

void arch_release_task_struct(struct task_struct *tsk)
{
        free_thread_xstate(tsk);
}

void arch_task_cache_init(void)
{
        task_xstate_cachep =
                kmem_cache_create("task_xstate", xstate_size,
                                  __alignof__(union thread_xstate),
                                  SLAB_PANIC | SLAB_NOTRACK, NULL);
}
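/*
 * Note: task_xstate_cachep created above is the slab cache that fpu_alloc()
 * is expected to allocate per-task extended register state from (hence the
 * xstate_size object size); SLAB_PANIC makes a failed cache creation panic
 * at boot rather than leave the pointer NULL.
 */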

/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
        struct task_struct *me = current;
        struct thread_struct *t = &me->thread;
        unsigned long *bp = t->io_bitmap_ptr;

        if (bp) {
                struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

                t->io_bitmap_ptr = NULL;
                clear_thread_flag(TIF_IO_BITMAP);
                /*
                 * Careful, clear this in the TSS too:
                 */
                memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
                t->io_bitmap_max = 0;
                put_cpu();
                kfree(bp);
        }

        drop_fpu(me);
}
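/*
 * Note on the 0xff fill above: a set bit in the TSS I/O permission bitmap
 * denies access to the corresponding port, so filling the previously used
 * range with 0xff revokes whatever port access this thread had been
 * granted, e.g. via ioperm().
 */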

void flush_thread(void)
{
        struct task_struct *tsk = current;

        flush_ptrace_hw_breakpoint(tsk);
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
        drop_init_fpu(tsk);
        /*
         * Free the FPU state for non-xsave platforms. It gets reallocated
         * lazily at first use.
         */
        if (!use_eager_fpu())
                free_thread_xstate(tsk);
}

static void hard_disable_TSC(void)
{
        write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
        preempt_disable();
        if (!test_and_set_thread_flag(TIF_NOTSC))
                /*
                 * Must flip the CPU state synchronously with
                 * TIF_NOTSC in the current running context.
                 */
                hard_disable_TSC();
        preempt_enable();
}

static void hard_enable_TSC(void)
{
        write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
        preempt_disable();
        if (test_and_clear_thread_flag(TIF_NOTSC))
                /*
                 * Must flip the CPU state synchronously with
                 * TIF_NOTSC in the current running context.
                 */
                hard_enable_TSC();
        preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
        unsigned int val;

        if (test_thread_flag(TIF_NOTSC))
                val = PR_TSC_SIGSEGV;
        else
                val = PR_TSC_ENABLE;

        return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
        if (val == PR_TSC_SIGSEGV)
                disable_TSC();
        else if (val == PR_TSC_ENABLE)
                enable_TSC();
        else
                return -EINVAL;

        return 0;
}
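/*
 * get_tsc_mode()/set_tsc_mode() back the PR_GET_TSC/PR_SET_TSC prctl(2)
 * commands.  A minimal userspace sketch of turning RDTSC into a signal for
 * the calling task:
 *
 *      prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);
 *
 * With TIF_NOTSC set, CR4.TSD is enabled while this task runs, so a
 * user-mode RDTSC faults and the task receives SIGSEGV.
 */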

void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
                      struct tss_struct *tss)
{
        struct thread_struct *prev, *next;

        prev = &prev_p->thread;
        next = &next_p->thread;

        if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
            test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
                unsigned long debugctl = get_debugctlmsr();

                debugctl &= ~DEBUGCTLMSR_BTF;
                if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
                        debugctl |= DEBUGCTLMSR_BTF;

                update_debugctlmsr(debugctl);
        }

        if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
            test_tsk_thread_flag(next_p, TIF_NOTSC)) {
                /* prev and next are different */
                if (test_tsk_thread_flag(next_p, TIF_NOTSC))
                        hard_disable_TSC();
                else
                        hard_enable_TSC();
        }

        if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
                /*
                 * Copy the relevant range of the IO bitmap.
                 * Normally this is 128 bytes or less:
                 */
                memcpy(tss->io_bitmap, next->io_bitmap_ptr,
                       max(prev->io_bitmap_max, next->io_bitmap_max));
        } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
                /*
                 * Clear any possible leftover bits:
                 */
                memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
        }
        propagate_user_return_notify(prev_p, next_p);
}
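/*
 * __switch_to_xtra() above is the slow path of a context switch: it is
 * reached from __switch_to() when the outgoing or incoming task has one of
 * the "extra work" TIF flags set, and it updates the per-CPU state those
 * flags describe (DEBUGCTLMSR_BTF block stepping, CR4.TSD, the TSS I/O
 * bitmap, and user-return notifiers).
 */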

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
        BUG();
}
#endif

#ifdef CONFIG_X86_64
void enter_idle(void)
{
        this_cpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}
#endif

void arch_cpu_idle_enter(void)
{
        local_touch_nmi();
        enter_idle();
}

void arch_cpu_idle_exit(void)
{
        __exit_idle();
}

void arch_cpu_idle_dead(void)
{
        play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
        if (cpuidle_idle_call())
                x86_idle();
        else
                local_irq_enable();
}
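/*
 * cpuidle_idle_call() returns non-zero when no cpuidle driver handled the
 * idle request, in which case we fall back to the architecture default in
 * x86_idle.  Either way arch_cpu_idle() is expected to return with
 * interrupts enabled: default_idle() below uses safe_halt() (STI;HLT), and
 * the explicit local_irq_enable() covers the cpuidle path.
 */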

/*
 * We use this if we don't have any better idle routine.
 */
void default_idle(void)
{
        trace_cpu_idle_rcuidle(1, smp_processor_id());
        safe_halt();
        trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
        bool ret = !!x86_idle;

        x86_idle = default_idle;

        return ret;
}
#endif

void stop_this_cpu(void *dummy)
{
        local_irq_disable();
        /*
         * Remove this CPU:
         */
        set_cpu_online(smp_processor_id(), false);
        disable_local_APIC();

        for (;;)
                halt();
}
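/*
 * stop_this_cpu() is meant to run on a CPU that must be parked, e.g. from
 * an IPI sent by the reboot/shutdown path: it marks the CPU offline, shuts
 * down its local APIC and then halts forever with interrupts disabled.
 */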

bool amd_e400_c1e_detected;
EXPORT_SYMBOL(amd_e400_c1e_detected);

static cpumask_var_t amd_e400_c1e_mask;

void amd_e400_remove_cpu(int cpu)
{
        if (amd_e400_c1e_mask != NULL)
                cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
}

/*
 * AMD Erratum 400 aware idle routine. We check for C1E active in the
 * interrupt pending message MSR. If we detect C1E, we handle it the same
 * way as the C3 power states (local APIC timer and TSC stop).
 */
static void amd_e400_idle(void)
{
        if (!amd_e400_c1e_detected) {
                u32 lo, hi;

                rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);

                if (lo & K8_INTP_C1E_ACTIVE_MASK) {
                        amd_e400_c1e_detected = true;
                        if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
                                mark_tsc_unstable("TSC halt in AMD C1E");
                        pr_info("System has AMD C1E enabled\n");
                }
        }

        if (amd_e400_c1e_detected) {
                int cpu = smp_processor_id();

                if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
                        cpumask_set_cpu(cpu, amd_e400_c1e_mask);
                        /*
                         * Force broadcast so ACPI can not interfere.
                         */
                        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
                                           &cpu);
                        pr_info("Switch to broadcast mode on CPU%d\n", cpu);
                }
                clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

                default_idle();

                /*
                 * The switch back from broadcast mode needs to be
                 * called with interrupts disabled.
                 */
                local_irq_disable();
                clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
                local_irq_enable();
        } else
                default_idle();
}

void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
        if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
                pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
        if (x86_idle || boot_option_idle_override == IDLE_POLL)
                return;

        if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) {
                /* E400: APIC timer interrupt does not wake up CPU from C1e */
                pr_info("using AMD E400 aware idle routine\n");
                x86_idle = amd_e400_idle;
        } else
                x86_idle = default_idle;
}

void __init init_amd_e400_c1e_mask(void)
{
        /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
        if (x86_idle == amd_e400_idle)
                zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}

static int __init idle_setup(char *str)
{
        if (!str)
                return -EINVAL;

        if (!strcmp(str, "poll")) {
                pr_info("using polling idle threads\n");
                boot_option_idle_override = IDLE_POLL;
                cpu_idle_poll_ctrl(true);
        } else if (!strcmp(str, "halt")) {
                /*
                 * When the boot option idle=halt is given, halt is forced
                 * to be used for CPU idle; in that case CPU C2/C3 states
                 * won't be used again. To continue to load the CPU idle
                 * driver, don't touch boot_option_idle_override.
                 */
                x86_idle = default_idle;
                boot_option_idle_override = IDLE_HALT;
        } else if (!strcmp(str, "nomwait")) {
                /*
                 * If the boot option "idle=nomwait" is given, MWAIT is
                 * disabled for the CPU C2/C3 states, which is recorded
                 * in boot_option_idle_override below.
                 */
                boot_option_idle_override = IDLE_NOMWAIT;
        } else
                return -1;

        return 0;
}
early_param("idle", idle_setup);
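/*
 * idle_setup() above implements the "idle=" boot parameter, e.g. on the
 * kernel command line:
 *
 *      idle=poll       busy-poll instead of halting (hurts HT siblings)
 *      idle=halt       always use HLT; the cpuidle driver can still load
 *      idle=nomwait    don't use MWAIT for the C2/C3 idle states
 */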

unsigned long arch_align_stack(unsigned long sp)
{
        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
}
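/*
 * arch_align_stack() applies up to 8 KB of random downward offset to the
 * stack pointer it is given (unless randomization is disabled for the task
 * or system-wide), then aligns it down to 16 bytes.
 */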

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
        unsigned long range_end = mm->brk + 0x02000000;
        return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
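/*
 * arch_randomize_brk() picks a randomized heap start within 32 MB
 * (0x02000000 bytes) above mm->brk; if randomize_range() returns 0
 * (failure), we fall back to the unrandomized mm->brk.
 */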