linux/arch/x86/kernel/process_64.c
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *      Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *      Andi Kleen.
 *
 *      CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
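
/*
 * Illustrative sketch (not part of the original file): a driver that wants
 * to know when a CPU enters or leaves the idle loop can hook these
 * notifiers roughly as follows.  The callback name my_idle_notify and the
 * printouts are made up for the example; the chain is called with
 * IDLE_START from enter_idle() and IDLE_END when the CPU leaves idle.
 *
 *      static int my_idle_notify(struct notifier_block *nb,
 *                                unsigned long action, void *unused)
 *      {
 *              if (action == IDLE_START)
 *                      pr_debug("cpu %d entering idle\n", smp_processor_id());
 *              else if (action == IDLE_END)
 *                      pr_debug("cpu %d leaving idle\n", smp_processor_id());
 *              return NOTIFY_OK;
 *      }
 *
 *      static struct notifier_block my_idle_nb = {
 *              .notifier_call = my_idle_notify,
 *      };
 *
 *      idle_notifier_register(&my_idle_nb);
 *      ...
 *      idle_notifier_unregister(&my_idle_nb);
 */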

void enter_idle(void)
{
        percpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
        BUG();
}
#endif

/*
 * The idle thread.  There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
        current_thread_info()->status |= TS_POLLING;

        /*
         * If we're the non-boot CPU, nothing set the stack canary up
         * for us.  CPU0 already has it initialized but no harm in
         * doing it again.  This is a good place for updating it, as
         * we won't ever return from this function (so the invalid
         * canaries already on the stack won't ever trigger).
         */
        boot_init_stack_canary();

        /* endless idle loop with no priority at all */
        while (1) {
                tick_nohz_stop_sched_tick(1);
                while (!need_resched()) {

                        rmb();

                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        /*
                         * Idle routines should keep interrupts disabled
                         * from here on, until they go to idle.
                         * Otherwise, idle callbacks can misfire.
                         */
                        local_irq_disable();
                        enter_idle();
                        /* Don't trace irqs off for idle */
                        stop_critical_timings();
                        pm_idle();
                        start_critical_timings();

                        /*
                         * In many cases the interrupt that ended idle
                         * has already called exit_idle.  But some idle
                         * loops can be woken up without interrupt.
                         */
                        __exit_idle();
                }

                tick_nohz_restart_sched_tick();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}
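
/*
 * Illustrative sketch (not part of the original file): the simplest
 * pm_idle implementation has to honour the contract spelled out above --
 * it is entered with interrupts disabled and must either halt in a way
 * that re-enables interrupts atomically or re-enable them itself before
 * returning.  The function name my_default_idle is made up for the
 * example; safe_halt() expands to "sti; hlt":
 *
 *      static void my_default_idle(void)
 *      {
 *              if (!need_resched())
 *                      safe_halt();
 *              else
 *                      local_irq_enable();
 *      }
 */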

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        show_regs_common();
        printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
        printk_address(regs->ip, 1);
        printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
                        regs->sp, regs->flags);
        printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->ax, regs->bx, regs->cx);
        printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->dx, regs->si, regs->di);
        printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->bp, regs->r8, regs->r9);
        printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        if (!all)
                return;

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
        cr4 = read_cr4();

        printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
                        es, cr0);
        printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
                        cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);
        printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                        dead_task->comm,
                                        dead_task->mm->context.ldt,
                                        dead_task->mm->context.size);
                        BUG();
                }
        }
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct desc_struct *desc = t->thread.tls_array;
        desc += tls;
        fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        return get_desc_base(&t->thread.tls_array[tls]);
}
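
/*
 * Note (added for clarity): these two helpers are used by do_arch_prctl()
 * below.  fs/gs bases that fit in 32 bits are stored in the FS_TLS/GS_TLS
 * entries of the per-thread GDT TLS array, so switching to them only needs
 * a segment selector reload instead of a slower wrmsrl of MSR_FS_BASE or
 * MSR_KERNEL_GS_BASE.
 */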

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}

int copy_thread(unsigned long clone_flags, unsigned long sp,
                unsigned long unused,
                struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->ax = 0;
        if (user_mode(regs))
                childregs->sp = sp;
        else
                childregs->sp = (unsigned long)childregs;

        p->thread.sp = (unsigned long) childregs;
        p->thread.sp0 = (unsigned long) (childregs+1);
        p->thread.usersp = me->thread.usersp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->thread.io_bitmap_ptr = NULL;

        savesegment(gs, p->thread.gsindex);
        p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
        savesegment(fs, p->thread.fsindex);
        p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);

        err = -ENOMEM;
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                                IO_BITMAP_BYTES);
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)childregs->si, 0);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }

        return err;
}

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
                    unsigned long new_sp,
                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
        loadsegment(fs, 0);
        loadsegment(es, _ds);
        loadsegment(ds, _ds);
        load_gs_index(0);
        regs->ip                = new_ip;
        regs->sp                = new_sp;
        percpu_write(old_rsp, new_sp);
        regs->cs                = _cs;
        regs->ss                = _ss;
        regs->flags             = X86_EFLAGS_IF;
        /*
         * Free the old FP and other extended state
         */
        free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif
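
/*
 * Illustrative sketch (not part of the original file): start_thread() is
 * what a binary-format loader calls once it has set up the new mm during
 * exec.  fs/binfmt_elf.c, for example, ends load_elf_binary() with roughly
 *
 *      start_thread(regs, elf_entry, bprm->p);
 *
 * which points the user-mode pt_regs at the ELF entry point and the freshly
 * built user stack, so the first return to user space lands there.
 */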

/*
 *      switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here.  Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;
        bool preload_fpu;

        /*
         * If the task has used the FPU in the last 5 timeslices, just do a
         * full restore of the math state immediately to avoid the trap; the
         * chances of needing FPU soon are obviously high now
         */
        preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

        /* we're going to use this soon, after a few expensive things */
        if (preload_fpu)
                prefetch(next->fpu.state);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        load_sp0(tss, next);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);

        /*
         * We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        savesegment(fs, fsindex);
        savesegment(gs, gsindex);

        load_TLS(next, cpu);

        /* Must be after DS reload */
        __unlazy_fpu(prev_p);

        /* Make sure cpu is ready for new context */
        if (preload_fpu)
                clts();

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_end_context_switch(next_p);

        /*
         * Switch FS and GS.
         *
         * A segment register != 0 always requires a reload.  Also
         * reload when it has changed.  When the previous process used a
         * 64-bit base, always reload to avoid an information leak.
         */
        if (unlikely(fsindex | next->fsindex | prev->fs)) {
                loadsegment(fs, next->fsindex);
                /*
                 * Check if the user used a selector != 0; if yes,
                 * clear the 64-bit base, since an overloaded base is
                 * always mapped to the NULL selector.
                 */
                if (fsindex)
                        prev->fs = 0;
        }
        /* when the next process has a 64-bit base, use it */
        if (next->fs)
                wrmsrl(MSR_FS_BASE, next->fs);
        prev->fsindex = fsindex;

        if (unlikely(gsindex | next->gsindex | prev->gs)) {
                load_gs_index(next->gsindex);
                if (gsindex)
                        prev->gs = 0;
        }
        if (next->gs)
                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
        prev->gsindex = gsindex;

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->usersp = percpu_read(old_rsp);
        percpu_write(old_rsp, next->usersp);
        percpu_write(current_task, next_p);

        percpu_write(kernel_stack,
                  (unsigned long)task_stack_page(next_p) +
                  THREAD_SIZE - KERNEL_STACK_OFFSET);

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);

        /*
         * Preload the FPU context, now that we've determined that the
         * task is likely to be using it.
         */
        if (preload_fpu)
                __math_state_restore();

        return prev_p;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);

        /* Ensure the corresponding mm is not marked. */
        if (current->mm)
                current->mm->context.ia32_compat = 0;

        /*
         * TBD: this overwrites the user setup.  Should have two bits.
         * But 64-bit processes have always behaved this way, so it's
         * not too bad.  The main problem is just that 32-bit children
         * are affected again.
         */
        current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 32bit mode */
        set_thread_flag(TIF_IA32);
        current->personality |= force_personality32;

        /* Mark the associated mm as containing 32-bit tasks. */
        if (current->mm)
                current->mm->context.ia32_compat = 1;

        /* Prepare the first "return" to user space */
        current_thread_info()->status |= TS_COMPAT;
}

unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, ip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.sp);
        do {
                if (fp < (unsigned long)stack ||
                    fp >= (unsigned long)stack+THREAD_SIZE)
                        return 0;
                ip = *(u64 *)(fp+8);
                if (!in_sched_functions(ip))
                        return ip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /*
                 * Handle small bases via the GDT because that's faster to
                 * switch.
                 */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /*
                 * Not strictly needed for fs, but do it for symmetry
                 * with gs.
                 */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /*
                 * Handle small bases via the GDT because that's faster to
                 * switch.
                 */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                loadsegment(fs, FS_TLS_SEL);
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /*
                                 * Set the selector to 0 to not confuse
                                 * __switch_to.
                                 */
                                loadsegment(fs, 0);
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        savesegment(gs, gsindex);
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}
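
/*
 * Illustrative sketch (not part of the original file): user space reaches
 * do_arch_prctl() through the arch_prctl system call.  A program would
 * typically go through syscall(2), roughly:
 *
 *      #include <asm/prctl.h>
 *      #include <sys/syscall.h>
 *      #include <unistd.h>
 *
 *      unsigned long base;
 *      syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
 *      syscall(SYS_arch_prctl, ARCH_SET_GS, new_base);
 *
 * (new_base is a placeholder for the example.)  For ARCH_GET_FS/ARCH_GET_GS
 * the addr argument is a pointer the result is written to; for
 * ARCH_SET_FS/ARCH_SET_GS it is the new base itself, which is why the
 * handler above treats it differently per command.
 */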

unsigned long KSTK_ESP(struct task_struct *task)
{
        return (test_tsk_thread_flag(task, TIF_IA32)) ?
                        (task_pt_regs(task)->sp) : ((task)->thread.usersp);
}