linux/arch/x86/kernel/process_64.c
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *      Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *      Andi Kleen.
 *
 *      CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

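/*
 * Idle notifiers let other kernel code run callbacks whenever this CPU
 * enters or leaves the idle loop (see enter_idle()/__exit_idle() below).
 */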
void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

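/* Called by the idle loop just before idling; fires the IDLE_START notifiers. */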
void enter_idle(void)
{
        percpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
        BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
        current_thread_info()->status |= TS_POLLING;

        /*
         * If we're the non-boot CPU, nothing set the stack canary up
         * for us.  CPU0 already has it initialized but no harm in
         * doing it again.  This is a good place for updating it, as
         * we won't ever return from this function (so the invalid
         * canaries already on the stack won't ever trigger).
         */
        boot_init_stack_canary();

        /* endless idle loop with no priority at all */
        while (1) {
                tick_nohz_stop_sched_tick(1);
                while (!need_resched()) {

                        rmb();

                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        /*
                         * Idle routines should keep interrupts disabled
                         * from here on, until they go to idle.
                         * Otherwise, idle callbacks can misfire.
                         */
                        local_irq_disable();
                        enter_idle();
                        /* Don't trace irqs off for idle */
                        stop_critical_timings();
                        pm_idle();
                        start_critical_timings();

                        /*
                         * In many cases the interrupt that ended idle
                         * has already called exit_idle. But some idle
                         * loops can be woken up without interrupt.
                         */
                        __exit_idle();
                }

                tick_nohz_restart_sched_tick();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        show_regs_common();
        printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
        printk_address(regs->ip, 1);
        printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
                        regs->sp, regs->flags);
        printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->ax, regs->bx, regs->cx);
        printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->dx, regs->si, regs->di);
        printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->bp, regs->r8, regs->r9);
        printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        if (!all)
                return;

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
        cr4 = read_cr4();

        printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
                        es, cr0);
        printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
                        cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);
        printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

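/*
 * Called when a dead task's resources are finally released; by this point
 * the task must no longer own an LDT.
 */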
void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                        dead_task->comm,
                                        dead_task->mm->context.ldt,
                                        dead_task->mm->context.size);
                        BUG();
                }
        }
}

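/*
 * Install a 32-bit TLS descriptor (4 GB limit, base 'addr') in the given
 * slot of the task's TLS array.
 */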
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct desc_struct *desc = t->thread.tls_array;
        desc += tls;
        fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}

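/*
 * Set up the new child task: its kernel-stack pt_regs, stack pointers,
 * segment state, optional I/O bitmap copy and, for CLONE_SETTLS, its TLS.
 */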
int copy_thread(unsigned long clone_flags, unsigned long sp,
                unsigned long unused,
        struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->ax = 0;
        if (user_mode(regs))
                childregs->sp = sp;
        else
                childregs->sp = (unsigned long)childregs;

        p->thread.sp = (unsigned long) childregs;
        p->thread.sp0 = (unsigned long) (childregs+1);
        p->thread.usersp = me->thread.usersp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->thread.io_bitmap_ptr = NULL;

        savesegment(gs, p->thread.gsindex);
        p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
        savesegment(fs, p->thread.fsindex);
        p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);

        err = -ENOMEM;
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                                IO_BITMAP_BYTES);
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)childregs->si, 0);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }

        return err;
}

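/*
 * Common register and segment setup for starting a new user-space thread
 * at new_ip/new_sp with the given code, stack and data selectors.
 */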
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
                    unsigned long new_sp,
                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
        loadsegment(fs, 0);
        loadsegment(es, _ds);
        loadsegment(ds, _ds);
        load_gs_index(0);
        regs->ip                = new_ip;
        regs->sp                = new_sp;
        percpu_write(old_rsp, new_sp);
        regs->cs                = _cs;
        regs->ss                = _ss;
        regs->flags             = X86_EFLAGS_IF;
        set_fs(USER_DS);
        /*
         * Free the old FP and other extended state
         */
        free_thread_xstate(current);
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif

/*
 *      switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;
        bool preload_fpu;

        /*
         * If the task has used the FPU during the last 5 timeslices, just do
         * a full restore of the math state immediately to avoid the trap;
         * the chances of needing the FPU soon are obviously high now
         */
        preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

        /* we're going to use this soon, after a few expensive things */
        if (preload_fpu)
                prefetch(next->fpu.state);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        load_sp0(tss, next);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);

        /*
         * We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        savesegment(fs, fsindex);
        savesegment(gs, gsindex);

        load_TLS(next, cpu);

        /* Must be after DS reload */
        __unlazy_fpu(prev_p);

        /* Make sure cpu is ready for new context */
        if (preload_fpu)
                clts();

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_end_context_switch(next_p);

        /*
         * Switch FS and GS.
         *
         * Segment register != 0 always requires a reload.  Also
         * reload when it has changed.  When the prev process used a
         * 64bit base, always reload to avoid an information leak.
         */
        if (unlikely(fsindex | next->fsindex | prev->fs)) {
                loadsegment(fs, next->fsindex);
                /*
                 * Check if the user used a selector != 0; if yes
                 * clear the 64bit base, since the overloaded base is
                 * always mapped to the NULL selector
                 */
                if (fsindex)
                        prev->fs = 0;
        }
        /* when the next process has a 64bit base, use it */
        if (next->fs)
                wrmsrl(MSR_FS_BASE, next->fs);
        prev->fsindex = fsindex;

        if (unlikely(gsindex | next->gsindex | prev->gs)) {
                load_gs_index(next->gsindex);
                if (gsindex)
                        prev->gs = 0;
        }
        if (next->gs)
                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
        prev->gsindex = gsindex;

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->usersp = percpu_read(old_rsp);
        percpu_write(old_rsp, next->usersp);
        percpu_write(current_task, next_p);

        percpu_write(kernel_stack,
                  (unsigned long)task_stack_page(next_p) +
                  THREAD_SIZE - KERNEL_STACK_OFFSET);

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);

        /*
         * Preload the FPU context, now that we've determined that the
         * task is likely to be using it.
         */
        if (preload_fpu)
                __math_state_restore();

        return prev_p;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);

        /*
         * TBD: this overwrites the user's setup. Should have two bits.
         * But 64bit processes have always behaved this way, so it's
         * not too bad. The main problem is just that 32bit children
         * are affected again.
         */
        current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 32bit mode */
        set_thread_flag(TIF_IA32);
        current->personality |= force_personality32;

        /* Prepare the first "return" to user space */
        current_thread_info()->status |= TS_COMPAT;
}

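/*
 * Walk the saved frame pointers of a sleeping task and return the first
 * return address outside the scheduler (used for "wchan" reporting).
 */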
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, ip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.sp);
        do {
                if (fp < (unsigned long)stack ||
                    fp >= (unsigned long)stack+THREAD_SIZE)
                        return 0;
                ip = *(u64 *)(fp+8);
                if (!in_sched_functions(ip))
                        return ip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}

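/*
 * Implements the arch_prctl() operations: get or set the FS/GS base of
 * 'task'.  Small (32-bit) bases go through a GDT TLS entry, larger bases
 * through the MSRs.
 */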
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /*
                 * Handle small bases via the GDT because that's faster to
                 * switch.
                 */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /*
                 * Not strictly needed for fs, but do it for symmetry
                 * with gs.
                 */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /*
                 * Handle small bases via the GDT because that's faster to
                 * switch.
                 */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                loadsegment(fs, FS_TLS_SEL);
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /*
                                 * Set the selector to 0 to not confuse
                                 * __switch_to.
                                 */
                                loadsegment(fs, 0);
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        savesegment(gs, gsindex);
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

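/* The arch_prctl() system call entry point: operate on the current task. */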
long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}

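/*
 * Return the user stack pointer of a task: ia32 tasks keep it in pt_regs,
 * 64-bit tasks in thread.usersp.
 */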
unsigned long KSTK_ESP(struct task_struct *task)
{
        return (test_tsk_thread_flag(task, TIF_IA32)) ?
                        (task_pt_regs(task)->sp) : ((task)->thread.usersp);
}