linux/arch/x86/kernel/process_64.c
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *      Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *      Andi Kleen.
 *
 *      CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>

asmlinkage extern void ret_from_fork(void);

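/*
 * old_rsp tracks the current task's user stack pointer while it runs in
 * the kernel; is_idle flags whether this CPU is currently in the idle
 * loop, for the idle notifier chain below.
 */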
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

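/* Mark this CPU as idle and tell the notifier chain that idle has begun. */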
void enter_idle(void)
{
        percpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

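/*
 * End the idle period: clear is_idle and notify exactly once, even if
 * this is called again before the next enter_idle().
 */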
static void __exit_idle(void)
{
        if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
        BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
        current_thread_info()->status |= TS_POLLING;

        /*
         * If we're the non-boot CPU, nothing set the stack canary up
         * for us.  CPU0 already has it initialized but no harm in
         * doing it again.  This is a good place for updating it, as
         * we won't ever return from this function (so the invalid
         * canaries already on the stack won't ever trigger).
         */
        boot_init_stack_canary();

        /* endless idle loop with no priority at all */
        while (1) {
                tick_nohz_stop_sched_tick(1);
                while (!need_resched()) {

                        rmb();

                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        /*
                         * Idle routines should keep interrupts disabled
                         * from here on, until they go to idle.
                         * Otherwise, idle callbacks can misfire.
                         */
                        local_irq_disable();
                        enter_idle();
                        /* Don't trace irqs off for idle */
                        stop_critical_timings();
                        pm_idle();
                        start_critical_timings();
                        /*
                         * In many cases the interrupt that ended idle
                         * has already called exit_idle.  But some idle
                         * loops can be woken up without interrupt.
                         */
                        __exit_idle();
                }

                tick_nohz_restart_sched_tick();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}

/* Also prints some state that isn't saved in pt_regs. */
void __show_regs(struct pt_regs *regs, int all)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;
        const char *board;

        printk("\n");
        print_modules();
        board = dmi_get_system_info(DMI_PRODUCT_NAME);
        if (!board)
                board = "";
        printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
                current->pid, current->comm, print_tainted(),
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version, board);
        printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
        printk_address(regs->ip, 1);
        printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
                        regs->sp, regs->flags);
        printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->ax, regs->bx, regs->cx);
        printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->dx, regs->si, regs->di);
        printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->bp, regs->r8, regs->r9);
        printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        if (!all)
                return;

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
        cr4 = read_cr4();

        printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
                        es, cr0);
        printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
                        cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);
        printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
        printk(KERN_INFO "CPU %d:", smp_processor_id());
        __show_regs(regs, 1);
        show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

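/*
 * Called when a task is being released.  A process should never go away
 * with an LDT still allocated; complain loudly if one is left behind.
 */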
void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                        dead_task->comm,
                                        dead_task->mm->context.ldt,
                                        dead_task->mm->context.size);
                        BUG();
                }
        }
}

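/*
 * Install a 32-bit TLS descriptor (4GB limit, user-accessible) so a
 * small fs/gs base can be switched via the GDT instead of the MSRs.
 */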
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct desc_struct *desc = t->thread.tls_array;
        desc += tls;
        fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}

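/*
 * Set up the kernel stack and register state of a newly created child:
 * the child starts with ax == 0, inherits the parent's segments and
 * (if present) I/O bitmap, and may get a fresh TLS area via CLONE_SETTLS.
 */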
int copy_thread(unsigned long clone_flags, unsigned long sp,
                unsigned long unused,
        struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->ax = 0;
        childregs->sp = sp;
        if (sp == ~0UL)
                childregs->sp = (unsigned long)childregs;

        p->thread.sp = (unsigned long) childregs;
        p->thread.sp0 = (unsigned long) (childregs+1);
        p->thread.usersp = me->thread.usersp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;

        savesegment(gs, p->thread.gsindex);
        savesegment(fs, p->thread.fsindex);
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                                IO_BITMAP_BYTES);
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)childregs->si, 0);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }

        clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
        p->thread.ds_ctx = NULL;

        clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
        p->thread.debugctlmsr = 0;

        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }
        return err;
}

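/*
 * Reset segments and registers so the current task starts executing a
 * freshly exec'ed 64-bit program at new_ip with user stack new_sp.
 */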
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        loadsegment(fs, 0);
        loadsegment(es, 0);
        loadsegment(ds, 0);
        load_gs_index(0);
        regs->ip                = new_ip;
        regs->sp                = new_sp;
        percpu_write(old_rsp, new_sp);
        regs->cs                = __USER_CS;
        regs->ss                = __USER_DS;
        regs->flags             = 0x200;
        set_fs(USER_DS);
        /*
         * Free the old FP and other extended state
         */
        free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);

/*
 *      switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here; set the probe on schedule() instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;
        bool preload_fpu;

        /*
         * If the task has used the FPU in the last 5 timeslices, just do a
         * full restore of the math state immediately to avoid the trap; the
         * chances of needing the FPU soon are obviously high now
         */
        preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

        /* we're going to use this soon, after a few expensive things */
        if (preload_fpu)
                prefetch(next->xstate);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        load_sp0(tss, next);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);


        /*
         * We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        savesegment(fs, fsindex);
        savesegment(gs, gsindex);

        load_TLS(next, cpu);

        /* Must be after DS reload */
        unlazy_fpu(prev_p);

        /* Make sure cpu is ready for new context */
        if (preload_fpu)
                clts();

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_end_context_switch(next_p);

        /*
         * Switch FS and GS.
         *
         * A segment register != 0 always requires a reload.  Also
         * reload when it has changed.  When the previous process used a
         * 64-bit base, always reload to avoid an information leak.
         */
        if (unlikely(fsindex | next->fsindex | prev->fs)) {
                loadsegment(fs, next->fsindex);
                /*
                 * Check if the user used a selector != 0; if yes
                 * clear the 64-bit base, since the overloaded base is
                 * always mapped to the NULL selector.
                 */
                if (fsindex)
                        prev->fs = 0;
        }
        /* when the next process has a 64-bit base, use it */
        if (next->fs)
                wrmsrl(MSR_FS_BASE, next->fs);
        prev->fsindex = fsindex;

        if (unlikely(gsindex | next->gsindex | prev->gs)) {
                load_gs_index(next->gsindex);
                if (gsindex)
                        prev->gs = 0;
        }
        if (next->gs)
                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
        prev->gsindex = gsindex;

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->usersp = percpu_read(old_rsp);
        percpu_write(old_rsp, next->usersp);
        percpu_write(current_task, next_p);

        percpu_write(kernel_stack,
                  (unsigned long)task_stack_page(next_p) +
                  THREAD_SIZE - KERNEL_STACK_OFFSET);

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);

        /*
         * Preload the FPU context, now that we've determined that the
         * task is likely to be using it.
         */
        if (preload_fpu)
                __math_state_restore();
        return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
                char __user * __user *envp, struct pt_regs *regs)
{
        long error;
        char *filename;

        filename = getname(name);
        error = PTR_ERR(filename);
        if (IS_ERR(filename))
                return error;
        error = do_execve(filename, argv, envp, regs);
        putname(filename);
        return error;
}

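/*
 * Put the current task into native 64-bit mode: clear the ia32 flag and
 * drop READ_IMPLIES_EXEC from the personality.
 */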
void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);

        /*
         * TBD: this overwrites the user's setup.  Should have two bits.
         * But 64-bit processes have always behaved this way, so it's
         * not too bad.  The main problem is just that 32-bit children
         * are affected again.
         */
        current->personality &= ~READ_IMPLIES_EXEC;
}

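/*
 * If no new stack pointer was supplied, the child keeps running on the
 * parent's current stack pointer (classic fork() behaviour).
 */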
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
          void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
        if (!newsp)
                newsp = regs->sp;
        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

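/*
 * Walk the saved frame pointers of a sleeping task's kernel stack to find
 * the first return address outside the scheduler, i.e. where it is waiting.
 */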
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, ip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.sp);
        do {
                if (fp < (unsigned long)stack ||
                    fp >= (unsigned long)stack+THREAD_SIZE)
                        return 0;
                ip = *(u64 *)(fp+8);
                if (!in_sched_functions(ip))
                        return ip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}

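/*
 * arch_prctl() backend: set or read the fs/gs base of a task.  Bases that
 * fit in 32 bits are installed as GDT TLS entries (cheaper to switch);
 * larger bases go through the FS/GS base MSRs.
 */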
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /*
                 * Handle small bases via the GDT because that's faster
                 * to switch.
                 */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /*
                 * Not strictly needed for fs, but do it for symmetry
                 * with gs.
                 */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /*
                 * Handle small bases via the GDT because that's faster
                 * to switch.
                 */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                loadsegment(fs, FS_TLS_SEL);
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /*
                                 * Set the selector to 0 to not confuse
                                 * __switch_to.
                                 */
                                loadsegment(fs, 0);
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        savesegment(gs, gsindex);
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}

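/*
 * Return the task's user stack pointer: ia32 tasks keep it in pt_regs,
 * 64-bit tasks track it separately in thread.usersp.
 */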
unsigned long KSTK_ESP(struct task_struct *task)
{
        return (test_tsk_thread_flag(task, TIF_IA32)) ?
                        (task_pt_regs(task)->sp) : ((task)->thread.usersp);
}