linux/arch/x86/kernel/process_64.c
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *      Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *      Andi Kleen.
 *
 *      CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>

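/* Entry point for newly forked tasks; defined in entry_64.S. */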
asmlinkage extern void ret_from_fork(void);

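/*
 * Per-CPU slot holding the current task's user-space stack pointer,
 * saved on syscall entry and handed over in __switch_to().
 */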
DEFINE_PER_CPU(unsigned long, old_rsp);

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
        printk_address(regs->ip, 1);
        printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
                        regs->sp, regs->flags);
        printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->ax, regs->bx, regs->cx);
        printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->dx, regs->si, regs->di);
        printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->bp, regs->r8, regs->r9);
        printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        if (!all)
                return;

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
        cr4 = read_cr4();

        printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
                        es, cr0);
        printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
                        cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);

        /* Only print out debug registers if they are in their non-default state. */
        if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
            (d6 == DR6_RESERVED) && (d7 == 0x400))
                return;

        printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
        printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

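/*
 * Free any remaining architecture-specific thread state. On x86-64 this
 * only sanity-checks that the dead task did not leave an LDT behind.
 */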
void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
                                dead_task->comm,
                                dead_task->mm->context.ldt,
                                dead_task->mm->context.size);
                        BUG();
                }
        }
}

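/*
 * Install a 32-bit TLS descriptor with the given base address into the
 * task's TLS slot @tls (used for small FS/GS bases set via arch_prctl).
 */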
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct desc_struct *desc = t->thread.tls_array;
        desc += tls;
        fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        return get_desc_base(&t->thread.tls_array[tls]);
}

int copy_thread(unsigned long clone_flags, unsigned long sp,
                unsigned long arg, struct task_struct *p)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
        childregs = task_pt_regs(p);
        p->thread.sp = (unsigned long) childregs;
        p->thread.usersp = me->thread.usersp;
        set_tsk_thread_flag(p, TIF_FORK);
        p->fpu_counter = 0;
        p->thread.io_bitmap_ptr = NULL;

        savesegment(gs, p->thread.gsindex);
        p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
        savesegment(fs, p->thread.fsindex);
        p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

        if (unlikely(p->flags & PF_KTHREAD)) {
                /* kernel thread */
                memset(childregs, 0, sizeof(struct pt_regs));
                childregs->sp = (unsigned long)childregs;
                childregs->ss = __KERNEL_DS;
                childregs->bx = sp; /* function */
                childregs->bp = arg;
                childregs->orig_ax = -1;
                childregs->cs = __KERNEL_CS | get_kernel_rpl();
                childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
                return 0;
        }
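        /* User thread: start from a copy of the parent's register state. */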
        *childregs = *current_pt_regs();

        childregs->ax = 0;
        if (sp)
                childregs->sp = sp;

        err = -ENOMEM;

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
                                                  IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)childregs->si, 0);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }

        return err;
}

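/*
 * Common setup for entering user mode: reset the segment registers and
 * point the saved register state at new_ip/new_sp.
 */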
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
                    unsigned long new_sp,
                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
        loadsegment(fs, 0);
        loadsegment(es, _ds);
        loadsegment(ds, _ds);
        load_gs_index(0);
        current->thread.usersp  = new_sp;
        regs->ip                = new_ip;
        regs->sp                = new_sp;
        this_cpu_write(old_rsp, new_sp);
        regs->cs                = _cs;
        regs->ss                = _ss;
        regs->flags             = X86_EFLAGS_IF;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            test_thread_flag(TIF_X32)
                            ? __USER_CS : __USER32_CS,
                            __USER_DS, __USER_DS);
}
#endif

/*
 *      switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;
        fpu_switch_t fpu;

        fpu = switch_fpu_prepare(prev_p, next_p, cpu);

        /*
         * Reload sp0, LDT and the page table pointer:
         */
        load_sp0(tss, next);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);

        /*
         * We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        savesegment(fs, fsindex);
        savesegment(gs, gsindex);

        load_TLS(next, cpu);

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_end_context_switch(next_p);

        /*
         * Switch FS and GS.
         *
         * A segment register with a non-zero selector always requires a
         * reload; also reload when the selector has changed.  When the
         * previous task used a 64-bit base, always reload to avoid
         * leaking its base address.
         */
        if (unlikely(fsindex | next->fsindex | prev->fs)) {
                loadsegment(fs, next->fsindex);
                /*
                 * If the user loaded a selector != 0, clear the saved
                 * 64-bit base, since a 64-bit base is only valid with
                 * the null selector.
                 */
                if (fsindex)
                        prev->fs = 0;
        }
        /* When the next task has a 64-bit base, use it. */
        if (next->fs)
                wrmsrl(MSR_FS_BASE, next->fs);
        prev->fsindex = fsindex;

        if (unlikely(gsindex | next->gsindex | prev->gs)) {
                load_gs_index(next->gsindex);
                if (gsindex)
                        prev->gs = 0;
        }
        if (next->gs)
                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
        prev->gsindex = gsindex;

        switch_fpu_finish(next_p, fpu);

        /*
         * Switch the per-CPU state: user stack pointer, current task
         * and kernel stack top.
         */
        prev->usersp = this_cpu_read(old_rsp);
        this_cpu_write(old_rsp, next->usersp);
        this_cpu_write(current_task, next_p);

        this_cpu_write(kernel_stack,
                  (unsigned long)task_stack_page(next_p) +
                  THREAD_SIZE - KERNEL_STACK_OFFSET);

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);

        return prev_p;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64-bit mode */
        clear_thread_flag(TIF_IA32);
        clear_thread_flag(TIF_ADDR32);
        clear_thread_flag(TIF_X32);

        /* Ensure the corresponding mm is not marked. */
        if (current->mm)
                current->mm->context.ia32_compat = 0;

        /*
         * TBD: This overwrites the user's setup. We should have two bits.
         * But 64-bit processes have always behaved this way, so it's not
         * too bad. The main problem is that 32-bit children are affected
         * again.
         */
        current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(bool x32)
{
        /* inherit personality from parent */

        /* Make sure to be in 32-bit mode */
        set_thread_flag(TIF_ADDR32);

        /* Mark the associated mm as containing 32-bit tasks. */
        if (current->mm)
                current->mm->context.ia32_compat = 1;

        if (x32) {
                clear_thread_flag(TIF_IA32);
                set_thread_flag(TIF_X32);
                current->personality &= ~READ_IMPLIES_EXEC;
                /*
                 * is_compat_task() uses the presence of the x32 syscall
                 * bit flag to determine compat status.
                 */
                current_thread_info()->status &= ~TS_COMPAT;
        } else {
                set_thread_flag(TIF_IA32);
                clear_thread_flag(TIF_X32);
                current->personality |= force_personality32;
                /* Prepare the first "return" to user space */
                current_thread_info()->status |= TS_COMPAT;
        }
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

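/*
 * Walk the sleeping task's frame-pointer chain to find the first return
 * address outside the scheduler, i.e. where the task went to sleep.
 * Only reliable when the kernel is built with frame pointers.
 */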
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, ip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.sp < stack || p->thread.sp >= stack + THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.sp);
        do {
                if (fp < (unsigned long)stack ||
                    fp >= (unsigned long)stack + THREAD_SIZE)
                        return 0;
                ip = *(u64 *)(fp + 8);
                if (!in_sched_functions(ip))
                        return ip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /*
                 * Handle small bases via the GDT, because that is faster
                 * to switch.
                 */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /*
                 * Not strictly needed for %fs, but do it for symmetry
                 * with %gs.
                 */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /*
                 * Handle small bases via the GDT, because that is faster
                 * to switch.
                 */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                loadsegment(fs, FS_TLS_SEL);
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /*
                                 * Set the selector to 0 so as not to
                                 * confuse __switch_to().
                                 */
                                loadsegment(fs, 0);
                                ret = wrmsrl_safe(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        savesegment(gs, gsindex);
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

 546
 547long sys_arch_prctl(int code, unsigned long addr)
 548{
 549        return do_arch_prctl(current, code, addr);
 550}
 551
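/* Return the task's last known user-space stack pointer. */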
unsigned long KSTK_ESP(struct task_struct *task)
{
        return test_tsk_thread_flag(task, TIF_IA32) ?
                        task_pt_regs(task)->sp : task->thread.usersp;
}