linux/arch/x86/kernel/process_64.c
<<
>>
Prefs
   1/*
   2 *  Copyright (C) 1995  Linus Torvalds
   3 *
   4 *  Pentium III FXSR, SSE support
   5 *      Gareth Hughes <gareth@valinux.com>, May 2000
   6 *
   7 *  X86-64 port
   8 *      Andi Kleen.
   9 *
  10 *      CPU hotplug support - ashok.raj@intel.com
  11 */
  12
  13/*
  14 * This file handles the architecture-dependent parts of process handling.
  15 */
  16
  17#include <linux/cpu.h>
  18#include <linux/errno.h>
  19#include <linux/sched.h>
  20#include <linux/fs.h>
  21#include <linux/kernel.h>
  22#include <linux/mm.h>
  23#include <linux/elfcore.h>
  24#include <linux/smp.h>
  25#include <linux/slab.h>
  26#include <linux/user.h>
  27#include <linux/interrupt.h>
  28#include <linux/delay.h>
  29#include <linux/module.h>
  30#include <linux/ptrace.h>
  31#include <linux/notifier.h>
  32#include <linux/kprobes.h>
  33#include <linux/kdebug.h>
  34#include <linux/prctl.h>
  35#include <linux/uaccess.h>
  36#include <linux/io.h>
  37#include <linux/ftrace.h>
  38
  39#include <asm/pgtable.h>
  40#include <asm/processor.h>
  41#include <asm/i387.h>
  42#include <asm/fpu-internal.h>
  43#include <asm/mmu_context.h>
  44#include <asm/prctl.h>
  45#include <asm/desc.h>
  46#include <asm/proto.h>
  47#include <asm/ia32.h>
  48#include <asm/idle.h>
  49#include <asm/syscalls.h>
  50#include <asm/debugreg.h>
  51#include <asm/switch_to.h>
  52
  53asmlinkage extern void ret_from_fork(void);
  54
    /*
     * Per-CPU copy of the user stack pointer.  start_thread_common() stores the
     * new user sp here; __switch_to() saves it into prev->usersp and reloads it
     * from next->usersp on every task switch.
     */
  55DEFINE_PER_CPU(unsigned long, old_rsp);
  56
  57/* Prints also some state that isn't saved in the pt_regs */
  58void __show_regs(struct pt_regs *regs, int all)
  59{
  60        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
  61        unsigned long d0, d1, d2, d3, d6, d7;
  62        unsigned int fsindex, gsindex;
  63        unsigned int ds, cs, es;
  64
  65        printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
  66        printk_address(regs->ip, 1);
  67        printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
  68                        regs->sp, regs->flags);
  69        printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
  70               regs->ax, regs->bx, regs->cx);
  71        printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
  72               regs->dx, regs->si, regs->di);
  73        printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
  74               regs->bp, regs->r8, regs->r9);
  75        printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
  76               regs->r10, regs->r11, regs->r12);
  77        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
  78               regs->r13, regs->r14, regs->r15);
  79
  80        asm("movl %%ds,%0" : "=r" (ds));
  81        asm("movl %%cs,%0" : "=r" (cs));
  82        asm("movl %%es,%0" : "=r" (es));
  83        asm("movl %%fs,%0" : "=r" (fsindex));
  84        asm("movl %%gs,%0" : "=r" (gsindex));
  85
  86        rdmsrl(MSR_FS_BASE, fs);
  87        rdmsrl(MSR_GS_BASE, gs);
  88        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
  89
  90        if (!all)
  91                return;
  92
  93        cr0 = read_cr0();
  94        cr2 = read_cr2();
  95        cr3 = read_cr3();
  96        cr4 = read_cr4();
  97
  98        printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
  99               fs, fsindex, gs, gsindex, shadowgs);
 100        printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
 101                        es, cr0);
 102        printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
 103                        cr4);
 104
 105        get_debugreg(d0, 0);
 106        get_debugreg(d1, 1);
 107        get_debugreg(d2, 2);
 108        printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
 109        get_debugreg(d3, 3);
 110        get_debugreg(d6, 6);
 111        get_debugreg(d7, 7);
 112        printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
 113}
 114
 115void release_thread(struct task_struct *dead_task)
 116{
 117        if (dead_task->mm) {
 118                if (dead_task->mm->context.size) {
 119                        pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
 120                                dead_task->comm,
 121                                dead_task->mm->context.ldt,
 122                                dead_task->mm->context.size);
 123                        BUG();
 124                }
 125        }
 126}
 127
 128static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
 129{
 130        struct user_desc ud = {
 131                .base_addr = addr,
 132                .limit = 0xfffff,
 133                .seg_32bit = 1,
 134                .limit_in_pages = 1,
 135                .useable = 1,
 136        };
 137        struct desc_struct *desc = t->thread.tls_array;
 138        desc += tls;
 139        fill_ldt(desc, &ud);
 140}
 141
 142static inline u32 read_32bit_tls(struct task_struct *t, int tls)
 143{
 144        return get_desc_base(&t->thread.tls_array[tls]);
 145}
 146
 147int copy_thread(unsigned long clone_flags, unsigned long sp,
 148                unsigned long arg, struct task_struct *p)
 149{
 150        int err;
 151        struct pt_regs *childregs;
 152        struct task_struct *me = current;
 153
 154        p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
 155        childregs = task_pt_regs(p);
 156        p->thread.sp = (unsigned long) childregs;
 157        p->thread.usersp = me->thread.usersp;
 158        set_tsk_thread_flag(p, TIF_FORK);
 159        p->fpu_counter = 0;
 160        p->thread.io_bitmap_ptr = NULL;
 161
 162        savesegment(gs, p->thread.gsindex);
 163        p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
 164        savesegment(fs, p->thread.fsindex);
 165        p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
 166        savesegment(es, p->thread.es);
 167        savesegment(ds, p->thread.ds);
 168        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 169
 170        if (unlikely(p->flags & PF_KTHREAD)) {
 171                /* kernel thread */
 172                memset(childregs, 0, sizeof(struct pt_regs));
 173                childregs->sp = (unsigned long)childregs;
 174                childregs->ss = __KERNEL_DS;
 175                childregs->bx = sp; /* function */
 176                childregs->bp = arg;
 177                childregs->orig_ax = -1;
 178                childregs->cs = __KERNEL_CS | get_kernel_rpl();
 179                childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
 180                return 0;
 181        }
 182        *childregs = *current_pt_regs();
 183
 184        childregs->ax = 0;
 185        if (sp)
 186                childregs->sp = sp;
 187
 188        err = -ENOMEM;
 189        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 190
 191        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 192                p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
 193                                                  IO_BITMAP_BYTES, GFP_KERNEL);
 194                if (!p->thread.io_bitmap_ptr) {
 195                        p->thread.io_bitmap_max = 0;
 196                        return -ENOMEM;
 197                }
 198                set_tsk_thread_flag(p, TIF_IO_BITMAP);
 199        }
 200
 201        /*
 202         * Set a new TLS for the child thread?
 203         */
 204        if (clone_flags & CLONE_SETTLS) {
 205#ifdef CONFIG_IA32_EMULATION
 206                if (test_thread_flag(TIF_IA32))
 207                        err = do_set_thread_area(p, -1,
 208                                (struct user_desc __user *)childregs->si, 0);
 209                else
 210#endif
 211                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
 212                if (err)
 213                        goto out;
 214        }
 215        err = 0;
 216out:
 217        if (err && p->thread.io_bitmap_ptr) {
 218                kfree(p->thread.io_bitmap_ptr);
 219                p->thread.io_bitmap_max = 0;
 220        }
 221
 222        return err;
 223}
 224
 225static void
 226start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 227                    unsigned long new_sp,
 228                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
 229{
 230        loadsegment(fs, 0);
 231        loadsegment(es, _ds);
 232        loadsegment(ds, _ds);
 233        load_gs_index(0);
 234        current->thread.usersp  = new_sp;
 235        regs->ip                = new_ip;
 236        regs->sp                = new_sp;
 237        this_cpu_write(old_rsp, new_sp);
 238        regs->cs                = _cs;
 239        regs->ss                = _ss;
 240        regs->flags             = X86_EFLAGS_IF;
 241}
 242
 243void
 244start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 245{
 246        start_thread_common(regs, new_ip, new_sp,
 247                            __USER_CS, __USER_DS, 0);
 248}
 249
 250#ifdef CONFIG_IA32_EMULATION
 251void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
 252{
 253        start_thread_common(regs, new_ip, new_sp,
 254                            test_thread_flag(TIF_X32)
 255                            ? __USER_CS : __USER32_CS,
 256                            __USER_DS, __USER_DS);
 257}
 258#endif
 259
 260/*
 261 *      switch_to(x,y) should switch tasks from x to y.
 262 *
     * __switch_to() is the C part of that switch.  In order, it: prepares the
     * FPU switch, calls load_sp0() for next, saves and reloads %es and %ds,
     * snapshots %fs and %gs, loads next's TLS descriptors, calls
     * arch_end_context_switch(), reloads %fs/%gs and their 64-bit bases,
     * finishes the FPU switch, updates the per-CPU old_rsp, current_task and
     * kernel_stack variables, and finally calls __switch_to_xtra() when either
     * task has context-switch work flags set.  It returns prev_p.
     *
 263 * This could still be optimized:
 264 * - fold all the options into a flag word and test it with a single test.
 265 * - could test fs/gs bitsliced
 266 *
 267 * Kprobes not supported here. Set the probe on schedule instead.
 268 * Function graph tracer not supported either.
 269 */
 270__notrace_funcgraph struct task_struct *
 271__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 272{
 273        struct thread_struct *prev = &prev_p->thread;
 274        struct thread_struct *next = &next_p->thread;
 275        int cpu = smp_processor_id();
 276        struct tss_struct *tss = &per_cpu(init_tss, cpu);
 277        unsigned fsindex, gsindex;
 278        fpu_switch_t fpu;
 279
            /* The value returned here is consumed by switch_fpu_finish() below. */
 280        fpu = switch_fpu_prepare(prev_p, next_p, cpu);
 281
 282        /*
 283         * Hand next's thread state and this CPU's TSS to load_sp0().
 284         * Only sp0 is handled here; nothing in this function reloads the
             * LDT or the page tables.
             */
 285        load_sp0(tss, next);
 286
 287        /*
 288         * Switch DS and ES.
 289         * This won't pick up thread selector changes, but I guess that is ok.
             * The reload is skipped only when both the saved and the new
             * selector are zero.
 290         */
 291        savesegment(es, prev->es);
 292        if (unlikely(next->es | prev->es))
 293                loadsegment(es, next->es);
 294
 295        savesegment(ds, prev->ds);
 296        if (unlikely(next->ds | prev->ds))
 297                loadsegment(ds, next->ds);
 298
 299
 300        /* We must save %fs and %gs before load_TLS() because
 301         * %fs and %gs may be cleared by load_TLS().
 302         *
 303         * (e.g. xen_load_tls())
 304         */
 305        savesegment(fs, fsindex);
 306        savesegment(gs, gsindex);
 307
 308        load_TLS(next, cpu);
 309
 310        /*
 311         * Leave lazy mode, flushing any hypercalls made here.
 312         * This must be done before restoring TLS segments so
 313         * the GDT and LDT are properly updated, and must be
 314         * done before math_state_restore, so the TS bit is up
 315         * to date.
 316         */
 317        arch_end_context_switch(next_p);
 318
 319        /*
 320         * Switch FS and GS.
 321         *
 322         * Segment register != 0 always requires a reload.  Also
 323         * reload when it has changed.  When prev process used 64bit
 324         * base always reload to avoid an information leak.
 325         */
 326        if (unlikely(fsindex | next->fsindex | prev->fs)) {
 327                loadsegment(fs, next->fsindex);
 328                /*
 329                 * Check if the user used a selector != 0; if yes
 330                 *  clear 64bit base, since overloaded base is always
 331                 *  mapped to the Null selector
 332                 */
 333                if (fsindex)
 334                        prev->fs = 0;
 335        }
 336        /* when next process has a 64bit base use it */
 337        if (next->fs)
 338                wrmsrl(MSR_FS_BASE, next->fs);
            /* Remember the selector prev was actually running with. */
 339        prev->fsindex = fsindex;
 340
            /*
             * Same sequence for GS.  The 64-bit base goes to
             * MSR_KERNEL_GS_BASE, the same MSR do_arch_prctl(ARCH_SET_GS)
             * writes (presumably the user value is held there while the CPU
             * runs in the kernel; confirm against the entry code).
             */
 341        if (unlikely(gsindex | next->gsindex | prev->gs)) {
 342                load_gs_index(next->gsindex);
 343                if (gsindex)
 344                        prev->gs = 0;
 345        }
 346        if (next->gs)
 347                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
 348        prev->gsindex = gsindex;
 349
 350        switch_fpu_finish(next_p, fpu);
 351
 352        /*
 353         * Switch the per-CPU state: save the user stack pointer into prev
 354         * and load next's, then publish next_p as current_task.
             */
 355        prev->usersp = this_cpu_read(old_rsp);
 356        this_cpu_write(old_rsp, next->usersp);
 357        this_cpu_write(current_task, next_p);
 358
            /* kernel_stack sits KERNEL_STACK_OFFSET below the end of next's stack. */
 359        this_cpu_write(kernel_stack,
 360                  (unsigned long)task_stack_page(next_p) +
 361                  THREAD_SIZE - KERNEL_STACK_OFFSET);
 362
 363        /*
 364         * Now maybe reload the debug registers and handle I/O bitmaps
 365         */
 366        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
 367                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
 368                __switch_to_xtra(prev_p, next_p, tss);
 369
 370        return prev_p;
 371}
 372
 373void set_personality_64bit(void)
 374{
 375        /* inherit personality from parent */
 376
 377        /* Make sure to be in 64bit mode */
 378        clear_thread_flag(TIF_IA32);
 379        clear_thread_flag(TIF_ADDR32);
 380        clear_thread_flag(TIF_X32);
 381
 382        /* Ensure the corresponding mm is not marked. */
 383        if (current->mm)
 384                current->mm->context.ia32_compat = 0;
 385
 386        /* TBD: overwrites user setup. Should have two bits.
 387           But 64bit processes have always behaved this way,
 388           so it's not too bad. The main problem is just that
 389           32bit childs are affected again. */
 390        current->personality &= ~READ_IMPLIES_EXEC;
 391}
 392
 393void set_personality_ia32(bool x32)
 394{
 395        /* inherit personality from parent */
 396
 397        /* Make sure to be in 32bit mode */
 398        set_thread_flag(TIF_ADDR32);
 399
 400        /* Mark the associated mm as containing 32-bit tasks. */
 401        if (current->mm)
 402                current->mm->context.ia32_compat = 1;
 403
 404        if (x32) {
 405                clear_thread_flag(TIF_IA32);
 406                set_thread_flag(TIF_X32);
 407                current->personality &= ~READ_IMPLIES_EXEC;
 408                /* is_compat_task() uses the presence of the x32
 409                   syscall bit flag to determine compat status */
 410                current_thread_info()->status &= ~TS_COMPAT;
 411        } else {
 412                set_thread_flag(TIF_IA32);
 413                clear_thread_flag(TIF_X32);
 414                current->personality |= force_personality32;
 415                /* Prepare the first "return" to user space */
 416                current_thread_info()->status |= TS_COMPAT;
 417        }
 418}
 419EXPORT_SYMBOL_GPL(set_personality_ia32);
 420
 421unsigned long get_wchan(struct task_struct *p)
 422{
 423        unsigned long stack;
 424        u64 fp, ip;
 425        int count = 0;
 426
 427        if (!p || p == current || p->state == TASK_RUNNING)
 428                return 0;
 429        stack = (unsigned long)task_stack_page(p);
 430        if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
 431                return 0;
 432        fp = *(u64 *)(p->thread.sp);
 433        do {
 434                if (fp < (unsigned long)stack ||
 435                    fp >= (unsigned long)stack+THREAD_SIZE)
 436                        return 0;
 437                ip = *(u64 *)(fp+8);
 438                if (!in_sched_functions(ip))
 439                        return ip;
 440                fp = *(u64 *)fp;
 441        } while (count++ < 16);
 442        return 0;
 443}
 444
 445long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 446{
 447        int ret = 0;
 448        int doit = task == current;
 449        int cpu;
 450
 451        switch (code) {
 452        case ARCH_SET_GS:
 453                if (addr >= TASK_SIZE_OF(task))
 454                        return -EPERM;
 455                cpu = get_cpu();
 456                /* handle small bases via the GDT because that's faster to
 457                   switch. */
 458                if (addr <= 0xffffffff) {
 459                        set_32bit_tls(task, GS_TLS, addr);
 460                        if (doit) {
 461                                load_TLS(&task->thread, cpu);
 462                                load_gs_index(GS_TLS_SEL);
 463                        }
 464                        task->thread.gsindex = GS_TLS_SEL;
 465                        task->thread.gs = 0;
 466                } else {
 467                        task->thread.gsindex = 0;
 468                        task->thread.gs = addr;
 469                        if (doit) {
 470                                load_gs_index(0);
 471                                ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
 472                        }
 473                }
 474                put_cpu();
 475                break;
 476        case ARCH_SET_FS:
 477                /* Not strictly needed for fs, but do it for symmetry
 478                   with gs */
 479                if (addr >= TASK_SIZE_OF(task))
 480                        return -EPERM;
 481                cpu = get_cpu();
 482                /* handle small bases via the GDT because that's faster to
 483                   switch. */
 484                if (addr <= 0xffffffff) {
 485                        set_32bit_tls(task, FS_TLS, addr);
 486                        if (doit) {
 487                                load_TLS(&task->thread, cpu);
 488                                loadsegment(fs, FS_TLS_SEL);
 489                        }
 490                        task->thread.fsindex = FS_TLS_SEL;
 491                        task->thread.fs = 0;
 492                } else {
 493                        task->thread.fsindex = 0;
 494                        task->thread.fs = addr;
 495                        if (doit) {
 496                                /* set the selector to 0 to not confuse
 497                                   __switch_to */
 498                                loadsegment(fs, 0);
 499                                ret = wrmsrl_safe(MSR_FS_BASE, addr);
 500                        }
 501                }
 502                put_cpu();
 503                break;
 504        case ARCH_GET_FS: {
 505                unsigned long base;
 506                if (task->thread.fsindex == FS_TLS_SEL)
 507                        base = read_32bit_tls(task, FS_TLS);
 508                else if (doit)
 509                        rdmsrl(MSR_FS_BASE, base);
 510                else
 511                        base = task->thread.fs;
 512                ret = put_user(base, (unsigned long __user *)addr);
 513                break;
 514        }
 515        case ARCH_GET_GS: {
 516                unsigned long base;
 517                unsigned gsindex;
 518                if (task->thread.gsindex == GS_TLS_SEL)
 519                        base = read_32bit_tls(task, GS_TLS);
 520                else if (doit) {
 521                        savesegment(gs, gsindex);
 522                        if (gsindex)
 523                                rdmsrl(MSR_KERNEL_GS_BASE, base);
 524                        else
 525                                base = task->thread.gs;
 526                } else
 527                        base = task->thread.gs;
 528                ret = put_user(base, (unsigned long __user *)addr);
 529                break;
 530        }
 531
 532        default:
 533                ret = -EINVAL;
 534                break;
 535        }
 536
 537        return ret;
 538}
 539
 540long sys_arch_prctl(int code, unsigned long addr)
 541{
 542        return do_arch_prctl(current, code, addr);
 543}
 544
 545unsigned long KSTK_ESP(struct task_struct *task)
 546{
 547        return (test_tsk_thread_flag(task, TIF_IA32)) ?
 548                        (task_pt_regs(task)->sp) : ((task)->thread.usersp);
 549}
 550