linux/arch/x86/kernel/process_64.c
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *      Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *      Andi Kleen.
 *
 *      CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/intel_rdt.h>

asmlinkage extern void ret_from_fork(void);

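/*
 * Per-CPU cache of the current task's user-space stack pointer.  The
 * 64-bit syscall entry path spills the user %rsp here; __switch_to()
 * and start_thread_common() below keep it in sync with thread.usersp.
 */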
DEFINE_PER_CPU(unsigned long, old_rsp);

/* Also prints some state that isn't saved in pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
        printk_address(regs->ip, 1);
        printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
                        regs->sp, regs->flags);
        printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->ax, regs->bx, regs->cx);
        printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->dx, regs->si, regs->di);
        printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->bp, regs->r8, regs->r9);
        printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        if (!all)
                return;

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
        cr4 = read_cr4();

        printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
                        es, cr0);
        printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
                        cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);
        printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

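/*
 * Called when a task's stack is about to be freed.  On x86-64 the LDT
 * is per-mm and should already be gone by the time a task is released,
 * so a surviving LDT here indicates a bug.
 */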
void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
                                dead_task->comm,
                                dead_task->mm->context.ldt,
                                dead_task->mm->context.size);
                        BUG();
                }
        }
}

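/*
 * Install a 32-bit base in one of the task's GDT TLS slots.  Bases that
 * fit in 32 bits are handled this way rather than via the FS/GS base
 * MSRs because reloading a selector from the GDT is cheaper to switch
 * than a wrmsr; see do_arch_prctl() below.
 */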
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct desc_struct *desc = t->thread.tls_array;
        desc += tls;
        fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        return get_desc_base(&t->thread.tls_array[tls]);
}

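/*
 * Set up the child's kernel stack and register state at fork/clone time.
 * Kernel threads get a crafted frame from which ret_from_fork calls the
 * function passed in 'sp' (stashed in bx) with 'arg' in bp; user threads
 * inherit the parent's pt_regs with ax cleared so the child observes a
 * zero return from fork().
 */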
int copy_thread(unsigned long clone_flags, unsigned long sp,
                unsigned long arg, struct task_struct *p)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
        childregs = task_pt_regs(p);
        p->thread.sp = (unsigned long) childregs;
        p->thread.usersp = me->thread.usersp;
        set_tsk_thread_flag(p, TIF_FORK);
        p->fpu_counter = 0;
        p->thread.io_bitmap_ptr = NULL;

        savesegment(gs, p->thread.gsindex);
        p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
        savesegment(fs, p->thread.fsindex);
        p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

        if (unlikely(p->flags & PF_KTHREAD)) {
                /* kernel thread */
                memset(childregs, 0, sizeof(struct pt_regs));
                childregs->sp = (unsigned long)childregs;
                childregs->ss = __KERNEL_DS;
                childregs->bx = sp; /* function */
                childregs->bp = arg;
                childregs->orig_ax = -1;
                childregs->cs = __KERNEL_CS | get_kernel_rpl();
                childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
                return 0;
        }
        *childregs = *current_pt_regs();

        childregs->ax = 0;
        if (sp)
                childregs->sp = sp;

        err = -ENOMEM;

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
                                                  IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)childregs->si, 0);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }

        return err;
}

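/*
 * Reset the segment registers and user register state so that the next
 * return to user space begins executing at new_ip with stack new_sp.
 * This is the tail end of exec().
 */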
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
                    unsigned long new_sp,
                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
        loadsegment(fs, 0);
        loadsegment(es, _ds);
        loadsegment(ds, _ds);
        load_gs_index(0);
        current->thread.usersp  = new_sp;
        regs->ip                = new_ip;
        regs->sp                = new_sp;
        this_cpu_write(old_rsp, new_sp);
        regs->cs                = _cs;
        regs->ss                = _ss;
        regs->flags             = X86_EFLAGS_IF;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            test_thread_flag(TIF_X32)
                            ? __USER_CS : __USER32_CS,
                            __USER_DS, __USER_DS);
}
#endif

/*
 *      switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;
        fpu_switch_t fpu;

        fpu = switch_fpu_prepare(prev_p, next_p, cpu);

        /* Reload esp0 and ss1. */
        load_sp0(tss, next);

        /* We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        savesegment(fs, fsindex);
        savesegment(gs, gsindex);

        /*
         * Load TLS before restoring any segments so that segment loads
         * reference the correct GDT entries.
         */
        load_TLS(next, cpu);

        /*
         * Leave lazy mode, flushing any hypercalls made here.  This
         * must be done after loading TLS entries in the GDT but before
         * loading segments that might reference them, and it must
         * be done before math_state_restore, so the TS bit is up to
         * date.
         */
        arch_end_context_switch(next_p);

        /* Switch DS and ES.
         *
         * Reading them only returns the selectors, but writing them (if
         * nonzero) loads the full descriptor from the GDT or LDT.  The
         * LDT for next is loaded in switch_mm, and the GDT is loaded
         * above.
         *
         * We therefore need to write new values to the segment
         * registers on every context switch unless both the new and old
         * values are zero.
         *
         * Note that we don't need to do anything for CS and SS, as
         * those are saved and restored as part of pt_regs.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);

        /*
         * Switch FS and GS.
         *
         * These are even more complicated than DS and ES: they have
         * 64-bit bases that are controlled by arch_prctl.  Those bases
         * only differ from the values in the GDT or LDT if the selector
         * is 0.
         *
         * Loading the segment register resets the hidden base part of
         * the register to 0 or the value from the GDT / LDT.  If the
         * next base address is zero, writing 0 to the segment register
         * is much faster than using wrmsr to explicitly zero the base.
         *
         * The thread_struct.fs and thread_struct.gs values are 0
         * if the fs and gs bases respectively are not overridden
         * from the values implied by fsindex and gsindex.  They
         * are nonzero, and store the nonzero base addresses, if
         * the bases are overridden.
         *
         * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
         * be impossible.
         *
         * Therefore we need to reload the segment registers if either
         * the old or new selector is nonzero, and we need to override
         * the base address if the next thread expects it to be overridden.
         *
         * This code is unnecessarily slow in the case where the old and
         * new indexes are zero and the new base is nonzero -- it will
         * unnecessarily write 0 to the selector before writing the new
         * base address.
         *
         * Note: This all depends on arch_prctl being the only way that
         * user code can override the segment base.  Once wrfsbase and
         * wrgsbase are enabled, most of this code will need to change.
         */
        if (unlikely(fsindex | next->fsindex | prev->fs)) {
                loadsegment(fs, next->fsindex);

                /*
                 * If user code wrote a nonzero value to FS, then it also
                 * cleared the overridden base address.
                 *
                 * XXX: if user code wrote 0 to FS and cleared the base
                 * address itself, we won't notice and we'll incorrectly
                 * restore the prior base address next time we reschedule
                 * the process.
                 */
                if (fsindex)
                        prev->fs = 0;
        }
        if (next->fs)
                wrmsrl(MSR_FS_BASE, next->fs);
        prev->fsindex = fsindex;

        if (unlikely(gsindex | next->gsindex | prev->gs)) {
                load_gs_index(next->gsindex);

                /* This works (and fails) the same way as fsindex above. */
                if (gsindex)
                        prev->gs = 0;
        }
        if (next->gs)
                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
        prev->gsindex = gsindex;

        switch_fpu_finish(next_p, fpu);

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->usersp = this_cpu_read(old_rsp);
        this_cpu_write(old_rsp, next->usersp);
        this_cpu_write(current_task, next_p);

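        /*
         * Point the per-CPU kernel_stack at the new task's stack so
         * the syscall entry path finds the right kernel stack on the
         * next entry from user space.
         */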
        this_cpu_write(kernel_stack,
                  (unsigned long)task_stack_page(next_p) +
                  THREAD_SIZE - KERNEL_STACK_OFFSET);
        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);

        /* Load the Intel cache allocation PQR MSR. */
        intel_rdt_sched_in();

        return prev_p;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);
        clear_thread_flag(TIF_ADDR32);
        clear_thread_flag(TIF_X32);

        /* Ensure the corresponding mm is not marked. */
        if (current->mm)
                current->mm->context.ia32_compat = 0;

        /* TBD: overwrites user setup. Should have two bits.
           But 64bit processes have always behaved this way,
           so it's not too bad. The main problem is just that
           32-bit children are affected again. */
        current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(bool x32)
{
        /* inherit personality from parent */

        /* Make sure to be in 32bit mode */
        set_thread_flag(TIF_ADDR32);

        /* Mark the associated mm as containing 32-bit tasks. */
        if (x32) {
                clear_thread_flag(TIF_IA32);
                set_thread_flag(TIF_X32);
                if (current->mm)
                        current->mm->context.ia32_compat = TIF_X32;
                current->personality &= ~READ_IMPLIES_EXEC;
                /* is_compat_task() uses the presence of the x32
                   syscall bit flag to determine compat status */
                current_thread_info()->status &= ~TS_COMPAT;
        } else {
                set_thread_flag(TIF_IA32);
                clear_thread_flag(TIF_X32);
                if (current->mm)
                        current->mm->context.ia32_compat = TIF_IA32;
                current->personality |= force_personality32;
                /* Prepare the first "return" to user space */
                current_thread_info()->status |= TS_COMPAT;
        }
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

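/*
 * Best-effort guess at where a sleeping task is blocked: walk the frame
 * pointer chain on its kernel stack until the first return address that
 * is outside the scheduler.  Only meaningful for tasks that are not
 * currently running, and relies on frame-pointer-based stack frames.
 */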
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, ip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.sp);
        do {
                if (fp < (unsigned long)stack ||
                    fp >= (unsigned long)stack+THREAD_SIZE)
                        return 0;
                ip = *(u64 *)(fp+8);
                if (!in_sched_functions(ip))
                        return ip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}

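/*
 * Implements ARCH_SET_FS/ARCH_SET_GS and the matching getters.  Bases
 * that fit in 32 bits are installed in a GDT TLS slot via
 * set_32bit_tls(); larger bases go through MSR_FS_BASE or
 * MSR_KERNEL_GS_BASE, with the selector cleared so __switch_to() knows
 * the base is overridden.
 */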
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                loadsegment(fs, FS_TLS_SEL);
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
                                loadsegment(fs, 0);
                                ret = wrmsrl_safe(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        savesegment(gs, gsindex);
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}

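/*
 * Report the task's user stack pointer.  Compat (IA-32) tasks always
 * save it in pt_regs on kernel entry; 64-bit tasks keep the last known
 * value in thread.usersp, which may be stale while the task is running.
 */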
unsigned long KSTK_ESP(struct task_struct *task)
{
        return (test_tsk_thread_flag(task, TIF_IA32)) ?
                        (task_pt_regs(task)->sp) : ((task)->thread.usersp);
}