linux/arch/x86/kernel/process_64.c
   1/*
   2 *  Copyright (C) 1995  Linus Torvalds
   3 *
   4 *  Pentium III FXSR, SSE support
   5 *      Gareth Hughes <gareth@valinux.com>, May 2000
   6 *
   7 *  X86-64 port
   8 *      Andi Kleen.
   9 *
  10 *      CPU hotplug support - ashok.raj@intel.com
  11 */
  12
  13/*
   14 * This file handles the architecture-dependent parts of process handling.
  15 */
  16
  17#include <linux/cpu.h>
  18#include <linux/errno.h>
  19#include <linux/sched.h>
  20#include <linux/sched/task.h>
  21#include <linux/sched/task_stack.h>
  22#include <linux/fs.h>
  23#include <linux/kernel.h>
  24#include <linux/mm.h>
  25#include <linux/elfcore.h>
  26#include <linux/smp.h>
  27#include <linux/slab.h>
  28#include <linux/user.h>
  29#include <linux/interrupt.h>
  30#include <linux/delay.h>
  31#include <linux/export.h>
  32#include <linux/ptrace.h>
  33#include <linux/notifier.h>
  34#include <linux/kprobes.h>
  35#include <linux/kdebug.h>
  36#include <linux/prctl.h>
  37#include <linux/uaccess.h>
  38#include <linux/io.h>
  39#include <linux/ftrace.h>
  40#include <linux/syscalls.h>
  41
  42#include <asm/pgtable.h>
  43#include <asm/processor.h>
  44#include <asm/fpu/internal.h>
  45#include <asm/mmu_context.h>
  46#include <asm/prctl.h>
  47#include <asm/desc.h>
  48#include <asm/proto.h>
  49#include <asm/ia32.h>
  50#include <asm/syscalls.h>
  51#include <asm/debugreg.h>
  52#include <asm/switch_to.h>
  53#include <asm/xen/hypervisor.h>
  54#include <asm/vdso.h>
  55#include <asm/intel_rdt_sched.h>
  56#include <asm/unistd.h>
  57#ifdef CONFIG_IA32_EMULATION
  58/* Not included via unistd.h */
  59#include <asm/unistd_32_ia32.h>
  60#endif
  61
  62__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
  63
   64/* Also prints some state that isn't saved in pt_regs */
  65void __show_regs(struct pt_regs *regs, int all)
  66{
  67        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
  68        unsigned long d0, d1, d2, d3, d6, d7;
  69        unsigned int fsindex, gsindex;
  70        unsigned int ds, cs, es;
  71
  72        show_iret_regs(regs);
  73
  74        if (regs->orig_ax != -1)
  75                pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
  76        else
  77                pr_cont("\n");
  78
  79        printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
  80               regs->ax, regs->bx, regs->cx);
  81        printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
  82               regs->dx, regs->si, regs->di);
  83        printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
  84               regs->bp, regs->r8, regs->r9);
  85        printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
  86               regs->r10, regs->r11, regs->r12);
  87        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
  88               regs->r13, regs->r14, regs->r15);
  89
  90        if (!all)
  91                return;
  92
  93        asm("movl %%ds,%0" : "=r" (ds));
  94        asm("movl %%cs,%0" : "=r" (cs));
  95        asm("movl %%es,%0" : "=r" (es));
  96        asm("movl %%fs,%0" : "=r" (fsindex));
  97        asm("movl %%gs,%0" : "=r" (gsindex));
  98
  99        rdmsrl(MSR_FS_BASE, fs);
 100        rdmsrl(MSR_GS_BASE, gs);
 101        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
 102
 103        cr0 = read_cr0();
 104        cr2 = read_cr2();
 105        cr3 = __read_cr3();
 106        cr4 = __read_cr4();
 107
 108        printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
 109               fs, fsindex, gs, gsindex, shadowgs);
 110        printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
 111                        es, cr0);
 112        printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
 113                        cr4);
 114
 115        get_debugreg(d0, 0);
 116        get_debugreg(d1, 1);
 117        get_debugreg(d2, 2);
 118        get_debugreg(d3, 3);
 119        get_debugreg(d6, 6);
 120        get_debugreg(d7, 7);
 121
 122        /* Only print out debug registers if they are in their non-default state. */
 123        if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
 124            (d6 == DR6_RESERVED) && (d7 == 0x400))) {
 125                printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
 126                       d0, d1, d2);
 127                printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
 128                       d3, d6, d7);
 129        }
 130
 131        if (boot_cpu_has(X86_FEATURE_OSPKE))
 132                printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
 133}
 134
 135void release_thread(struct task_struct *dead_task)
 136{
 137        if (dead_task->mm) {
 138#ifdef CONFIG_MODIFY_LDT_SYSCALL
 139                if (dead_task->mm->context.ldt) {
 140                        pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
 141                                dead_task->comm,
 142                                dead_task->mm->context.ldt->entries,
 143                                dead_task->mm->context.ldt->nr_entries);
 144                        BUG();
 145                }
 146#endif
 147        }
 148}
 149
 150enum which_selector {
 151        FS,
 152        GS
 153};
 154
 155/*
 156 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
 157 * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
 158 * It's forcibly inlined because it'll generate better code and this function
 159 * is hot.
 160 */
 161static __always_inline void save_base_legacy(struct task_struct *prev_p,
 162                                             unsigned short selector,
 163                                             enum which_selector which)
 164{
 165        if (likely(selector == 0)) {
 166                /*
 167                 * On Intel (without X86_BUG_NULL_SEG), the segment base could
 168                 * be the pre-existing saved base or it could be zero.  On AMD
 169                 * (with X86_BUG_NULL_SEG), the segment base could be almost
 170                 * anything.
 171                 *
 172                 * This branch is very hot (it's hit twice on almost every
 173                 * context switch between 64-bit programs), and avoiding
 174                 * the RDMSR helps a lot, so we just assume that whatever
 175                 * value is already saved is correct.  This matches historical
 176                 * Linux behavior, so it won't break existing applications.
 177                 *
 178                 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
 179                 * report that the base is zero, it needs to actually be zero:
 180                 * see the corresponding logic in load_seg_legacy.
 181                 */
 182        } else {
 183                /*
 184                 * If the selector is 1, 2, or 3, then the base is zero on
 185                 * !X86_BUG_NULL_SEG CPUs and could be anything on
 186                 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
 187                 * has never attempted to preserve the base across context
 188                 * switches.
 189                 *
 190                 * If selector > 3, then it refers to a real segment, and
 191                 * saving the base isn't necessary.
 192                 */
 193                if (which == FS)
 194                        prev_p->thread.fsbase = 0;
 195                else
 196                        prev_p->thread.gsbase = 0;
 197        }
 198}
 199
 200static __always_inline void save_fsgs(struct task_struct *task)
 201{
 202        savesegment(fs, task->thread.fsindex);
 203        savesegment(gs, task->thread.gsindex);
 204        save_base_legacy(task, task->thread.fsindex, FS);
 205        save_base_legacy(task, task->thread.gsindex, GS);
 206}
 207
 208#if IS_ENABLED(CONFIG_KVM)
 209/*
  210 * While a process is running, current->thread.fsbase and current->thread.gsbase
 211 * may not match the corresponding CPU registers (see save_base_legacy()). KVM
 212 * wants an efficient way to save and restore FSBASE and GSBASE.
 213 * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
 214 */
 215void save_fsgs_for_kvm(void)
 216{
 217        save_fsgs(current);
 218}
 219EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
 220#endif
 221
 222static __always_inline void loadseg(enum which_selector which,
 223                                    unsigned short sel)
 224{
 225        if (which == FS)
 226                loadsegment(fs, sel);
 227        else
 228                load_gs_index(sel);
 229}
 230
 231static __always_inline void load_seg_legacy(unsigned short prev_index,
 232                                            unsigned long prev_base,
 233                                            unsigned short next_index,
 234                                            unsigned long next_base,
 235                                            enum which_selector which)
 236{
 237        if (likely(next_index <= 3)) {
 238                /*
 239                 * The next task is using 64-bit TLS, is not using this
 240                 * segment at all, or is having fun with arcane CPU features.
 241                 */
 242                if (next_base == 0) {
 243                        /*
 244                         * Nasty case: on AMD CPUs, we need to forcibly zero
 245                         * the base.
 246                         */
 247                        if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
 248                                loadseg(which, __USER_DS);
 249                                loadseg(which, next_index);
 250                        } else {
 251                                /*
 252                                 * We could try to exhaustively detect cases
 253                                 * under which we can skip the segment load,
 254                                 * but there's really only one case that matters
 255                                 * for performance: if both the previous and
 256                                 * next states are fully zeroed, we can skip
 257                                 * the load.
 258                                 *
 259                                 * (This assumes that prev_base == 0 has no
 260                                 * false positives.  This is the case on
 261                                 * Intel-style CPUs.)
 262                                 */
 263                                if (likely(prev_index | next_index | prev_base))
 264                                        loadseg(which, next_index);
 265                        }
 266                } else {
 267                        if (prev_index != next_index)
 268                                loadseg(which, next_index);
 269                        wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
 270                               next_base);
 271                }
 272        } else {
 273                /*
 274                 * The next task is using a real segment.  Loading the selector
 275                 * is sufficient.
 276                 */
 277                loadseg(which, next_index);
 278        }
 279}
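/*
 * Illustrative userspace sketch of the state handled above (a hedged example,
 * not built as part of this file): arch_prctl() can set a non-zero GS base
 * while the GS selector stays 0, which is exactly the "selector == 0, base
 * may be non-zero" case that save_base_legacy()/load_seg_legacy() preserve.
 * GS is used because 64-bit glibc already owns FS for its TLS.
 */
#if 0
#include <asm/prctl.h>		/* ARCH_SET_GS, ARCH_GET_GS */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	unsigned long scratch = 0, base = 0;
	unsigned int sel;

	/* Program GSBASE through the MSR path; the GS selector remains 0. */
	syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)&scratch);
	asm volatile("movl %%gs, %0" : "=r" (sel));
	syscall(SYS_arch_prctl, ARCH_GET_GS, &base);
	printf("gs selector=%#x gsbase=%#lx (&scratch=%p)\n", sel, base, &scratch);
	return 0;
}
#endif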
 280
 281int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 282                unsigned long arg, struct task_struct *p, unsigned long tls)
 283{
 284        int err;
 285        struct pt_regs *childregs;
 286        struct fork_frame *fork_frame;
 287        struct inactive_task_frame *frame;
 288        struct task_struct *me = current;
 289
 290        childregs = task_pt_regs(p);
 291        fork_frame = container_of(childregs, struct fork_frame, regs);
 292        frame = &fork_frame->frame;
 293        frame->bp = 0;
 294        frame->ret_addr = (unsigned long) ret_from_fork;
 295        p->thread.sp = (unsigned long) fork_frame;
 296        p->thread.io_bitmap_ptr = NULL;
 297
 298        savesegment(gs, p->thread.gsindex);
 299        p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
 300        savesegment(fs, p->thread.fsindex);
 301        p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
 302        savesegment(es, p->thread.es);
 303        savesegment(ds, p->thread.ds);
 304        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 305
 306        if (unlikely(p->flags & PF_KTHREAD)) {
 307                /* kernel thread */
 308                memset(childregs, 0, sizeof(struct pt_regs));
 309                frame->bx = sp;         /* function */
 310                frame->r12 = arg;
 311                return 0;
 312        }
 313        frame->bx = 0;
 314        *childregs = *current_pt_regs();
 315
 316        childregs->ax = 0;
 317        if (sp)
 318                childregs->sp = sp;
 319
 320        err = -ENOMEM;
 321        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 322                p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
 323                                                  IO_BITMAP_BYTES, GFP_KERNEL);
 324                if (!p->thread.io_bitmap_ptr) {
 325                        p->thread.io_bitmap_max = 0;
 326                        return -ENOMEM;
 327                }
 328                set_tsk_thread_flag(p, TIF_IO_BITMAP);
 329        }
 330
 331        /*
 332         * Set a new TLS for the child thread?
 333         */
 334        if (clone_flags & CLONE_SETTLS) {
 335#ifdef CONFIG_IA32_EMULATION
 336                if (in_ia32_syscall())
 337                        err = do_set_thread_area(p, -1,
 338                                (struct user_desc __user *)tls, 0);
 339                else
 340#endif
 341                        err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
 342                if (err)
 343                        goto out;
 344        }
 345        err = 0;
 346out:
 347        if (err && p->thread.io_bitmap_ptr) {
 348                kfree(p->thread.io_bitmap_ptr);
 349                p->thread.io_bitmap_max = 0;
 350        }
 351
 352        return err;
 353}
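/*
 * Minimal userspace sketch (hedged, not built here) of the CLONE_SETTLS path
 * above: glibc's pthread_create() passes the new thread's TLS block to
 * clone() with CLONE_SETTLS, which for a 64-bit task ends up in the
 * do_arch_prctl_64(p, ARCH_SET_FS, tls) branch of copy_thread_tls().
 */
#if 0
#include <pthread.h>
#include <stdio.h>

static __thread int counter;	/* lives in the TLS block addressed by FSBASE */

static void *worker(void *arg)
{
	counter = 42;		/* per-thread copy, reached through %fs */
	printf("worker counter=%d\n", counter);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	pthread_join(t, NULL);
	return 0;
}
#endif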
 354
 355static void
 356start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 357                    unsigned long new_sp,
 358                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
 359{
 360        WARN_ON_ONCE(regs != current_pt_regs());
 361
 362        if (static_cpu_has(X86_BUG_NULL_SEG)) {
 363                /* Loading zero below won't clear the base. */
 364                loadsegment(fs, __USER_DS);
 365                load_gs_index(__USER_DS);
 366        }
 367
 368        loadsegment(fs, 0);
 369        loadsegment(es, _ds);
 370        loadsegment(ds, _ds);
 371        load_gs_index(0);
 372
 373        regs->ip                = new_ip;
 374        regs->sp                = new_sp;
 375        regs->cs                = _cs;
 376        regs->ss                = _ss;
 377        regs->flags             = X86_EFLAGS_IF;
 378        force_iret();
 379}
 380
 381void
 382start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 383{
 384        start_thread_common(regs, new_ip, new_sp,
 385                            __USER_CS, __USER_DS, 0);
 386}
 387
 388#ifdef CONFIG_COMPAT
 389void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
 390{
 391        start_thread_common(regs, new_ip, new_sp,
 392                            test_thread_flag(TIF_X32)
 393                            ? __USER_CS : __USER32_CS,
 394                            __USER_DS, __USER_DS);
 395}
 396#endif
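/*
 * Hedged userspace sketch (not built here): execve() of a 64-bit binary ends
 * with start_thread() above, which points regs->ip at the entry point chosen
 * by the ELF loader, regs->sp at the new user stack, and sets CS/SS to
 * __USER_CS/__USER_DS.
 */
#if 0
#include <unistd.h>

int main(void)
{
	char *argv[] = { "/bin/true", NULL };
	char *envp[] = { NULL };

	/* On success this never returns; the new image starts with the
	 * register state prepared by start_thread(). */
	execve("/bin/true", argv, envp);
	return 1;	/* reached only if execve() failed */
}
#endif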
 397
 398/*
 399 *      switch_to(x,y) should switch tasks from x to y.
 400 *
 401 * This could still be optimized:
 402 * - fold all the options into a flag word and test it with a single test.
 403 * - could test fs/gs bitsliced
 404 *
 405 * Kprobes not supported here. Set the probe on schedule instead.
  406 * Function graph tracer not supported either.
 407 */
 408__visible __notrace_funcgraph struct task_struct *
 409__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 410{
 411        struct thread_struct *prev = &prev_p->thread;
 412        struct thread_struct *next = &next_p->thread;
 413        struct fpu *prev_fpu = &prev->fpu;
 414        struct fpu *next_fpu = &next->fpu;
 415        int cpu = smp_processor_id();
 416        struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
 417
 418        WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
 419                     this_cpu_read(irq_count) != -1);
 420
 421        switch_fpu_prepare(prev_fpu, cpu);
 422
 423        /* We must save %fs and %gs before load_TLS() because
 424         * %fs and %gs may be cleared by load_TLS().
 425         *
 426         * (e.g. xen_load_tls())
 427         */
 428        save_fsgs(prev_p);
 429
 430        /*
 431         * Load TLS before restoring any segments so that segment loads
 432         * reference the correct GDT entries.
 433         */
 434        load_TLS(next, cpu);
 435
 436        /*
 437         * Leave lazy mode, flushing any hypercalls made here.  This
 438         * must be done after loading TLS entries in the GDT but before
  439         * loading segments that might reference them, and it must
 440         * be done before fpu__restore(), so the TS bit is up to
 441         * date.
 442         */
 443        arch_end_context_switch(next_p);
 444
 445        /* Switch DS and ES.
 446         *
 447         * Reading them only returns the selectors, but writing them (if
 448         * nonzero) loads the full descriptor from the GDT or LDT.  The
 449         * LDT for next is loaded in switch_mm, and the GDT is loaded
 450         * above.
 451         *
 452         * We therefore need to write new values to the segment
 453         * registers on every context switch unless both the new and old
 454         * values are zero.
 455         *
 456         * Note that we don't need to do anything for CS and SS, as
 457         * those are saved and restored as part of pt_regs.
 458         */
 459        savesegment(es, prev->es);
 460        if (unlikely(next->es | prev->es))
 461                loadsegment(es, next->es);
 462
 463        savesegment(ds, prev->ds);
 464        if (unlikely(next->ds | prev->ds))
 465                loadsegment(ds, next->ds);
 466
 467        load_seg_legacy(prev->fsindex, prev->fsbase,
 468                        next->fsindex, next->fsbase, FS);
 469        load_seg_legacy(prev->gsindex, prev->gsbase,
 470                        next->gsindex, next->gsbase, GS);
 471
 472        switch_fpu_finish(next_fpu, cpu);
 473
 474        /*
  475         * Switch the per-CPU current task and top-of-stack pointers.
 476         */
 477        this_cpu_write(current_task, next_p);
 478        this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
 479
 480        /* Reload sp0. */
 481        update_sp0(next_p);
 482
 483        /*
 484         * Now maybe reload the debug registers and handle I/O bitmaps
 485         */
 486        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
 487                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
 488                __switch_to_xtra(prev_p, next_p, tss);
 489
 490#ifdef CONFIG_XEN_PV
 491        /*
 492         * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
 493         * current_pt_regs()->flags may not match the current task's
 494         * intended IOPL.  We need to switch it manually.
 495         */
 496        if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
 497                     prev->iopl != next->iopl))
 498                xen_set_iopl_mask(next->iopl);
 499#endif
 500
 501        if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
 502                /*
 503                 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
 504                 * does not update the cached descriptor.  As a result, if we
 505                 * do SYSRET while SS is NULL, we'll end up in user mode with
 506                 * SS apparently equal to __USER_DS but actually unusable.
 507                 *
 508                 * The straightforward workaround would be to fix it up just
 509                 * before SYSRET, but that would slow down the system call
 510                 * fast paths.  Instead, we ensure that SS is never NULL in
 511                 * system call context.  We do this by replacing NULL SS
 512                 * selectors at every context switch.  SYSCALL sets up a valid
 513                 * SS, so the only way to get NULL is to re-enter the kernel
 514                 * from CPL 3 through an interrupt.  Since that can't happen
 515                 * in the same task as a running syscall, we are guaranteed to
 516                 * context switch between every interrupt vector entry and a
 517                 * subsequent SYSRET.
 518                 *
 519                 * We read SS first because SS reads are much faster than
 520                 * writes.  Out of caution, we force SS to __KERNEL_DS even if
 521                 * it previously had a different non-NULL value.
 522                 */
 523                unsigned short ss_sel;
 524                savesegment(ss, ss_sel);
 525                if (ss_sel != __KERNEL_DS)
 526                        loadsegment(ss, __KERNEL_DS);
 527        }
 528
 529        /* Load the Intel cache allocation PQR MSR. */
 530        intel_rdt_sched_in();
 531
 532        return prev_p;
 533}
 534
 535void set_personality_64bit(void)
 536{
 537        /* inherit personality from parent */
 538
 539        /* Make sure to be in 64bit mode */
 540        clear_thread_flag(TIF_IA32);
 541        clear_thread_flag(TIF_ADDR32);
 542        clear_thread_flag(TIF_X32);
 543        /* Pretend that this comes from a 64bit execve */
 544        task_pt_regs(current)->orig_ax = __NR_execve;
 545        current_thread_info()->status &= ~TS_COMPAT;
 546
 547        /* Ensure the corresponding mm is not marked. */
 548        if (current->mm)
 549                current->mm->context.ia32_compat = 0;
 550
 551        /* TBD: overwrites user setup. Should have two bits.
 552           But 64bit processes have always behaved this way,
 553           so it's not too bad. The main problem is just that
  554           32bit children are affected again. */
 555        current->personality &= ~READ_IMPLIES_EXEC;
 556}
 557
 558static void __set_personality_x32(void)
 559{
 560#ifdef CONFIG_X86_X32
 561        clear_thread_flag(TIF_IA32);
 562        set_thread_flag(TIF_X32);
 563        if (current->mm)
 564                current->mm->context.ia32_compat = TIF_X32;
 565        current->personality &= ~READ_IMPLIES_EXEC;
 566        /*
 567         * in_compat_syscall() uses the presence of the x32 syscall bit
 568         * flag to determine compat status.  The x86 mmap() code relies on
  569         * the syscall bitness, so set the x32 syscall bit right here to make
  570         * in_compat_syscall() work during exec().
  571         *
  572         * Pretend to come from an x32 execve.
 573         */
 574        task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
 575        current_thread_info()->status &= ~TS_COMPAT;
 576#endif
 577}
 578
 579static void __set_personality_ia32(void)
 580{
 581#ifdef CONFIG_IA32_EMULATION
 582        set_thread_flag(TIF_IA32);
 583        clear_thread_flag(TIF_X32);
 584        if (current->mm)
 585                current->mm->context.ia32_compat = TIF_IA32;
 586        current->personality |= force_personality32;
 587        /* Prepare the first "return" to user space */
 588        task_pt_regs(current)->orig_ax = __NR_ia32_execve;
 589        current_thread_info()->status |= TS_COMPAT;
 590#endif
 591}
 592
 593void set_personality_ia32(bool x32)
 594{
 595        /* Make sure to be in 32bit mode */
 596        set_thread_flag(TIF_ADDR32);
 597
 598        if (x32)
 599                __set_personality_x32();
 600        else
 601                __set_personality_ia32();
 602}
 603EXPORT_SYMBOL_GPL(set_personality_ia32);
 604
 605#ifdef CONFIG_CHECKPOINT_RESTORE
 606static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
 607{
 608        int ret;
 609
 610        ret = map_vdso_once(image, addr);
 611        if (ret)
 612                return ret;
 613
 614        return (long)image->size;
 615}
 616#endif
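/*
 * Hedged userspace sketch (not built here): the ARCH_MAP_VDSO_* arch_prctl()
 * options, serviced by prctl_map_vdso() above, exist for checkpoint/restore
 * style tools that need to map a vDSO image at a chosen address.  In an
 * ordinary process the call fails because a vDSO is already mapped; on
 * success the image size is returned, as prctl_map_vdso() shows.
 */
#if 0
#include <asm/prctl.h>		/* ARCH_MAP_VDSO_64 */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <errno.h>

int main(void)
{
	/* The second argument is an address hint for the new mapping. */
	long ret = syscall(SYS_arch_prctl, ARCH_MAP_VDSO_64, 0x700000000000UL);

	printf("ARCH_MAP_VDSO_64 -> %ld (errno %d)\n", ret, ret < 0 ? errno : 0);
	return 0;
}
#endif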
 617
 618long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 619{
 620        int ret = 0;
 621        int doit = task == current;
 622        int cpu;
 623
 624        switch (option) {
 625        case ARCH_SET_GS:
 626                if (arg2 >= TASK_SIZE_MAX)
 627                        return -EPERM;
 628                cpu = get_cpu();
 629                task->thread.gsindex = 0;
 630                task->thread.gsbase = arg2;
 631                if (doit) {
 632                        load_gs_index(0);
 633                        ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2);
 634                }
 635                put_cpu();
 636                break;
 637        case ARCH_SET_FS:
 638                /* Not strictly needed for fs, but do it for symmetry
 639                   with gs */
 640                if (arg2 >= TASK_SIZE_MAX)
 641                        return -EPERM;
 642                cpu = get_cpu();
 643                task->thread.fsindex = 0;
 644                task->thread.fsbase = arg2;
 645                if (doit) {
 646                        /* set the selector to 0 to not confuse __switch_to */
 647                        loadsegment(fs, 0);
 648                        ret = wrmsrl_safe(MSR_FS_BASE, arg2);
 649                }
 650                put_cpu();
 651                break;
 652        case ARCH_GET_FS: {
 653                unsigned long base;
 654
 655                if (doit)
 656                        rdmsrl(MSR_FS_BASE, base);
 657                else
 658                        base = task->thread.fsbase;
 659                ret = put_user(base, (unsigned long __user *)arg2);
 660                break;
 661        }
 662        case ARCH_GET_GS: {
 663                unsigned long base;
 664
 665                if (doit)
 666                        rdmsrl(MSR_KERNEL_GS_BASE, base);
 667                else
 668                        base = task->thread.gsbase;
 669                ret = put_user(base, (unsigned long __user *)arg2);
 670                break;
 671        }
 672
 673#ifdef CONFIG_CHECKPOINT_RESTORE
 674# ifdef CONFIG_X86_X32_ABI
 675        case ARCH_MAP_VDSO_X32:
 676                return prctl_map_vdso(&vdso_image_x32, arg2);
 677# endif
 678# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
 679        case ARCH_MAP_VDSO_32:
 680                return prctl_map_vdso(&vdso_image_32, arg2);
 681# endif
 682        case ARCH_MAP_VDSO_64:
 683                return prctl_map_vdso(&vdso_image_64, arg2);
 684#endif
 685
 686        default:
 687                ret = -EINVAL;
 688                break;
 689        }
 690
 691        return ret;
 692}
 693
 694SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
 695{
 696        long ret;
 697
 698        ret = do_arch_prctl_64(current, option, arg2);
 699        if (ret == -EINVAL)
 700                ret = do_arch_prctl_common(current, option, arg2);
 701
 702        return ret;
 703}
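/*
 * Hedged userspace sketch (not built here): options that do_arch_prctl_64()
 * rejects with -EINVAL fall through to do_arch_prctl_common() in the syscall
 * above (e.g. the CPUID faulting controls), while ARCH_GET_FS/ARCH_GET_GS
 * are handled directly by the 64-bit code.
 */
#if 0
#include <asm/prctl.h>		/* ARCH_GET_FS, ARCH_GET_CPUID */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	unsigned long fsbase = 0;
	long cpuid_allowed;

	/* Served directly by do_arch_prctl_64(). */
	syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase);

	/* -EINVAL from do_arch_prctl_64(), so this lands in do_arch_prctl_common(). */
	cpuid_allowed = syscall(SYS_arch_prctl, ARCH_GET_CPUID, 0);

	printf("fsbase=%#lx cpuid_allowed=%ld\n", fsbase, cpuid_allowed);
	return 0;
}
#endif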
 704
 705#ifdef CONFIG_IA32_EMULATION
 706COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
 707{
 708        return do_arch_prctl_common(current, option, arg2);
 709}
 710#endif
 711
 712unsigned long KSTK_ESP(struct task_struct *task)
 713{
 714        return task_pt_regs(task)->sp;
 715}
 716