linux/arch/x86/kernel/process_64.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  Copyright (C) 1995  Linus Torvalds
   4 *
   5 *  Pentium III FXSR, SSE support
   6 *      Gareth Hughes <gareth@valinux.com>, May 2000
   7 *
   8 *  X86-64 port
   9 *      Andi Kleen.
  10 *
  11 *      CPU hotplug support - ashok.raj@intel.com
  12 */
  13
  14/*
   15 * This file handles the architecture-dependent parts of process handling.
  16 */
  17
  18#include <linux/cpu.h>
  19#include <linux/errno.h>
  20#include <linux/sched.h>
  21#include <linux/sched/task.h>
  22#include <linux/sched/task_stack.h>
  23#include <linux/fs.h>
  24#include <linux/kernel.h>
  25#include <linux/mm.h>
  26#include <linux/elfcore.h>
  27#include <linux/smp.h>
  28#include <linux/slab.h>
  29#include <linux/user.h>
  30#include <linux/interrupt.h>
  31#include <linux/delay.h>
  32#include <linux/export.h>
  33#include <linux/ptrace.h>
  34#include <linux/notifier.h>
  35#include <linux/kprobes.h>
  36#include <linux/kdebug.h>
  37#include <linux/prctl.h>
  38#include <linux/uaccess.h>
  39#include <linux/io.h>
  40#include <linux/ftrace.h>
  41#include <linux/syscalls.h>
  42
  43#include <asm/pgtable.h>
  44#include <asm/processor.h>
  45#include <asm/fpu/internal.h>
  46#include <asm/mmu_context.h>
  47#include <asm/prctl.h>
  48#include <asm/desc.h>
  49#include <asm/proto.h>
  50#include <asm/ia32.h>
  51#include <asm/syscalls.h>
  52#include <asm/debugreg.h>
  53#include <asm/switch_to.h>
  54#include <asm/xen/hypervisor.h>
  55#include <asm/vdso.h>
  56#include <asm/resctrl_sched.h>
  57#include <asm/unistd.h>
  58#include <asm/fsgsbase.h>
  59#ifdef CONFIG_IA32_EMULATION
  60/* Not included via unistd.h */
  61#include <asm/unistd_32_ia32.h>
  62#endif
  63
  64#include "process.h"
  65
   66/* Also prints some state that isn't saved in pt_regs */
  67void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
  68{
  69        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
  70        unsigned long d0, d1, d2, d3, d6, d7;
  71        unsigned int fsindex, gsindex;
  72        unsigned int ds, es;
  73
  74        show_iret_regs(regs);
  75
  76        if (regs->orig_ax != -1)
  77                pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
  78        else
  79                pr_cont("\n");
  80
  81        printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
  82               regs->ax, regs->bx, regs->cx);
  83        printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
  84               regs->dx, regs->si, regs->di);
  85        printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
  86               regs->bp, regs->r8, regs->r9);
  87        printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
  88               regs->r10, regs->r11, regs->r12);
  89        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
  90               regs->r13, regs->r14, regs->r15);
  91
  92        if (mode == SHOW_REGS_SHORT)
  93                return;
  94
  95        if (mode == SHOW_REGS_USER) {
  96                rdmsrl(MSR_FS_BASE, fs);
  97                rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
  98                printk(KERN_DEFAULT "FS:  %016lx GS:  %016lx\n",
  99                       fs, shadowgs);
 100                return;
 101        }
 102
 103        asm("movl %%ds,%0" : "=r" (ds));
 104        asm("movl %%es,%0" : "=r" (es));
 105        asm("movl %%fs,%0" : "=r" (fsindex));
 106        asm("movl %%gs,%0" : "=r" (gsindex));
 107
 108        rdmsrl(MSR_FS_BASE, fs);
 109        rdmsrl(MSR_GS_BASE, gs);
 110        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
 111
 112        cr0 = read_cr0();
 113        cr2 = read_cr2();
 114        cr3 = __read_cr3();
 115        cr4 = __read_cr4();
 116
 117        printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
 118               fs, fsindex, gs, gsindex, shadowgs);
 119        printk(KERN_DEFAULT "CS:  %04lx DS: %04x ES: %04x CR0: %016lx\n", regs->cs, ds,
 120                        es, cr0);
 121        printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
 122                        cr4);
 123
 124        get_debugreg(d0, 0);
 125        get_debugreg(d1, 1);
 126        get_debugreg(d2, 2);
 127        get_debugreg(d3, 3);
 128        get_debugreg(d6, 6);
 129        get_debugreg(d7, 7);
 130
 131        /* Only print out debug registers if they are in their non-default state. */
 132        if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
 133            (d6 == DR6_RESERVED) && (d7 == 0x400))) {
 134                printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
 135                       d0, d1, d2);
 136                printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
 137                       d3, d6, d7);
 138        }
 139
 140        if (boot_cpu_has(X86_FEATURE_OSPKE))
 141                printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
 142}
 143
 144void release_thread(struct task_struct *dead_task)
 145{
 146        if (dead_task->mm) {
 147#ifdef CONFIG_MODIFY_LDT_SYSCALL
 148                if (dead_task->mm->context.ldt) {
 149                        pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
 150                                dead_task->comm,
 151                                dead_task->mm->context.ldt->entries,
 152                                dead_task->mm->context.ldt->nr_entries);
 153                        BUG();
 154                }
 155#endif
 156        }
 157}
 158
 159enum which_selector {
 160        FS,
 161        GS
 162};
 163
 164/*
 165 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
 166 * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
 167 * It's forcibly inlined because it'll generate better code and this function
 168 * is hot.
 169 */
 170static __always_inline void save_base_legacy(struct task_struct *prev_p,
 171                                             unsigned short selector,
 172                                             enum which_selector which)
 173{
 174        if (likely(selector == 0)) {
 175                /*
 176                 * On Intel (without X86_BUG_NULL_SEG), the segment base could
 177                 * be the pre-existing saved base or it could be zero.  On AMD
 178                 * (with X86_BUG_NULL_SEG), the segment base could be almost
 179                 * anything.
 180                 *
 181                 * This branch is very hot (it's hit twice on almost every
 182                 * context switch between 64-bit programs), and avoiding
 183                 * the RDMSR helps a lot, so we just assume that whatever
 184                 * value is already saved is correct.  This matches historical
 185                 * Linux behavior, so it won't break existing applications.
 186                 *
 187                 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
 188                 * report that the base is zero, it needs to actually be zero:
 189                 * see the corresponding logic in load_seg_legacy.
 190                 */
 191        } else {
 192                /*
 193                 * If the selector is 1, 2, or 3, then the base is zero on
 194                 * !X86_BUG_NULL_SEG CPUs and could be anything on
 195                 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
 196                 * has never attempted to preserve the base across context
 197                 * switches.
 198                 *
 199                 * If selector > 3, then it refers to a real segment, and
 200                 * saving the base isn't necessary.
 201                 */
 202                if (which == FS)
 203                        prev_p->thread.fsbase = 0;
 204                else
 205                        prev_p->thread.gsbase = 0;
 206        }
 207}
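/*
 * Editorial note on the fast path above: an ordinary 64-bit thread runs with
 * both selectors at 0 and its live base in MSR_FS_BASE / MSR_KERNEL_GS_BASE,
 * so the selector == 0 case skips the RDMSR and trusts whatever was saved
 * last.  The nonzero-selector case simply records a base of 0: for selectors
 * 1-3 the base really is 0 on !X86_BUG_NULL_SEG CPUs (and has never been
 * preserved on CPUs with that bug), while for selectors above 3 the base can
 * be recomputed from the GDT/LDT when it is actually needed.
 */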
 208
 209static __always_inline void save_fsgs(struct task_struct *task)
 210{
 211        savesegment(fs, task->thread.fsindex);
 212        savesegment(gs, task->thread.gsindex);
 213        save_base_legacy(task, task->thread.fsindex, FS);
 214        save_base_legacy(task, task->thread.gsindex, GS);
 215}
 216
 217#if IS_ENABLED(CONFIG_KVM)
 218/*
  219 * While a process is running, current->thread.fsbase and current->thread.gsbase
 220 * may not match the corresponding CPU registers (see save_base_legacy()). KVM
 221 * wants an efficient way to save and restore FSBASE and GSBASE.
 222 * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
 223 */
 224void save_fsgs_for_kvm(void)
 225{
 226        save_fsgs(current);
 227}
 228EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
 229#endif
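/*
 * Illustrative only, not taken from this file: a hypervisor module that is
 * about to clobber the host FS/GS state would be expected to call the helper
 * above first, roughly along these lines (load_guest_segment_state() is a
 * made-up name used purely for illustration):
 *
 *	save_fsgs_for_kvm();		// snapshot current->thread.{fs,gs}base
 *	load_guest_segment_state();	// hypothetical guest-entry helper
 */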
 230
 231static __always_inline void loadseg(enum which_selector which,
 232                                    unsigned short sel)
 233{
 234        if (which == FS)
 235                loadsegment(fs, sel);
 236        else
 237                load_gs_index(sel);
 238}
 239
 240static __always_inline void load_seg_legacy(unsigned short prev_index,
 241                                            unsigned long prev_base,
 242                                            unsigned short next_index,
 243                                            unsigned long next_base,
 244                                            enum which_selector which)
 245{
 246        if (likely(next_index <= 3)) {
 247                /*
 248                 * The next task is using 64-bit TLS, is not using this
 249                 * segment at all, or is having fun with arcane CPU features.
 250                 */
 251                if (next_base == 0) {
 252                        /*
 253                         * Nasty case: on AMD CPUs, we need to forcibly zero
 254                         * the base.
 255                         */
 256                        if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
 257                                loadseg(which, __USER_DS);
 258                                loadseg(which, next_index);
 259                        } else {
 260                                /*
 261                                 * We could try to exhaustively detect cases
 262                                 * under which we can skip the segment load,
 263                                 * but there's really only one case that matters
 264                                 * for performance: if both the previous and
 265                                 * next states are fully zeroed, we can skip
 266                                 * the load.
 267                                 *
 268                                 * (This assumes that prev_base == 0 has no
 269                                 * false positives.  This is the case on
 270                                 * Intel-style CPUs.)
 271                                 */
 272                                if (likely(prev_index | next_index | prev_base))
 273                                        loadseg(which, next_index);
 274                        }
 275                } else {
 276                        if (prev_index != next_index)
 277                                loadseg(which, next_index);
 278                        wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
 279                               next_base);
 280                }
 281        } else {
 282                /*
 283                 * The next task is using a real segment.  Loading the selector
 284                 * is sufficient.
 285                 */
 286                loadseg(which, next_index);
 287        }
 288}
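/*
 * A few concrete cases of the logic above, spelled out (editorial sketch
 * derived from the comments in load_seg_legacy()):
 *
 *  - prev_index == next_index == 0 and prev_base == next_base == 0: on
 *    !X86_BUG_NULL_SEG CPUs nothing is loaded at all (e.g. the GS slot of
 *    ordinary 64-bit tasks).
 *  - next_index <= 3 but next_base != 0: the selector is reloaded only if it
 *    changed, then the base is written directly with WRMSR.
 *  - next_index > 3: a real GDT/LDT segment is in use, so loading the
 *    selector alone also reloads its base from the descriptor.
 */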
 289
 290static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
 291                                              struct thread_struct *next)
 292{
 293        load_seg_legacy(prev->fsindex, prev->fsbase,
 294                        next->fsindex, next->fsbase, FS);
 295        load_seg_legacy(prev->gsindex, prev->gsbase,
 296                        next->gsindex, next->gsbase, GS);
 297}
 298
 299static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
 300                                            unsigned short selector)
 301{
 302        unsigned short idx = selector >> 3;
 303        unsigned long base;
 304
 305        if (likely((selector & SEGMENT_TI_MASK) == 0)) {
 306                if (unlikely(idx >= GDT_ENTRIES))
 307                        return 0;
 308
 309                /*
 310                 * There are no user segments in the GDT with nonzero bases
 311                 * other than the TLS segments.
 312                 */
 313                if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
 314                        return 0;
 315
 316                idx -= GDT_ENTRY_TLS_MIN;
 317                base = get_desc_base(&task->thread.tls_array[idx]);
 318        } else {
 319#ifdef CONFIG_MODIFY_LDT_SYSCALL
 320                struct ldt_struct *ldt;
 321
 322                /*
 323                 * If performance here mattered, we could protect the LDT
 324                 * with RCU.  This is a slow path, though, so we can just
 325                 * take the mutex.
 326                 */
 327                mutex_lock(&task->mm->context.lock);
 328                ldt = task->mm->context.ldt;
  329                if (unlikely(!ldt || idx >= ldt->nr_entries))
 330                        base = 0;
 331                else
 332                        base = get_desc_base(ldt->entries + idx);
 333                mutex_unlock(&task->mm->context.lock);
 334#else
 335                base = 0;
 336#endif
 337        }
 338
 339        return base;
 340}
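/*
 * For reference when reading x86_fsgsbase_read_task() above: a segment
 * selector is laid out as (index << 3) | TI | RPL, so "selector >> 3"
 * recovers the descriptor index and SEGMENT_TI_MASK (bit 2) picks the LDT
 * instead of the GDT.  Only the TLS slots in the GDT can carry a nonzero
 * user base, which is why every other GDT index yields 0 here.
 */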
 341
 342unsigned long x86_fsbase_read_task(struct task_struct *task)
 343{
 344        unsigned long fsbase;
 345
 346        if (task == current)
 347                fsbase = x86_fsbase_read_cpu();
 348        else if (task->thread.fsindex == 0)
 349                fsbase = task->thread.fsbase;
 350        else
 351                fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
 352
 353        return fsbase;
 354}
 355
 356unsigned long x86_gsbase_read_task(struct task_struct *task)
 357{
 358        unsigned long gsbase;
 359
 360        if (task == current)
 361                gsbase = x86_gsbase_read_cpu_inactive();
 362        else if (task->thread.gsindex == 0)
 363                gsbase = task->thread.gsbase;
 364        else
 365                gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
 366
 367        return gsbase;
 368}
 369
 370void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
 371{
 372        WARN_ON_ONCE(task == current);
 373
 374        task->thread.fsbase = fsbase;
 375}
 376
 377void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
 378{
 379        WARN_ON_ONCE(task == current);
 380
 381        task->thread.gsbase = gsbase;
 382}
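/*
 * Note on the two write helpers above: they only update the saved copies in
 * thread_struct and never touch the live registers, so using them on the
 * current task would leave the CPU and the saved state out of sync until the
 * next reschedule.  Writes that target the current task go through
 * do_arch_prctl_64() below, which updates the hardware state as well.
 */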
 383
 384int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 385                unsigned long arg, struct task_struct *p, unsigned long tls)
 386{
 387        int err;
 388        struct pt_regs *childregs;
 389        struct fork_frame *fork_frame;
 390        struct inactive_task_frame *frame;
 391        struct task_struct *me = current;
 392
 393        childregs = task_pt_regs(p);
 394        fork_frame = container_of(childregs, struct fork_frame, regs);
 395        frame = &fork_frame->frame;
 396
 397        frame->bp = 0;
 398        frame->ret_addr = (unsigned long) ret_from_fork;
 399        p->thread.sp = (unsigned long) fork_frame;
 400        p->thread.io_bitmap_ptr = NULL;
 401
 402        savesegment(gs, p->thread.gsindex);
 403        p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
 404        savesegment(fs, p->thread.fsindex);
 405        p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
 406        savesegment(es, p->thread.es);
 407        savesegment(ds, p->thread.ds);
 408        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 409
 410        if (unlikely(p->flags & PF_KTHREAD)) {
 411                /* kernel thread */
 412                memset(childregs, 0, sizeof(struct pt_regs));
 413                frame->bx = sp;         /* function */
 414                frame->r12 = arg;
 415                return 0;
 416        }
 417        frame->bx = 0;
 418        *childregs = *current_pt_regs();
 419
 420        childregs->ax = 0;
 421        if (sp)
 422                childregs->sp = sp;
 423
 424        err = -ENOMEM;
 425        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 426                p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
 427                                                  IO_BITMAP_BYTES, GFP_KERNEL);
 428                if (!p->thread.io_bitmap_ptr) {
 429                        p->thread.io_bitmap_max = 0;
 430                        return -ENOMEM;
 431                }
 432                set_tsk_thread_flag(p, TIF_IO_BITMAP);
 433        }
 434
 435        /*
 436         * Set a new TLS for the child thread?
 437         */
 438        if (clone_flags & CLONE_SETTLS) {
 439#ifdef CONFIG_IA32_EMULATION
 440                if (in_ia32_syscall())
 441                        err = do_set_thread_area(p, -1,
 442                                (struct user_desc __user *)tls, 0);
 443                else
 444#endif
 445                        err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
 446                if (err)
 447                        goto out;
 448        }
 449        err = 0;
 450out:
 451        if (err && p->thread.io_bitmap_ptr) {
 452                kfree(p->thread.io_bitmap_ptr);
 453                p->thread.io_bitmap_max = 0;
 454        }
 455
 456        return err;
 457}
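/*
 * Editorial illustration (userspace, not kernel code): the "tls" argument
 * handled above originates from a clone() call such as the following, with
 * the flag set simplified for illustration:
 *
 *	pid = clone(thread_fn, child_stack_top,
 *		    CLONE_VM | CLONE_SETTLS | SIGCHLD, arg,
 *		    NULL, tls_block, NULL);
 *
 * For a 64-bit caller, tls_block becomes the new thread's FS base via
 * do_arch_prctl_64(p, ARCH_SET_FS, tls); for a 32-bit caller it points to a
 * struct user_desc consumed by do_set_thread_area().
 */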
 458
 459static void
 460start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 461                    unsigned long new_sp,
 462                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
 463{
 464        WARN_ON_ONCE(regs != current_pt_regs());
 465
 466        if (static_cpu_has(X86_BUG_NULL_SEG)) {
 467                /* Loading zero below won't clear the base. */
 468                loadsegment(fs, __USER_DS);
 469                load_gs_index(__USER_DS);
 470        }
 471
 472        loadsegment(fs, 0);
 473        loadsegment(es, _ds);
 474        loadsegment(ds, _ds);
 475        load_gs_index(0);
 476
 477        regs->ip                = new_ip;
 478        regs->sp                = new_sp;
 479        regs->cs                = _cs;
 480        regs->ss                = _ss;
 481        regs->flags             = X86_EFLAGS_IF;
 482        force_iret();
 483}
 484
 485void
 486start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 487{
 488        start_thread_common(regs, new_ip, new_sp,
 489                            __USER_CS, __USER_DS, 0);
 490}
 491EXPORT_SYMBOL_GPL(start_thread);
 492
 493#ifdef CONFIG_COMPAT
 494void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
 495{
 496        start_thread_common(regs, new_ip, new_sp,
 497                            test_thread_flag(TIF_X32)
 498                            ? __USER_CS : __USER32_CS,
 499                            __USER_DS, __USER_DS);
 500}
 501#endif
 502
 503/*
 504 *      switch_to(x,y) should switch tasks from x to y.
 505 *
 506 * This could still be optimized:
 507 * - fold all the options into a flag word and test it with a single test.
 508 * - could test fs/gs bitsliced
 509 *
  510 * Kprobes are not supported here. Set the probe on schedule() instead.
  511 * The function graph tracer is not supported here either.
 512 */
 513__visible __notrace_funcgraph struct task_struct *
 514__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 515{
 516        struct thread_struct *prev = &prev_p->thread;
 517        struct thread_struct *next = &next_p->thread;
 518        struct fpu *prev_fpu = &prev->fpu;
 519        struct fpu *next_fpu = &next->fpu;
 520        int cpu = smp_processor_id();
 521
 522        WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
 523                     this_cpu_read(irq_count) != -1);
 524
 525        if (!test_thread_flag(TIF_NEED_FPU_LOAD))
 526                switch_fpu_prepare(prev_fpu, cpu);
 527
  528        /*
  529         * We must save %fs and %gs before load_TLS(), because
  530         * %fs and %gs may be cleared by load_TLS()
  531         * (e.g. by xen_load_tls()).
  532         */
 533        save_fsgs(prev_p);
 534
 535        /*
 536         * Load TLS before restoring any segments so that segment loads
 537         * reference the correct GDT entries.
 538         */
 539        load_TLS(next, cpu);
 540
 541        /*
 542         * Leave lazy mode, flushing any hypercalls made here.  This
 543         * must be done after loading TLS entries in the GDT but before
 544         * loading segments that might reference them.
 545         */
 546        arch_end_context_switch(next_p);
 547
 548        /* Switch DS and ES.
 549         *
 550         * Reading them only returns the selectors, but writing them (if
 551         * nonzero) loads the full descriptor from the GDT or LDT.  The
 552         * LDT for next is loaded in switch_mm, and the GDT is loaded
 553         * above.
 554         *
 555         * We therefore need to write new values to the segment
 556         * registers on every context switch unless both the new and old
 557         * values are zero.
 558         *
 559         * Note that we don't need to do anything for CS and SS, as
 560         * those are saved and restored as part of pt_regs.
 561         */
 562        savesegment(es, prev->es);
 563        if (unlikely(next->es | prev->es))
 564                loadsegment(es, next->es);
 565
 566        savesegment(ds, prev->ds);
 567        if (unlikely(next->ds | prev->ds))
 568                loadsegment(ds, next->ds);
 569
 570        x86_fsgsbase_load(prev, next);
 571
 572        /*
 573         * Switch the PDA and FPU contexts.
 574         */
 575        this_cpu_write(current_task, next_p);
 576        this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
 577
 578        switch_fpu_finish(next_fpu);
 579
 580        /* Reload sp0. */
 581        update_task_stack(next_p);
 582
 583        switch_to_extra(prev_p, next_p);
 584
 585#ifdef CONFIG_XEN_PV
 586        /*
 587         * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
 588         * current_pt_regs()->flags may not match the current task's
 589         * intended IOPL.  We need to switch it manually.
 590         */
 591        if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
 592                     prev->iopl != next->iopl))
 593                xen_set_iopl_mask(next->iopl);
 594#endif
 595
 596        if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
 597                /*
 598                 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
 599                 * does not update the cached descriptor.  As a result, if we
 600                 * do SYSRET while SS is NULL, we'll end up in user mode with
 601                 * SS apparently equal to __USER_DS but actually unusable.
 602                 *
 603                 * The straightforward workaround would be to fix it up just
 604                 * before SYSRET, but that would slow down the system call
 605                 * fast paths.  Instead, we ensure that SS is never NULL in
 606                 * system call context.  We do this by replacing NULL SS
 607                 * selectors at every context switch.  SYSCALL sets up a valid
 608                 * SS, so the only way to get NULL is to re-enter the kernel
 609                 * from CPL 3 through an interrupt.  Since that can't happen
 610                 * in the same task as a running syscall, we are guaranteed to
 611                 * context switch between every interrupt vector entry and a
 612                 * subsequent SYSRET.
 613                 *
 614                 * We read SS first because SS reads are much faster than
 615                 * writes.  Out of caution, we force SS to __KERNEL_DS even if
 616                 * it previously had a different non-NULL value.
 617                 */
 618                unsigned short ss_sel;
 619                savesegment(ss, ss_sel);
 620                if (ss_sel != __KERNEL_DS)
 621                        loadsegment(ss, __KERNEL_DS);
 622        }
 623
 624        /* Load the Intel cache allocation PQR MSR. */
 625        resctrl_sched_in();
 626
 627        return prev_p;
 628}
 629
 630void set_personality_64bit(void)
 631{
 632        /* inherit personality from parent */
 633
 634        /* Make sure to be in 64bit mode */
 635        clear_thread_flag(TIF_IA32);
 636        clear_thread_flag(TIF_ADDR32);
 637        clear_thread_flag(TIF_X32);
 638        /* Pretend that this comes from a 64bit execve */
 639        task_pt_regs(current)->orig_ax = __NR_execve;
 640        current_thread_info()->status &= ~TS_COMPAT;
 641
 642        /* Ensure the corresponding mm is not marked. */
 643        if (current->mm)
 644                current->mm->context.ia32_compat = 0;
 645
  646        /* TBD: this overwrites the user's setup and should really use two
  647         * bits.  But 64-bit processes have always behaved this way, so
  648         * it's not too bad.  The main problem is just that 32-bit
  649         * children are affected again. */
 650        current->personality &= ~READ_IMPLIES_EXEC;
 651}
 652
 653static void __set_personality_x32(void)
 654{
 655#ifdef CONFIG_X86_X32
 656        clear_thread_flag(TIF_IA32);
 657        set_thread_flag(TIF_X32);
 658        if (current->mm)
 659                current->mm->context.ia32_compat = TIF_X32;
 660        current->personality &= ~READ_IMPLIES_EXEC;
 661        /*
 662         * in_32bit_syscall() uses the presence of the x32 syscall bit
 663         * flag to determine compat status.  The x86 mmap() code relies on
  664         * the syscall bitness, so set the x32 syscall bit right here to make
 665         * in_32bit_syscall() work during exec().
 666         *
  667         * Pretend to come from an x32 execve.
 668         */
 669        task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
 670        current_thread_info()->status &= ~TS_COMPAT;
 671#endif
 672}
 673
 674static void __set_personality_ia32(void)
 675{
 676#ifdef CONFIG_IA32_EMULATION
 677        set_thread_flag(TIF_IA32);
 678        clear_thread_flag(TIF_X32);
 679        if (current->mm)
 680                current->mm->context.ia32_compat = TIF_IA32;
 681        current->personality |= force_personality32;
 682        /* Prepare the first "return" to user space */
 683        task_pt_regs(current)->orig_ax = __NR_ia32_execve;
 684        current_thread_info()->status |= TS_COMPAT;
 685#endif
 686}
 687
 688void set_personality_ia32(bool x32)
 689{
 690        /* Make sure to be in 32bit mode */
 691        set_thread_flag(TIF_ADDR32);
 692
 693        if (x32)
 694                __set_personality_x32();
 695        else
 696                __set_personality_ia32();
 697}
 698EXPORT_SYMBOL_GPL(set_personality_ia32);
 699
 700#ifdef CONFIG_CHECKPOINT_RESTORE
 701static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
 702{
 703        int ret;
 704
 705        ret = map_vdso_once(image, addr);
 706        if (ret)
 707                return ret;
 708
 709        return (long)image->size;
 710}
 711#endif
 712
 713long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 714{
 715        int ret = 0;
 716
 717        switch (option) {
 718        case ARCH_SET_GS: {
 719                if (unlikely(arg2 >= TASK_SIZE_MAX))
 720                        return -EPERM;
 721
 722                preempt_disable();
 723                /*
 724                 * ARCH_SET_GS has always overwritten the index
 725                 * and the base. Zero is the most sensible value
 726                 * to put in the index, and is the only value that
 727                 * makes any sense if FSGSBASE is unavailable.
 728                 */
 729                if (task == current) {
 730                        loadseg(GS, 0);
 731                        x86_gsbase_write_cpu_inactive(arg2);
 732
 733                        /*
 734                         * On non-FSGSBASE systems, save_base_legacy() expects
 735                         * that we also fill in thread.gsbase.
 736                         */
 737                        task->thread.gsbase = arg2;
 738
 739                } else {
 740                        task->thread.gsindex = 0;
 741                        x86_gsbase_write_task(task, arg2);
 742                }
 743                preempt_enable();
 744                break;
 745        }
 746        case ARCH_SET_FS: {
 747                /*
 748                 * Not strictly needed for %fs, but do it for symmetry
 749                 * with %gs
 750                 */
 751                if (unlikely(arg2 >= TASK_SIZE_MAX))
 752                        return -EPERM;
 753
 754                preempt_disable();
 755                /*
 756                 * Set the selector to 0 for the same reason
 757                 * as %gs above.
 758                 */
 759                if (task == current) {
 760                        loadseg(FS, 0);
 761                        x86_fsbase_write_cpu(arg2);
 762
 763                        /*
 764                         * On non-FSGSBASE systems, save_base_legacy() expects
 765                         * that we also fill in thread.fsbase.
 766                         */
 767                        task->thread.fsbase = arg2;
 768                } else {
 769                        task->thread.fsindex = 0;
 770                        x86_fsbase_write_task(task, arg2);
 771                }
 772                preempt_enable();
 773                break;
 774        }
 775        case ARCH_GET_FS: {
 776                unsigned long base = x86_fsbase_read_task(task);
 777
 778                ret = put_user(base, (unsigned long __user *)arg2);
 779                break;
 780        }
 781        case ARCH_GET_GS: {
 782                unsigned long base = x86_gsbase_read_task(task);
 783
 784                ret = put_user(base, (unsigned long __user *)arg2);
 785                break;
 786        }
 787
 788#ifdef CONFIG_CHECKPOINT_RESTORE
 789# ifdef CONFIG_X86_X32_ABI
 790        case ARCH_MAP_VDSO_X32:
 791                return prctl_map_vdso(&vdso_image_x32, arg2);
 792# endif
 793# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
 794        case ARCH_MAP_VDSO_32:
 795                return prctl_map_vdso(&vdso_image_32, arg2);
 796# endif
 797        case ARCH_MAP_VDSO_64:
 798                return prctl_map_vdso(&vdso_image_64, arg2);
 799#endif
 800
 801        default:
 802                ret = -EINVAL;
 803                break;
 804        }
 805
 806        return ret;
 807}
 808
 809SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
 810{
 811        long ret;
 812
 813        ret = do_arch_prctl_64(current, option, arg2);
 814        if (ret == -EINVAL)
 815                ret = do_arch_prctl_common(current, option, arg2);
 816
 817        return ret;
 818}
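/*
 * Editorial illustration (userspace, not kernel code): the 64-bit entry point
 * above backs calls such as:
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);	// read the FS base
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, new_base);	// set the GS base
 *
 * Options not recognized in do_arch_prctl_64() (the CPUID faulting controls,
 * for example) fall through to do_arch_prctl_common() via the -EINVAL check
 * above.
 */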
 819
 820#ifdef CONFIG_IA32_EMULATION
 821COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
 822{
 823        return do_arch_prctl_common(current, option, arg2);
 824}
 825#endif
 826
 827unsigned long KSTK_ESP(struct task_struct *task)
 828{
 829        return task_pt_regs(task)->sp;
 830}
 831