linux/arch/x86/kernel/process_64.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *      Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *      Andi Kleen.
 *
 *      CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/syscalls.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/fpu/internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
#include <asm/resctrl_sched.h>
#include <asm/unistd.h>
#include <asm/fsgsbase.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
#include <asm/unistd_32_ia32.h>
#endif

#include "process.h"

/* Also prints some state that isn't saved in pt_regs. */
void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, es;

        show_iret_regs(regs);

        if (regs->orig_ax != -1)
                pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
        else
                pr_cont("\n");

        printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->ax, regs->bx, regs->cx);
        printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->dx, regs->si, regs->di);
        printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->bp, regs->r8, regs->r9);
        printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        if (mode == SHOW_REGS_SHORT)
                return;

        if (mode == SHOW_REGS_USER) {
                rdmsrl(MSR_FS_BASE, fs);
                rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
                printk(KERN_DEFAULT "FS:  %016lx GS:  %016lx\n",
                       fs, shadowgs);
                return;
        }

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = __read_cr3();
        cr4 = __read_cr4();

        printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk(KERN_DEFAULT "CS:  %04lx DS: %04x ES: %04x CR0: %016lx\n", regs->cs, ds,
                        es, cr0);
        printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
                        cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);

        /* Only print out debug registers if they are in their non-default state. */
        if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
            (d6 == DR6_RESERVED) && (d7 == 0x400))) {
                printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
                       d0, d1, d2);
                printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
                       d3, d6, d7);
        }

        if (boot_cpu_has(X86_FEATURE_OSPKE))
                printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
}

void release_thread(struct task_struct *dead_task)
{
        WARN_ON(dead_task->mm);
}

enum which_selector {
        FS,
        GS
};

/*
 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
 * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
 * It's forcibly inlined because it'll generate better code and this function
 * is hot.
 */
static __always_inline void save_base_legacy(struct task_struct *prev_p,
                                             unsigned short selector,
                                             enum which_selector which)
{
        if (likely(selector == 0)) {
                /*
                 * On Intel (without X86_BUG_NULL_SEG), the segment base could
                 * be the pre-existing saved base or it could be zero.  On AMD
                 * (with X86_BUG_NULL_SEG), the segment base could be almost
                 * anything.
                 *
                 * This branch is very hot (it's hit twice on almost every
                 * context switch between 64-bit programs), and avoiding
                 * the RDMSR helps a lot, so we just assume that whatever
                 * value is already saved is correct.  This matches historical
                 * Linux behavior, so it won't break existing applications.
                 *
                 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
                 * report that the base is zero, it needs to actually be zero:
                 * see the corresponding logic in load_seg_legacy.
                 */
        } else {
                /*
                 * If the selector is 1, 2, or 3, then the base is zero on
                 * !X86_BUG_NULL_SEG CPUs and could be anything on
                 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
                 * has never attempted to preserve the base across context
                 * switches.
                 *
                 * If selector > 3, then it refers to a real segment, and
                 * saving the base isn't necessary.
                 */
                if (which == FS)
                        prev_p->thread.fsbase = 0;
                else
                        prev_p->thread.gsbase = 0;
        }
}

static __always_inline void save_fsgs(struct task_struct *task)
{
        savesegment(fs, task->thread.fsindex);
        savesegment(gs, task->thread.gsindex);
        save_base_legacy(task, task->thread.fsindex, FS);
        save_base_legacy(task, task->thread.gsindex, GS);
}

#if IS_ENABLED(CONFIG_KVM)
/*
 * While a process is running, current->thread.fsbase and current->thread.gsbase
 * may not match the corresponding CPU registers (see save_base_legacy()). KVM
 * wants an efficient way to save and restore FSBASE and GSBASE.
 * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
 */
void save_fsgs_for_kvm(void)
{
        save_fsgs(current);
}
EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
#endif

static __always_inline void loadseg(enum which_selector which,
                                    unsigned short sel)
{
        if (which == FS)
                loadsegment(fs, sel);
        else
                load_gs_index(sel);
}

static __always_inline void load_seg_legacy(unsigned short prev_index,
                                            unsigned long prev_base,
                                            unsigned short next_index,
                                            unsigned long next_base,
                                            enum which_selector which)
{
        if (likely(next_index <= 3)) {
                /*
                 * The next task is using 64-bit TLS, is not using this
                 * segment at all, or is having fun with arcane CPU features.
                 */
                if (next_base == 0) {
                        /*
                         * Nasty case: on AMD CPUs, we need to forcibly zero
                         * the base.
                         */
                        if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
                                loadseg(which, __USER_DS);
                                loadseg(which, next_index);
                        } else {
                                /*
                                 * We could try to exhaustively detect cases
                                 * under which we can skip the segment load,
                                 * but there's really only one case that matters
                                 * for performance: if both the previous and
                                 * next states are fully zeroed, we can skip
                                 * the load.
                                 *
                                 * (This assumes that prev_base == 0 has no
                                 * false positives.  This is the case on
                                 * Intel-style CPUs.)
                                 */
                                if (likely(prev_index | next_index | prev_base))
                                        loadseg(which, next_index);
                        }
                } else {
                        if (prev_index != next_index)
                                loadseg(which, next_index);
                        wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
                               next_base);
                }
        } else {
                /*
                 * The next task is using a real segment.  Loading the selector
                 * is sufficient.
                 */
                loadseg(which, next_index);
        }
}
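
/*
 * Worked example for load_seg_legacy(), using hypothetical values: switching
 * between two ordinary 64-bit tasks that both have fsindex == 0 and
 * fsbase == 0 skips the FS load entirely, because
 * prev_index | next_index | prev_base is zero.  Switching to a task with
 * fsindex == 0 but a nonzero fsbase (e.g. a glibc TLS pointer such as
 * 0x7f0000001000) takes the wrmsrl() path, while a task using a real LDT/TLS
 * selector (next_index > 3) only needs the selector reload.
 */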

static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
                                              struct thread_struct *next)
{
        load_seg_legacy(prev->fsindex, prev->fsbase,
                        next->fsindex, next->fsbase, FS);
        load_seg_legacy(prev->gsindex, prev->gsbase,
                        next->gsindex, next->gsbase, GS);
}

static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
                                            unsigned short selector)
{
        unsigned short idx = selector >> 3;
        unsigned long base;

        if (likely((selector & SEGMENT_TI_MASK) == 0)) {
                if (unlikely(idx >= GDT_ENTRIES))
                        return 0;

                /*
                 * There are no user segments in the GDT with nonzero bases
                 * other than the TLS segments.
                 */
                if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
                        return 0;

                idx -= GDT_ENTRY_TLS_MIN;
                base = get_desc_base(&task->thread.tls_array[idx]);
        } else {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
                struct ldt_struct *ldt;

                /*
                 * If performance here mattered, we could protect the LDT
                 * with RCU.  This is a slow path, though, so we can just
                 * take the mutex.
                 */
                mutex_lock(&task->mm->context.lock);
                ldt = task->mm->context.ldt;
                if (unlikely(idx >= ldt->nr_entries))
                        base = 0;
                else
                        base = get_desc_base(ldt->entries + idx);
                mutex_unlock(&task->mm->context.lock);
#else
                base = 0;
#endif
        }

        return base;
}
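
/*
 * For reference: the value returned above is the 32-bit base scattered across
 * the descriptor's base0/base1/base2 fields; get_desc_base() (asm/desc_defs.h)
 * assembles it roughly as
 *
 *	base = desc->base0 | (desc->base1 << 16) | (desc->base2 << 24);
 *
 * (an illustrative sketch of that helper, not a definition used here).
 * Because a descriptor can only hold a 32-bit base, nonzero 64-bit bases have
 * to come from MSR_FS_BASE/MSR_KERNEL_GS_BASE rather than the GDT or LDT.
 */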

unsigned long x86_fsbase_read_task(struct task_struct *task)
{
        unsigned long fsbase;

        if (task == current)
                fsbase = x86_fsbase_read_cpu();
        else if (task->thread.fsindex == 0)
                fsbase = task->thread.fsbase;
        else
                fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);

        return fsbase;
}

unsigned long x86_gsbase_read_task(struct task_struct *task)
{
        unsigned long gsbase;

        if (task == current)
                gsbase = x86_gsbase_read_cpu_inactive();
        else if (task->thread.gsindex == 0)
                gsbase = task->thread.gsbase;
        else
                gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);

        return gsbase;
}

void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
{
        WARN_ON_ONCE(task == current);

        task->thread.fsbase = fsbase;
}

void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
{
        WARN_ON_ONCE(task == current);

        task->thread.gsbase = gsbase;
}
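
/*
 * Illustrative user-space view of the task accessors above (a hedged sketch,
 * not kernel code): a ptrace() tracer reading a stopped tracee's FS/GS base
 * with PTRACE_GETREGS ends up in x86_fsbase_read_task() /
 * x86_gsbase_read_task() for a non-current task.  Error handling is omitted
 * for brevity.
 *
 *	#include <sys/ptrace.h>
 *	#include <sys/types.h>
 *	#include <sys/user.h>
 *	#include <sys/wait.h>
 *	#include <stdio.h>
 *
 *	static void dump_bases(pid_t pid)
 *	{
 *		struct user_regs_struct regs;
 *
 *		waitpid(pid, NULL, 0);			// tracee must be stopped
 *		ptrace(PTRACE_GETREGS, pid, NULL, &regs);
 *		printf("fs_base=%llx gs_base=%llx\n", regs.fs_base, regs.gs_base);
 *	}
 */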

int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
                unsigned long arg, struct task_struct *p, unsigned long tls)
{
        int err;
        struct pt_regs *childregs;
        struct fork_frame *fork_frame;
        struct inactive_task_frame *frame;
        struct task_struct *me = current;

        childregs = task_pt_regs(p);
        fork_frame = container_of(childregs, struct fork_frame, regs);
        frame = &fork_frame->frame;

        frame->bp = 0;
        frame->ret_addr = (unsigned long) ret_from_fork;
        p->thread.sp = (unsigned long) fork_frame;
        p->thread.io_bitmap_ptr = NULL;

        savesegment(gs, p->thread.gsindex);
        p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
        savesegment(fs, p->thread.fsindex);
        p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

        if (unlikely(p->flags & PF_KTHREAD)) {
                /* kernel thread */
                memset(childregs, 0, sizeof(struct pt_regs));
                frame->bx = sp;         /* function */
                frame->r12 = arg;
                return 0;
        }
        frame->bx = 0;
        *childregs = *current_pt_regs();

        childregs->ax = 0;
        if (sp)
                childregs->sp = sp;

        err = -ENOMEM;
        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
                                                  IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (in_ia32_syscall())
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)tls, 0);
                else
#endif
                        err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }

        return err;
}

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
                    unsigned long new_sp,
                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
        WARN_ON_ONCE(regs != current_pt_regs());

        if (static_cpu_has(X86_BUG_NULL_SEG)) {
                /* Loading zero below won't clear the base. */
                loadsegment(fs, __USER_DS);
                load_gs_index(__USER_DS);
        }

        loadsegment(fs, 0);
        loadsegment(es, _ds);
        loadsegment(ds, _ds);
        load_gs_index(0);

        regs->ip                = new_ip;
        regs->sp                = new_sp;
        regs->cs                = _cs;
        regs->ss                = _ss;
        regs->flags             = X86_EFLAGS_IF;
        force_iret();
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER_CS, __USER_DS, 0);
}
EXPORT_SYMBOL_GPL(start_thread);
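
/*
 * start_thread() is what a binary-format loader calls once the new image is
 * mapped.  Roughly, as a simplified sketch of the ELF loader's side (not code
 * from this file):
 *
 *	struct pt_regs *regs = current_pt_regs();
 *	...
 *	start_thread(regs, elf_entry, bprm->p);	// new RIP and user RSP
 *
 * The flat __USER_CS/__USER_DS segments set up above then take effect when
 * this register frame is returned to user mode.
 */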

#ifdef CONFIG_COMPAT
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            test_thread_flag(TIF_X32)
                            ? __USER_CS : __USER32_CS,
                            __USER_DS, __USER_DS);
}
#endif

/*
 *      switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer is not supported either.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        struct fpu *prev_fpu = &prev->fpu;
        struct fpu *next_fpu = &next->fpu;
        int cpu = smp_processor_id();

        WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
                     this_cpu_read(irq_count) != -1);

        if (!test_thread_flag(TIF_NEED_FPU_LOAD))
                switch_fpu_prepare(prev_fpu, cpu);

        /* We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        save_fsgs(prev_p);

        /*
         * Load TLS before restoring any segments so that segment loads
         * reference the correct GDT entries.
         */
        load_TLS(next, cpu);

        /*
         * Leave lazy mode, flushing any hypercalls made here.  This
         * must be done after loading TLS entries in the GDT but before
         * loading segments that might reference them.
         */
        arch_end_context_switch(next_p);

        /* Switch DS and ES.
         *
         * Reading them only returns the selectors, but writing them (if
         * nonzero) loads the full descriptor from the GDT or LDT.  The
         * LDT for next is loaded in switch_mm, and the GDT is loaded
         * above.
         *
         * We therefore need to write new values to the segment
         * registers on every context switch unless both the new and old
         * values are zero.
         *
         * Note that we don't need to do anything for CS and SS, as
         * those are saved and restored as part of pt_regs.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);

        x86_fsgsbase_load(prev, next);

        /*
         * Switch the PDA and FPU contexts.
         */
        this_cpu_write(current_task, next_p);
        this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));

        switch_fpu_finish(next_fpu);

        /* Reload sp0. */
        update_task_stack(next_p);

        switch_to_extra(prev_p, next_p);

#ifdef CONFIG_XEN_PV
        /*
         * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
         * current_pt_regs()->flags may not match the current task's
         * intended IOPL.  We need to switch it manually.
         */
        if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
                     prev->iopl != next->iopl))
                xen_set_iopl_mask(next->iopl);
#endif

        if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
                /*
                 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
                 * does not update the cached descriptor.  As a result, if we
                 * do SYSRET while SS is NULL, we'll end up in user mode with
                 * SS apparently equal to __USER_DS but actually unusable.
                 *
                 * The straightforward workaround would be to fix it up just
                 * before SYSRET, but that would slow down the system call
                 * fast paths.  Instead, we ensure that SS is never NULL in
                 * system call context.  We do this by replacing NULL SS
                 * selectors at every context switch.  SYSCALL sets up a valid
                 * SS, so the only way to get NULL is to re-enter the kernel
                 * from CPL 3 through an interrupt.  Since that can't happen
                 * in the same task as a running syscall, we are guaranteed to
                 * context switch between every interrupt vector entry and a
                 * subsequent SYSRET.
                 *
                 * We read SS first because SS reads are much faster than
                 * writes.  Out of caution, we force SS to __KERNEL_DS even if
                 * it previously had a different non-NULL value.
                 */
                unsigned short ss_sel;
                savesegment(ss, ss_sel);
                if (ss_sel != __KERNEL_DS)
                        loadsegment(ss, __KERNEL_DS);
        }

        /* Load the Intel cache allocation PQR MSR. */
        resctrl_sched_in();

        return prev_p;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);
        clear_thread_flag(TIF_ADDR32);
        clear_thread_flag(TIF_X32);
        /* Pretend that this comes from a 64bit execve */
        task_pt_regs(current)->orig_ax = __NR_execve;
        current_thread_info()->status &= ~TS_COMPAT;

        /* Ensure the corresponding mm is not marked. */
        if (current->mm)
                current->mm->context.ia32_compat = 0;

        /* TBD: overwrites user setup. Should have two bits.
           But 64bit processes have always behaved this way,
           so it's not too bad. The main problem is just that
           32bit children are affected again. */
        current->personality &= ~READ_IMPLIES_EXEC;
}

static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32
        clear_thread_flag(TIF_IA32);
        set_thread_flag(TIF_X32);
        if (current->mm)
                current->mm->context.ia32_compat = TIF_X32;
        current->personality &= ~READ_IMPLIES_EXEC;
        /*
         * in_32bit_syscall() uses the presence of the x32 syscall bit
         * flag to determine compat status.  The x86 mmap() code relies on
         * the syscall bitness, so set the x32 syscall bit right here to make
         * in_32bit_syscall() work during exec().
         *
         * Pretend to come from an x32 execve.
         */
        task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
        current_thread_info()->status &= ~TS_COMPAT;
#endif
}

static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
        set_thread_flag(TIF_IA32);
        clear_thread_flag(TIF_X32);
        if (current->mm)
                current->mm->context.ia32_compat = TIF_IA32;
        current->personality |= force_personality32;
        /* Prepare the first "return" to user space */
        task_pt_regs(current)->orig_ax = __NR_ia32_execve;
        current_thread_info()->status |= TS_COMPAT;
#endif
}

void set_personality_ia32(bool x32)
{
        /* Make sure to be in 32bit mode */
        set_thread_flag(TIF_ADDR32);

        if (x32)
                __set_personality_x32();
        else
                __set_personality_ia32();
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

#ifdef CONFIG_CHECKPOINT_RESTORE
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
        int ret;

        ret = map_vdso_once(image, addr);
        if (ret)
                return ret;

        return (long)image->size;
}
#endif

long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
        int ret = 0;

        switch (option) {
        case ARCH_SET_GS: {
                if (unlikely(arg2 >= TASK_SIZE_MAX))
                        return -EPERM;

                preempt_disable();
                /*
                 * ARCH_SET_GS has always overwritten the index
                 * and the base. Zero is the most sensible value
                 * to put in the index, and is the only value that
                 * makes any sense if FSGSBASE is unavailable.
                 */
                if (task == current) {
                        loadseg(GS, 0);
                        x86_gsbase_write_cpu_inactive(arg2);

                        /*
                         * On non-FSGSBASE systems, save_base_legacy() expects
                         * that we also fill in thread.gsbase.
                         */
                        task->thread.gsbase = arg2;

                } else {
                        task->thread.gsindex = 0;
                        x86_gsbase_write_task(task, arg2);
                }
                preempt_enable();
                break;
        }
        case ARCH_SET_FS: {
                /*
                 * Not strictly needed for %fs, but do it for symmetry
                 * with %gs
                 */
                if (unlikely(arg2 >= TASK_SIZE_MAX))
                        return -EPERM;

                preempt_disable();
                /*
                 * Set the selector to 0 for the same reason
                 * as %gs above.
                 */
                if (task == current) {
                        loadseg(FS, 0);
                        x86_fsbase_write_cpu(arg2);

                        /*
                         * On non-FSGSBASE systems, save_base_legacy() expects
                         * that we also fill in thread.fsbase.
                         */
                        task->thread.fsbase = arg2;
                } else {
                        task->thread.fsindex = 0;
                        x86_fsbase_write_task(task, arg2);
                }
                preempt_enable();
                break;
        }
        case ARCH_GET_FS: {
                unsigned long base = x86_fsbase_read_task(task);

                ret = put_user(base, (unsigned long __user *)arg2);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base = x86_gsbase_read_task(task);

                ret = put_user(base, (unsigned long __user *)arg2);
                break;
        }

#ifdef CONFIG_CHECKPOINT_RESTORE
# ifdef CONFIG_X86_X32_ABI
        case ARCH_MAP_VDSO_X32:
                return prctl_map_vdso(&vdso_image_x32, arg2);
# endif
# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
        case ARCH_MAP_VDSO_32:
                return prctl_map_vdso(&vdso_image_32, arg2);
# endif
        case ARCH_MAP_VDSO_64:
                return prctl_map_vdso(&vdso_image_64, arg2);
#endif

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
        long ret;

        ret = do_arch_prctl_64(current, option, arg2);
        if (ret == -EINVAL)
                ret = do_arch_prctl_common(current, option, arg2);

        return ret;
}
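
/*
 * Illustrative user-space usage of this syscall (a hedged sketch, not kernel
 * code): glibc does not provide an arch_prctl() wrapper, so callers typically
 * go through syscall(2) with the ARCH_* constants from <asm/prctl.h>.
 *
 *	#define _GNU_SOURCE
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long fsbase;
 *
 *		if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase))
 *			return 1;
 *		printf("FS base: %#lx\n", fsbase);	// glibc's TLS block
 *		return 0;
 *	}
 *
 * ARCH_SET_FS/ARCH_SET_GS take the new base directly in arg2, as handled by
 * do_arch_prctl_64() above; ARCH_GET_FS/ARCH_GET_GS store it through the user
 * pointer passed in arg2.
 */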

#ifdef CONFIG_IA32_EMULATION
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
        return do_arch_prctl_common(current, option, arg2);
}
#endif

unsigned long KSTK_ESP(struct task_struct *task)
{
        return task_pt_regs(task)->sp;
}