linux/arch/x86/kernel/process_64.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  Copyright (C) 1995  Linus Torvalds
   4 *
   5 *  Pentium III FXSR, SSE support
   6 *      Gareth Hughes <gareth@valinux.com>, May 2000
   7 *
   8 *  X86-64 port
   9 *      Andi Kleen.
  10 *
  11 *      CPU hotplug support - ashok.raj@intel.com
  12 */
  13
  14/*
   15 * This file handles the architecture-dependent parts of process handling.
  16 */
  17
  18#include <linux/cpu.h>
  19#include <linux/errno.h>
  20#include <linux/sched.h>
  21#include <linux/sched/task.h>
  22#include <linux/sched/task_stack.h>
  23#include <linux/fs.h>
  24#include <linux/kernel.h>
  25#include <linux/mm.h>
  26#include <linux/elfcore.h>
  27#include <linux/smp.h>
  28#include <linux/slab.h>
  29#include <linux/user.h>
  30#include <linux/interrupt.h>
  31#include <linux/delay.h>
  32#include <linux/export.h>
  33#include <linux/ptrace.h>
  34#include <linux/notifier.h>
  35#include <linux/kprobes.h>
  36#include <linux/kdebug.h>
  37#include <linux/prctl.h>
  38#include <linux/uaccess.h>
  39#include <linux/io.h>
  40#include <linux/ftrace.h>
  41#include <linux/syscalls.h>
  42
  43#include <asm/pgtable.h>
  44#include <asm/processor.h>
  45#include <asm/fpu/internal.h>
  46#include <asm/mmu_context.h>
  47#include <asm/prctl.h>
  48#include <asm/desc.h>
  49#include <asm/proto.h>
  50#include <asm/ia32.h>
  51#include <asm/debugreg.h>
  52#include <asm/switch_to.h>
  53#include <asm/xen/hypervisor.h>
  54#include <asm/vdso.h>
  55#include <asm/resctrl_sched.h>
  56#include <asm/unistd.h>
  57#include <asm/fsgsbase.h>
  58#ifdef CONFIG_IA32_EMULATION
  59/* Not included via unistd.h */
  60#include <asm/unistd_32_ia32.h>
  61#endif
  62
  63#include "process.h"
  64
   65/* Also prints some state that isn't saved in pt_regs. */
  66void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
  67{
  68        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
  69        unsigned long d0, d1, d2, d3, d6, d7;
  70        unsigned int fsindex, gsindex;
  71        unsigned int ds, es;
  72
  73        show_iret_regs(regs);
  74
  75        if (regs->orig_ax != -1)
  76                pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
  77        else
  78                pr_cont("\n");
  79
  80        printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
  81               regs->ax, regs->bx, regs->cx);
  82        printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
  83               regs->dx, regs->si, regs->di);
  84        printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
  85               regs->bp, regs->r8, regs->r9);
  86        printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
  87               regs->r10, regs->r11, regs->r12);
  88        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
  89               regs->r13, regs->r14, regs->r15);
  90
  91        if (mode == SHOW_REGS_SHORT)
  92                return;
  93
  94        if (mode == SHOW_REGS_USER) {
  95                rdmsrl(MSR_FS_BASE, fs);
  96                rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
  97                printk(KERN_DEFAULT "FS:  %016lx GS:  %016lx\n",
  98                       fs, shadowgs);
  99                return;
 100        }
 101
 102        asm("movl %%ds,%0" : "=r" (ds));
 103        asm("movl %%es,%0" : "=r" (es));
 104        asm("movl %%fs,%0" : "=r" (fsindex));
 105        asm("movl %%gs,%0" : "=r" (gsindex));
 106
 107        rdmsrl(MSR_FS_BASE, fs);
 108        rdmsrl(MSR_GS_BASE, gs);
 109        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
 110
 111        cr0 = read_cr0();
 112        cr2 = read_cr2();
 113        cr3 = __read_cr3();
 114        cr4 = __read_cr4();
 115
 116        printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
 117               fs, fsindex, gs, gsindex, shadowgs);
 118        printk(KERN_DEFAULT "CS:  %04lx DS: %04x ES: %04x CR0: %016lx\n", regs->cs, ds,
 119                        es, cr0);
 120        printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
 121                        cr4);
 122
 123        get_debugreg(d0, 0);
 124        get_debugreg(d1, 1);
 125        get_debugreg(d2, 2);
 126        get_debugreg(d3, 3);
 127        get_debugreg(d6, 6);
 128        get_debugreg(d7, 7);
 129
 130        /* Only print out debug registers if they are in their non-default state. */
 131        if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
 132            (d6 == DR6_RESERVED) && (d7 == 0x400))) {
 133                printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
 134                       d0, d1, d2);
 135                printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
 136                       d3, d6, d7);
 137        }
 138
 139        if (boot_cpu_has(X86_FEATURE_OSPKE))
 140                printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
 141}
 142
 143void release_thread(struct task_struct *dead_task)
 144{
 145        WARN_ON(dead_task->mm);
 146}
 147
 148enum which_selector {
 149        FS,
 150        GS
 151};
 152
 153/*
 154 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
 155 * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
 156 * It's forcibly inlined because it'll generate better code and this function
 157 * is hot.
 158 */
 159static __always_inline void save_base_legacy(struct task_struct *prev_p,
 160                                             unsigned short selector,
 161                                             enum which_selector which)
 162{
 163        if (likely(selector == 0)) {
 164                /*
 165                 * On Intel (without X86_BUG_NULL_SEG), the segment base could
 166                 * be the pre-existing saved base or it could be zero.  On AMD
 167                 * (with X86_BUG_NULL_SEG), the segment base could be almost
 168                 * anything.
 169                 *
 170                 * This branch is very hot (it's hit twice on almost every
 171                 * context switch between 64-bit programs), and avoiding
 172                 * the RDMSR helps a lot, so we just assume that whatever
 173                 * value is already saved is correct.  This matches historical
 174                 * Linux behavior, so it won't break existing applications.
 175                 *
 176                 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
 177                 * report that the base is zero, it needs to actually be zero:
 178                 * see the corresponding logic in load_seg_legacy.
 179                 */
 180        } else {
 181                /*
 182                 * If the selector is 1, 2, or 3, then the base is zero on
 183                 * !X86_BUG_NULL_SEG CPUs and could be anything on
 184                 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
 185                 * has never attempted to preserve the base across context
 186                 * switches.
 187                 *
 188                 * If selector > 3, then it refers to a real segment, and
 189                 * saving the base isn't necessary.
 190                 */
 191                if (which == FS)
 192                        prev_p->thread.fsbase = 0;
 193                else
 194                        prev_p->thread.gsbase = 0;
 195        }
 196}
 197
 198static __always_inline void save_fsgs(struct task_struct *task)
 199{
 200        savesegment(fs, task->thread.fsindex);
 201        savesegment(gs, task->thread.gsindex);
 202        save_base_legacy(task, task->thread.fsindex, FS);
 203        save_base_legacy(task, task->thread.gsindex, GS);
 204}
 205
 206#if IS_ENABLED(CONFIG_KVM)
 207/*
  208 * While a process is running, current->thread.fsbase and current->thread.gsbase
 209 * may not match the corresponding CPU registers (see save_base_legacy()). KVM
 210 * wants an efficient way to save and restore FSBASE and GSBASE.
 211 * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
 212 */
 213void save_fsgs_for_kvm(void)
 214{
 215        save_fsgs(current);
 216}
 217EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
 218#endif
 219
 220static __always_inline void loadseg(enum which_selector which,
 221                                    unsigned short sel)
 222{
 223        if (which == FS)
 224                loadsegment(fs, sel);
 225        else
 226                load_gs_index(sel);
 227}
 228
 229static __always_inline void load_seg_legacy(unsigned short prev_index,
 230                                            unsigned long prev_base,
 231                                            unsigned short next_index,
 232                                            unsigned long next_base,
 233                                            enum which_selector which)
 234{
 235        if (likely(next_index <= 3)) {
 236                /*
 237                 * The next task is using 64-bit TLS, is not using this
 238                 * segment at all, or is having fun with arcane CPU features.
 239                 */
 240                if (next_base == 0) {
 241                        /*
 242                         * Nasty case: on AMD CPUs, we need to forcibly zero
 243                         * the base.
 244                         */
 245                        if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
 246                                loadseg(which, __USER_DS);
 247                                loadseg(which, next_index);
 248                        } else {
 249                                /*
 250                                 * We could try to exhaustively detect cases
 251                                 * under which we can skip the segment load,
 252                                 * but there's really only one case that matters
 253                                 * for performance: if both the previous and
 254                                 * next states are fully zeroed, we can skip
 255                                 * the load.
 256                                 *
 257                                 * (This assumes that prev_base == 0 has no
 258                                 * false positives.  This is the case on
 259                                 * Intel-style CPUs.)
 260                                 */
 261                                if (likely(prev_index | next_index | prev_base))
 262                                        loadseg(which, next_index);
 263                        }
 264                } else {
 265                        if (prev_index != next_index)
 266                                loadseg(which, next_index);
 267                        wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
 268                               next_base);
 269                }
 270        } else {
 271                /*
 272                 * The next task is using a real segment.  Loading the selector
 273                 * is sufficient.
 274                 */
 275                loadseg(which, next_index);
 276        }
 277}
 278
 279static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
 280                                              struct thread_struct *next)
 281{
 282        load_seg_legacy(prev->fsindex, prev->fsbase,
 283                        next->fsindex, next->fsbase, FS);
 284        load_seg_legacy(prev->gsindex, prev->gsbase,
 285                        next->gsindex, next->gsbase, GS);
 286}
 287
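/*
 * For example, a 32-bit task that set up TLS via set_thread_area() will
 * typically have a selector such as 0x63 in %fs or %gs: the TI bit
 * (SEGMENT_TI_MASK) is clear and 0x63 >> 3 == 12, which is GDT_ENTRY_TLS_MIN
 * on 64-bit kernels, so the base is read from task->thread.tls_array[0]
 * below.  Selectors with the TI bit set are looked up in the mm's LDT
 * instead, under context.lock.
 */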
 288static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
 289                                            unsigned short selector)
 290{
 291        unsigned short idx = selector >> 3;
 292        unsigned long base;
 293
 294        if (likely((selector & SEGMENT_TI_MASK) == 0)) {
 295                if (unlikely(idx >= GDT_ENTRIES))
 296                        return 0;
 297
 298                /*
 299                 * There are no user segments in the GDT with nonzero bases
 300                 * other than the TLS segments.
 301                 */
 302                if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
 303                        return 0;
 304
 305                idx -= GDT_ENTRY_TLS_MIN;
 306                base = get_desc_base(&task->thread.tls_array[idx]);
 307        } else {
 308#ifdef CONFIG_MODIFY_LDT_SYSCALL
 309                struct ldt_struct *ldt;
 310
 311                /*
 312                 * If performance here mattered, we could protect the LDT
 313                 * with RCU.  This is a slow path, though, so we can just
 314                 * take the mutex.
 315                 */
 316                mutex_lock(&task->mm->context.lock);
 317                ldt = task->mm->context.ldt;
  318        if (unlikely(!ldt || idx >= ldt->nr_entries))
 319                        base = 0;
 320                else
 321                        base = get_desc_base(ldt->entries + idx);
 322                mutex_unlock(&task->mm->context.lock);
 323#else
 324                base = 0;
 325#endif
 326        }
 327
 328        return base;
 329}
 330
 331unsigned long x86_fsbase_read_task(struct task_struct *task)
 332{
 333        unsigned long fsbase;
 334
 335        if (task == current)
 336                fsbase = x86_fsbase_read_cpu();
 337        else if (task->thread.fsindex == 0)
 338                fsbase = task->thread.fsbase;
 339        else
 340                fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
 341
 342        return fsbase;
 343}
 344
 345unsigned long x86_gsbase_read_task(struct task_struct *task)
 346{
 347        unsigned long gsbase;
 348
 349        if (task == current)
 350                gsbase = x86_gsbase_read_cpu_inactive();
 351        else if (task->thread.gsindex == 0)
 352                gsbase = task->thread.gsbase;
 353        else
 354                gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
 355
 356        return gsbase;
 357}
 358
 359void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
 360{
 361        WARN_ON_ONCE(task == current);
 362
 363        task->thread.fsbase = fsbase;
 364}
 365
 366void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
 367{
 368        WARN_ON_ONCE(task == current);
 369
 370        task->thread.gsbase = gsbase;
 371}
 372
 373static void
 374start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 375                    unsigned long new_sp,
 376                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
 377{
 378        WARN_ON_ONCE(regs != current_pt_regs());
 379
 380        if (static_cpu_has(X86_BUG_NULL_SEG)) {
 381                /* Loading zero below won't clear the base. */
 382                loadsegment(fs, __USER_DS);
 383                load_gs_index(__USER_DS);
 384        }
 385
 386        loadsegment(fs, 0);
 387        loadsegment(es, _ds);
 388        loadsegment(ds, _ds);
 389        load_gs_index(0);
 390
 391        regs->ip                = new_ip;
 392        regs->sp                = new_sp;
 393        regs->cs                = _cs;
 394        regs->ss                = _ss;
 395        regs->flags             = X86_EFLAGS_IF;
 396}
 397
 398void
 399start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 400{
 401        start_thread_common(regs, new_ip, new_sp,
 402                            __USER_CS, __USER_DS, 0);
 403}
 404EXPORT_SYMBOL_GPL(start_thread);
 405
 406#ifdef CONFIG_COMPAT
 407void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
 408{
 409        start_thread_common(regs, new_ip, new_sp,
 410                            test_thread_flag(TIF_X32)
 411                            ? __USER_CS : __USER32_CS,
 412                            __USER_DS, __USER_DS);
 413}
 414#endif
 415
 416/*
 417 *      switch_to(x,y) should switch tasks from x to y.
 418 *
 419 * This could still be optimized:
 420 * - fold all the options into a flag word and test it with a single test.
 421 * - could test fs/gs bitsliced
 422 *
  423 * Kprobes are not supported here; set the probe on schedule() instead.
  424 * The function graph tracer is not supported either.
 425 */
 426__visible __notrace_funcgraph struct task_struct *
 427__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 428{
 429        struct thread_struct *prev = &prev_p->thread;
 430        struct thread_struct *next = &next_p->thread;
 431        struct fpu *prev_fpu = &prev->fpu;
 432        struct fpu *next_fpu = &next->fpu;
 433        int cpu = smp_processor_id();
 434
 435        WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
 436                     this_cpu_read(irq_count) != -1);
 437
 438        if (!test_thread_flag(TIF_NEED_FPU_LOAD))
 439                switch_fpu_prepare(prev_fpu, cpu);
 440
 441        /* We must save %fs and %gs before load_TLS() because
 442         * %fs and %gs may be cleared by load_TLS().
 443         *
 444         * (e.g. xen_load_tls())
 445         */
 446        save_fsgs(prev_p);
 447
 448        /*
 449         * Load TLS before restoring any segments so that segment loads
 450         * reference the correct GDT entries.
 451         */
 452        load_TLS(next, cpu);
 453
 454        /*
 455         * Leave lazy mode, flushing any hypercalls made here.  This
 456         * must be done after loading TLS entries in the GDT but before
 457         * loading segments that might reference them.
 458         */
 459        arch_end_context_switch(next_p);
 460
 461        /* Switch DS and ES.
 462         *
 463         * Reading them only returns the selectors, but writing them (if
 464         * nonzero) loads the full descriptor from the GDT or LDT.  The
 465         * LDT for next is loaded in switch_mm, and the GDT is loaded
 466         * above.
 467         *
 468         * We therefore need to write new values to the segment
 469         * registers on every context switch unless both the new and old
 470         * values are zero.
 471         *
 472         * Note that we don't need to do anything for CS and SS, as
 473         * those are saved and restored as part of pt_regs.
 474         */
 475        savesegment(es, prev->es);
 476        if (unlikely(next->es | prev->es))
 477                loadsegment(es, next->es);
 478
 479        savesegment(ds, prev->ds);
 480        if (unlikely(next->ds | prev->ds))
 481                loadsegment(ds, next->ds);
 482
 483        x86_fsgsbase_load(prev, next);
 484
 485        /*
 486         * Switch the PDA and FPU contexts.
 487         */
 488        this_cpu_write(current_task, next_p);
 489        this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
 490
 491        switch_fpu_finish(next_fpu);
 492
 493        /* Reload sp0. */
 494        update_task_stack(next_p);
 495
 496        switch_to_extra(prev_p, next_p);
 497
 498        if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
 499                /*
 500                 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
 501                 * does not update the cached descriptor.  As a result, if we
 502                 * do SYSRET while SS is NULL, we'll end up in user mode with
 503                 * SS apparently equal to __USER_DS but actually unusable.
 504                 *
 505                 * The straightforward workaround would be to fix it up just
 506                 * before SYSRET, but that would slow down the system call
 507                 * fast paths.  Instead, we ensure that SS is never NULL in
 508                 * system call context.  We do this by replacing NULL SS
 509                 * selectors at every context switch.  SYSCALL sets up a valid
 510                 * SS, so the only way to get NULL is to re-enter the kernel
 511                 * from CPL 3 through an interrupt.  Since that can't happen
 512                 * in the same task as a running syscall, we are guaranteed to
 513                 * context switch between every interrupt vector entry and a
 514                 * subsequent SYSRET.
 515                 *
 516                 * We read SS first because SS reads are much faster than
 517                 * writes.  Out of caution, we force SS to __KERNEL_DS even if
 518                 * it previously had a different non-NULL value.
 519                 */
 520                unsigned short ss_sel;
 521                savesegment(ss, ss_sel);
 522                if (ss_sel != __KERNEL_DS)
 523                        loadsegment(ss, __KERNEL_DS);
 524        }
 525
 526        /* Load the Intel cache allocation PQR MSR. */
 527        resctrl_sched_in();
 528
 529        return prev_p;
 530}
 531
 532void set_personality_64bit(void)
 533{
 534        /* inherit personality from parent */
 535
 536        /* Make sure to be in 64bit mode */
 537        clear_thread_flag(TIF_IA32);
 538        clear_thread_flag(TIF_ADDR32);
 539        clear_thread_flag(TIF_X32);
 540        /* Pretend that this comes from a 64bit execve */
 541        task_pt_regs(current)->orig_ax = __NR_execve;
 542        current_thread_info()->status &= ~TS_COMPAT;
 543
 544        /* Ensure the corresponding mm is not marked. */
 545        if (current->mm)
 546                current->mm->context.ia32_compat = 0;
 547
 548        /* TBD: overwrites user setup. Should have two bits.
 549           But 64bit processes have always behaved this way,
 550           so it's not too bad. The main problem is just that
 551           32bit children are affected again. */
 552        current->personality &= ~READ_IMPLIES_EXEC;
 553}
 554
 555static void __set_personality_x32(void)
 556{
 557#ifdef CONFIG_X86_X32
 558        clear_thread_flag(TIF_IA32);
 559        set_thread_flag(TIF_X32);
 560        if (current->mm)
 561                current->mm->context.ia32_compat = TIF_X32;
 562        current->personality &= ~READ_IMPLIES_EXEC;
 563        /*
 564         * in_32bit_syscall() uses the presence of the x32 syscall bit
 565         * flag to determine compat status.  The x86 mmap() code relies on
 566         * the syscall bitness so set x32 syscall bit right here to make
 567         * in_32bit_syscall() work during exec().
 568         *
  569         * Pretend to come from an x32 execve.
 570         */
 571        task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
 572        current_thread_info()->status &= ~TS_COMPAT;
 573#endif
 574}
 575
 576static void __set_personality_ia32(void)
 577{
 578#ifdef CONFIG_IA32_EMULATION
 579        set_thread_flag(TIF_IA32);
 580        clear_thread_flag(TIF_X32);
 581        if (current->mm)
 582                current->mm->context.ia32_compat = TIF_IA32;
 583        current->personality |= force_personality32;
 584        /* Prepare the first "return" to user space */
 585        task_pt_regs(current)->orig_ax = __NR_ia32_execve;
 586        current_thread_info()->status |= TS_COMPAT;
 587#endif
 588}
 589
 590void set_personality_ia32(bool x32)
 591{
 592        /* Make sure to be in 32bit mode */
 593        set_thread_flag(TIF_ADDR32);
 594
 595        if (x32)
 596                __set_personality_x32();
 597        else
 598                __set_personality_ia32();
 599}
 600EXPORT_SYMBOL_GPL(set_personality_ia32);
 601
 602#ifdef CONFIG_CHECKPOINT_RESTORE
 603static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
 604{
 605        int ret;
 606
 607        ret = map_vdso_once(image, addr);
 608        if (ret)
 609                return ret;
 610
 611        return (long)image->size;
 612}
 613#endif
 614
 615long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 616{
 617        int ret = 0;
 618
 619        switch (option) {
 620        case ARCH_SET_GS: {
 621                if (unlikely(arg2 >= TASK_SIZE_MAX))
 622                        return -EPERM;
 623
 624                preempt_disable();
 625                /*
 626                 * ARCH_SET_GS has always overwritten the index
 627                 * and the base. Zero is the most sensible value
 628                 * to put in the index, and is the only value that
 629                 * makes any sense if FSGSBASE is unavailable.
 630                 */
 631                if (task == current) {
 632                        loadseg(GS, 0);
 633                        x86_gsbase_write_cpu_inactive(arg2);
 634
 635                        /*
 636                         * On non-FSGSBASE systems, save_base_legacy() expects
 637                         * that we also fill in thread.gsbase.
 638                         */
 639                        task->thread.gsbase = arg2;
 640
 641                } else {
 642                        task->thread.gsindex = 0;
 643                        x86_gsbase_write_task(task, arg2);
 644                }
 645                preempt_enable();
 646                break;
 647        }
 648        case ARCH_SET_FS: {
 649                /*
 650                 * Not strictly needed for %fs, but do it for symmetry
 651                 * with %gs
 652                 */
 653                if (unlikely(arg2 >= TASK_SIZE_MAX))
 654                        return -EPERM;
 655
 656                preempt_disable();
 657                /*
 658                 * Set the selector to 0 for the same reason
 659                 * as %gs above.
 660                 */
 661                if (task == current) {
 662                        loadseg(FS, 0);
 663                        x86_fsbase_write_cpu(arg2);
 664
 665                        /*
 666                         * On non-FSGSBASE systems, save_base_legacy() expects
 667                         * that we also fill in thread.fsbase.
 668                         */
 669                        task->thread.fsbase = arg2;
 670                } else {
 671                        task->thread.fsindex = 0;
 672                        x86_fsbase_write_task(task, arg2);
 673                }
 674                preempt_enable();
 675                break;
 676        }
 677        case ARCH_GET_FS: {
 678                unsigned long base = x86_fsbase_read_task(task);
 679
 680                ret = put_user(base, (unsigned long __user *)arg2);
 681                break;
 682        }
 683        case ARCH_GET_GS: {
 684                unsigned long base = x86_gsbase_read_task(task);
 685
 686                ret = put_user(base, (unsigned long __user *)arg2);
 687                break;
 688        }
 689
 690#ifdef CONFIG_CHECKPOINT_RESTORE
 691# ifdef CONFIG_X86_X32_ABI
 692        case ARCH_MAP_VDSO_X32:
 693                return prctl_map_vdso(&vdso_image_x32, arg2);
 694# endif
 695# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
 696        case ARCH_MAP_VDSO_32:
 697                return prctl_map_vdso(&vdso_image_32, arg2);
 698# endif
 699        case ARCH_MAP_VDSO_64:
 700                return prctl_map_vdso(&vdso_image_64, arg2);
 701#endif
 702
 703        default:
 704                ret = -EINVAL;
 705                break;
 706        }
 707
 708        return ret;
 709}
 710
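/*
 * For reference: userspace reaches do_arch_prctl_64() through the
 * arch_prctl(2) syscall defined below.  A minimal userspace sketch (a
 * separate program, not part of this file), assuming the raw syscall(2)
 * interface and the uapi <asm/prctl.h> constants:
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long fsbase;
 *
 *		// ARCH_GET_FS stores the current FS base via put_user().
 *		if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase))
 *			return 1;
 *		printf("FS base: %#lx\n", fsbase);
 *		return 0;
 *	}
 */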
 711SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
 712{
 713        long ret;
 714
 715        ret = do_arch_prctl_64(current, option, arg2);
 716        if (ret == -EINVAL)
 717                ret = do_arch_prctl_common(current, option, arg2);
 718
 719        return ret;
 720}
 721
 722#ifdef CONFIG_IA32_EMULATION
 723COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
 724{
 725        return do_arch_prctl_common(current, option, arg2);
 726}
 727#endif
 728
 729unsigned long KSTK_ESP(struct task_struct *task)
 730{
 731        return task_pt_regs(task)->sp;
 732}
 733