linux/arch/x86/entry/vsyscall/vsyscall_64.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net>
   4 *
   5 * Based on the original implementation which is:
   6 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
   7 *  Copyright 2003 Andi Kleen, SuSE Labs.
   8 *
   9 *  Parts of the original code have been moved to arch/x86/vdso/vma.c
  10 *
  11 * This file implements vsyscall emulation.  vsyscalls are a legacy ABI:
  12 * Userspace can request certain kernel services by calling fixed
  13 * addresses.  This concept is problematic:
  14 *
  15 * - It interferes with ASLR.
  16 * - It's awkward to write code that lives in kernel addresses but is
  17 *   callable by userspace at fixed addresses.
  18 * - The whole concept is impossible for 32-bit compat userspace.
  19 * - UML cannot easily virtualize a vsyscall.
  20 *
  21 * As of mid-2014, I believe that there is no new userspace code that
  22 * will use a vsyscall if the vDSO is present.  I hope that there will
  23 * soon be no new userspace code that will ever use a vsyscall.
  24 *
  25 * The code in this file emulates vsyscalls when notified of a page
  26 * fault to a vsyscall address.
  27 */
  28
  29#include <linux/kernel.h>
  30#include <linux/timer.h>
  31#include <linux/sched/signal.h>
  32#include <linux/mm_types.h>
  33#include <linux/syscalls.h>
  34#include <linux/ratelimit.h>
  35
  36#include <asm/vsyscall.h>
  37#include <asm/unistd.h>
  38#include <asm/fixmap.h>
  39#include <asm/traps.h>
  40
  41#define CREATE_TRACE_POINTS
  42#include "vsyscall_trace.h"
  43
/*
 * Currently selected vsyscall handling mode.
 *
 * The initial value is picked at build time: NATIVE when
 * CONFIG_LEGACY_VSYSCALL_NATIVE is defined, NONE when
 * CONFIG_LEGACY_VSYSCALL_NONE is defined, and EMULATE otherwise.
 */
static enum { EMULATE, NATIVE, NONE } vsyscall_mode =
#if defined(CONFIG_LEGACY_VSYSCALL_NATIVE)
        NATIVE;
#elif defined(CONFIG_LEGACY_VSYSCALL_NONE)
        NONE;
#else
        EMULATE;
#endif
  52
  53static int __init vsyscall_setup(char *str)
  54{
  55        if (str) {
  56                if (!strcmp("emulate", str))
  57                        vsyscall_mode = EMULATE;
  58                else if (!strcmp("native", str))
  59                        vsyscall_mode = NATIVE;
  60                else if (!strcmp("none", str))
  61                        vsyscall_mode = NONE;
  62                else
  63                        return -EINVAL;
  64
  65                return 0;
  66        }
  67
  68        return -EINVAL;
  69}
  70early_param("vsyscall", vsyscall_setup);
  71
  72static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
  73                              const char *message)
  74{
  75        if (!show_unhandled_signals)
  76                return;
  77
  78        printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
  79                           level, current->comm, task_pid_nr(current),
  80                           message, regs->ip, regs->cs,
  81                           regs->sp, regs->ax, regs->si, regs->di);
  82}
  83
  84static int addr_to_vsyscall_nr(unsigned long addr)
  85{
  86        int nr;
  87
  88        if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
  89                return -EINVAL;
  90
  91        nr = (addr & 0xC00UL) >> 10;
  92        if (nr >= 3)
  93                return -EINVAL;
  94
  95        return nr;
  96}
  97
  98static bool write_ok_or_segv(unsigned long ptr, size_t size)
  99{
 100        /*
 101         * XXX: if access_ok, get_user, and put_user handled
 102         * sig_on_uaccess_err, this could go away.
 103         */
 104
 105        if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
 106                siginfo_t info;
 107                struct thread_struct *thread = &current->thread;
 108
 109                thread->error_code      = 6;  /* user fault, no page, write */
 110                thread->cr2             = ptr;
 111                thread->trap_nr         = X86_TRAP_PF;
 112
 113                memset(&info, 0, sizeof(info));
 114                info.si_signo           = SIGSEGV;
 115                info.si_errno           = 0;
 116                info.si_code            = SEGV_MAPERR;
 117                info.si_addr            = (void __user *)ptr;
 118
 119                force_sig_info(SIGSEGV, &info, current);
 120                return false;
 121        } else {
 122                return true;
 123        }
 124}
 125
/*
 * emulate_vsyscall - emulate a legacy vsyscall on behalf of user code
 * @regs:    user register state at the time of the trap
 * @address: address that caused the trap; expected to equal regs->ip
 *
 * Maps @address to one of three vsyscall slots (0: gettimeofday, 1: time,
 * 2: getcpu), pre-checks the user pointers in regs->di / regs->si, runs
 * seccomp, calls the matching syscall implementation, stores the result in
 * regs->ax and finally emulates the "ret" by loading regs->ip with the
 * return address read from the user stack and bumping regs->sp.
 *
 * Returns false only when vsyscall_mode is NONE.  Every other path that
 * returns yields true, including the paths where a signal has been raised
 * instead of completing the call.  The seccomp-violation path does not
 * return at all (do_exit).
 */
bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
        struct task_struct *tsk;
        unsigned long caller;
        int vsyscall_nr, syscall_nr, tmp;
        int prev_sig_on_uaccess_err;
        long ret;

        /*
         * No point in checking CS -- the only way to get here is a user mode
         * trap to a high address, which means that we're in 64-bit user code.
         */

        WARN_ON_ONCE(address != regs->ip);

        if (vsyscall_mode == NONE) {
                warn_bad_vsyscall(KERN_INFO, regs,
                                  "vsyscall attempted with vsyscall=none");
                return false;
        }

        vsyscall_nr = addr_to_vsyscall_nr(address);

        /* Traced before validation, so negative (invalid) numbers show up too. */
        trace_emulate_vsyscall(vsyscall_nr);

        if (vsyscall_nr < 0) {
                warn_bad_vsyscall(KERN_WARNING, regs,
                                  "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
                goto sigsegv;
        }

        /* Fetch the return address now; it is only consumed at do_ret. */
        if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
                warn_bad_vsyscall(KERN_WARNING, regs,
                                  "vsyscall with bad stack (exploit attempt?)");
                goto sigsegv;
        }

        tsk = current;

        /*
         * Check for access_ok violations and find the syscall nr.
         *
         * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
         * 64-bit, so we don't need to special-case it here.  For all the
         * vsyscalls, NULL means "don't write anything" not "write it at
         * address 0".
         */
        switch (vsyscall_nr) {
        case 0:
                if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
                    !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
                        ret = -EFAULT;
                        goto check_fault;
                }

                syscall_nr = __NR_gettimeofday;
                break;

        case 1:
                if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
                        ret = -EFAULT;
                        goto check_fault;
                }

                syscall_nr = __NR_time;
                break;

        case 2:
                if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
                    !write_ok_or_segv(regs->si, sizeof(unsigned))) {
                        ret = -EFAULT;
                        goto check_fault;
                }

                syscall_nr = __NR_getcpu;
                break;
        }

        /*
         * Handle seccomp.  regs->ip must be the original value.
         * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
         *
         * We could optimize the seccomp disabled case, but performance
         * here doesn't matter.
         */
        regs->orig_ax = syscall_nr;
        regs->ax = -ENOSYS;
        tmp = secure_computing(NULL);
        /* A changed syscall number (when not skipping) or ip is fatal. */
        if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
                warn_bad_vsyscall(KERN_DEBUG, regs,
                                  "seccomp tried to change syscall nr or ip");
                do_exit(SIGSYS);
        }
        regs->orig_ax = -1;
        /* Non-zero tmp: do not run the syscall and leave regs->ax as it is. */
        if (tmp)
                goto do_ret;  /* skip requested */

        /*
         * With a real vsyscall, page faults cause SIGSEGV.  We want to
         * preserve that behavior to make writing exploits harder.
         */
        prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err;
        current->thread.sig_on_uaccess_err = 1;

        /* Default result in case no case below matches. */
        ret = -EFAULT;
        switch (vsyscall_nr) {
        case 0:
                ret = sys_gettimeofday(
                        (struct timeval __user *)regs->di,
                        (struct timezone __user *)regs->si);
                break;

        case 1:
                ret = sys_time((time_t __user *)regs->di);
                break;

        case 2:
                ret = sys_getcpu((unsigned __user *)regs->di,
                                 (unsigned __user *)regs->si,
                                 NULL);
                break;
        }

        /* Restore the flag to whatever it was before the call. */
        current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err;

        /*
         * Reached either by falling through after the syscall, or directly
         * from the access_ok pre-checks above with ret == -EFAULT.
         */
check_fault:
        if (ret == -EFAULT) {
                /* Bad news -- userspace fed a bad pointer to a vsyscall. */
                warn_bad_vsyscall(KERN_INFO, regs,
                                  "vsyscall fault (exploit attempt?)");

                /*
                 * If we failed to generate a signal for any reason,
                 * generate one here.  (This should be impossible.)
                 */
                if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
                                 !sigismember(&tsk->pending.signal, SIGSEGV)))
                        goto sigsegv;

                return true;  /* Don't emulate the ret. */
        }

        regs->ax = ret;

do_ret:
        /* Emulate a ret instruction. */
        regs->ip = caller;
        /* Step the stack pointer past the return address read earlier. */
        regs->sp += 8;
        return true;

sigsegv:
        force_sig(SIGSEGV, current);
        return true;
}
 280
/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */

/* Name callback for the pseudo VMA; always reports "[vsyscall]". */
static const char *gate_vma_name(struct vm_area_struct *vma)
{
        return "[vsyscall]";
}

/* The only operation the pseudo VMA provides is its name. */
static const struct vm_operations_struct gate_vma_ops = {
        .name = gate_vma_name,
};

/*
 * The pseudo VMA itself: exactly one page starting at VSYSCALL_ADDR,
 * flagged readable and executable but not writable.
 */
static struct vm_area_struct gate_vma = {
        .vm_start       = VSYSCALL_ADDR,
        .vm_end         = VSYSCALL_ADDR + PAGE_SIZE,
        .vm_page_prot   = PAGE_READONLY_EXEC,
        .vm_flags       = VM_READ | VM_EXEC,
        .vm_ops         = &gate_vma_ops,
};
 300
 301struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
 302{
 303#ifdef CONFIG_COMPAT
 304        if (!mm || mm->context.ia32_compat)
 305                return NULL;
 306#endif
 307        if (vsyscall_mode == NONE)
 308                return NULL;
 309        return &gate_vma;
 310}
 311
 312int in_gate_area(struct mm_struct *mm, unsigned long addr)
 313{
 314        struct vm_area_struct *vma = get_gate_vma(mm);
 315
 316        if (!vma)
 317                return 0;
 318
 319        return (addr >= vma->vm_start) && (addr < vma->vm_end);
 320}
 321
 322/*
 323 * Use this when you have no reliable mm, typically from interrupt
 324 * context. It is less reliable than using a task's mm and may give
 325 * false positives.
 326 */
 327int in_gate_area_no_mm(unsigned long addr)
 328{
 329        return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
 330}
 331
 332void __init map_vsyscall(void)
 333{
 334        extern char __vsyscall_page;
 335        unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
 336
 337        if (vsyscall_mode != NONE)
 338                __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
 339                             vsyscall_mode == NATIVE
 340                             ? PAGE_KERNEL_VSYSCALL
 341                             : PAGE_KERNEL_VVAR);
 342
 343        BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
 344                     (unsigned long)VSYSCALL_ADDR);
 345}
 346