linux/arch/x86/entry/vdso/vma.c
/*
 * Copyright 2007 Andi Kleen, SUSE Labs.
 * Subject to the GPL, v.2
 *
 * This contains most of the x86 vDSO kernel-side code.
 */
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
#include <linux/ptrace.h>
#include <asm/pvclock.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
#include <asm/vvar.h>
#include <asm/page.h>
#include <asm/desc.h>
#include <asm/cpufeature.h>

#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
#endif

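/*
 * Sanity-check the image size and apply the alternative instructions
 * embedded in the vdso blob (image->alt .. image->alt + image->alt_len).
 */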
void __init init_vdso_image(const struct vdso_image *image)
{
        BUG_ON(image->size % PAGE_SIZE != 0);

        apply_alternatives((struct alt_instr *)(image->data + image->alt),
                           (struct alt_instr *)(image->data + image->alt +
                                                image->alt_len));
}

struct linux_binprm;

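/*
 * Fault handler for the [vdso] area: hand back the page of the
 * kernel-side vdso blob that corresponds to the faulting offset.
 */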
static int vdso_fault(const struct vm_special_mapping *sm,
                      struct vm_area_struct *vma, struct vm_fault *vmf)
{
        const struct vdso_image *image = vma->vm_mm->context.vdso_image;

        if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
                return VM_FAULT_SIGBUS;

        vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
        get_page(vmf->page);
        return 0;
}

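/*
 * If a 32-bit task is sitting at the int80 landing pad while its vdso
 * is being moved, point the saved regs->ip at the landing pad in the
 * new mapping so the return to userspace still lands in the vdso.
 */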
static void vdso_fix_landing(const struct vdso_image *image,
                struct vm_area_struct *new_vma)
{
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
        if (in_ia32_syscall() && image == &vdso_image_32) {
                struct pt_regs *regs = current_pt_regs();
                unsigned long vdso_land = image->sym_int80_landing_pad;
                unsigned long old_land_addr = vdso_land +
                        (unsigned long)current->mm->context.vdso;

                /* Fix up the userspace landing address - see do_fast_syscall_32() */
                if (regs->ip == old_land_addr)
                        regs->ip = new_vma->vm_start + vdso_land;
        }
#endif
}

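/*
 * The vdso can be moved but not resized: reject size changes, then
 * record the new location of the vdso text in the mm context.
 */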
static int vdso_mremap(const struct vm_special_mapping *sm,
                struct vm_area_struct *new_vma)
{
        unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
        const struct vdso_image *image = current->mm->context.vdso_image;

        if (image->size != new_size)
                return -EINVAL;

        if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
                return -EFAULT;

        vdso_fix_landing(image, new_vma);
        current->mm->context.vdso = (void __user *)new_vma->vm_start;

        return 0;
}

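/*
 * Fault handler for the [vvar] area: insert the PFN of the vvar page
 * or, when pvclock is in use, the pvclock page at the faulting address.
 */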
static int vvar_fault(const struct vm_special_mapping *sm,
                      struct vm_area_struct *vma, struct vm_fault *vmf)
{
        const struct vdso_image *image = vma->vm_mm->context.vdso_image;
        long sym_offset;
        int ret = -EFAULT;

        if (!image)
                return VM_FAULT_SIGBUS;

        sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
                image->sym_vvar_start;

        /*
         * Sanity check: a symbol offset of zero means that the page
         * does not exist for this vdso image, not that the page is at
         * offset zero relative to the text mapping.  This should be
         * impossible here, because sym_offset should only be zero for
         * the page past the end of the vvar mapping.
         */
        if (sym_offset == 0)
                return VM_FAULT_SIGBUS;

        if (sym_offset == image->sym_vvar_page) {
                ret = vm_insert_pfn(vma, vmf->address,
                                    __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
        } else if (sym_offset == image->sym_pvclock_page) {
                struct pvclock_vsyscall_time_info *pvti =
                        pvclock_pvti_cpu0_va();
                if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
                        ret = vm_insert_pfn(
                                vma,
                                vmf->address,
                                __pa(pvti) >> PAGE_SHIFT);
                }
        }

        if (ret == 0 || ret == -EBUSY)
                return VM_FAULT_NOPAGE;

        return VM_FAULT_SIGBUS;
}

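/*
 * Special mapping descriptors for the "[vdso]" and "[vvar]" areas shown
 * in /proc/<pid>/maps.
 */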
static const struct vm_special_mapping vdso_mapping = {
        .name = "[vdso]",
        .fault = vdso_fault,
        .mremap = vdso_mremap,
};
static const struct vm_special_mapping vvar_mapping = {
        .name = "[vvar]",
        .fault = vvar_fault,
};

/*
 * Add vdso and vvar mappings to current process.
 * @image          - blob to map
 * @addr           - request a specific address (zero to map at free addr)
 */
static int map_vdso(const struct vdso_image *image, unsigned long addr)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long text_start;
        int ret = 0;

        if (down_write_killable(&mm->mmap_sem))
                return -EINTR;

        addr = get_unmapped_area(NULL, addr,
                                 image->size - image->sym_vvar_start, 0, 0);
        if (IS_ERR_VALUE(addr)) {
                ret = addr;
                goto up_fail;
        }

        text_start = addr - image->sym_vvar_start;

        /*
         * MAYWRITE to allow gdb to COW and set breakpoints
         */
        vma = _install_special_mapping(mm,
                                       text_start,
                                       image->size,
                                       VM_READ|VM_EXEC|
                                       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
                                       &vdso_mapping);

        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto up_fail;
        }

        vma = _install_special_mapping(mm,
                                       addr,
                                       -image->sym_vvar_start,
                                       VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
                                       VM_PFNMAP,
                                       &vvar_mapping);

        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                do_munmap(mm, text_start, image->size);
        } else {
                current->mm->context.vdso = (void __user *)text_start;
                current->mm->context.vdso_image = image;
        }

up_fail:
        up_write(&mm->mmap_sem);
        return ret;
}

#ifdef CONFIG_X86_64
/*
 * Put the vdso above the (randomized) stack with another randomized
 * offset.  This way there is no hole in the middle of address space.
 * To save memory make sure it is still in the same PTE as the stack
 * top.  This doesn't give that many random bits.
 *
 * Note that this algorithm is imperfect: the distribution of the vdso
 * start address within a PMD is biased toward the end.
 *
 * Only used for the 64-bit and x32 vdsos.
 */
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
        unsigned long addr, end;
        unsigned offset;

        /*
         * Round up the start address.  It can start out unaligned as a result
         * of stack start randomization.
         */
        start = PAGE_ALIGN(start);

        /* Round the lowest possible end address up to a PMD boundary. */
        end = (start + len + PMD_SIZE - 1) & PMD_MASK;
        if (end >= TASK_SIZE_MAX)
                end = TASK_SIZE_MAX;
        end -= len;

        if (end > start) {
                offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
                addr = start + (offset << PAGE_SHIFT);
        } else {
                addr = start;
        }

        /*
         * Forcibly align the final address in case we have a hardware
         * issue that requires alignment for performance reasons.
         */
        addr = align_vdso_addr(addr);

        return addr;
}

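/* Map the vdso at a randomized address above the stack (64-bit and x32). */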
static int map_vdso_randomized(const struct vdso_image *image)
{
        unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start);

        return map_vdso(image, addr);
}
#endif

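/*
 * Map the vdso at a caller-supplied address, but only if this mm does
 * not already have a vdso or vvar mapping.
 */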
int map_vdso_once(const struct vdso_image *image, unsigned long addr)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;

        down_write(&mm->mmap_sem);
        /*
         * Check if we have already mapped the vdso blob - fail to prevent
         * userspace from abusing install_special_mapping(), which may not
         * do accounting and rlimits correctly.
         * We could search the VMA near context.vdso, but this is a slow
         * path, so let's explicitly check all VMAs to be completely sure.
         */
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                if (vma_is_special_mapping(vma, &vdso_mapping) ||
                                vma_is_special_mapping(vma, &vvar_mapping)) {
                        up_write(&mm->mmap_sem);
                        return -EEXIST;
                }
        }
        up_write(&mm->mmap_sem);

        return map_vdso(image, addr);
}

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
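/* Map the 32-bit vdso unless it has been disabled via vdso32_enabled. */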
static int load_vdso32(void)
{
        if (vdso32_enabled != 1)  /* Other values all mean "disabled" */
                return 0;

        return map_vdso(&vdso_image_32, 0);
}
#endif

#ifdef CONFIG_X86_64
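/*
 * Hook called by the ELF loader at exec time to map the vdso into the
 * new process.
 */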
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
        if (!vdso64_enabled)
                return 0;

        return map_vdso_randomized(&vdso_image_64);
}

#ifdef CONFIG_COMPAT
int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
                                       int uses_interp)
{
#ifdef CONFIG_X86_X32_ABI
        if (test_thread_flag(TIF_X32)) {
                if (!vdso64_enabled)
                        return 0;
                return map_vdso_randomized(&vdso_image_x32);
        }
#endif
#ifdef CONFIG_IA32_EMULATION
        return load_vdso32();
#else
        return 0;
#endif
}
#endif
#else
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
        return load_vdso32();
}
#endif

#ifdef CONFIG_X86_64
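/* Handle the "vdso=" kernel command-line parameter. */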
static __init int vdso_setup(char *s)
{
        vdso64_enabled = simple_strtoul(s, NULL, 0);
        return 0;
}
__setup("vdso=", vdso_setup);
#endif

#ifdef CONFIG_X86_64
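/*
 * Publish this CPU's cpu and node numbers where the vdso's vgetcpu()
 * can read them: in TSC_AUX when RDTSCP is available, and in the limit
 * field of the per-CPU GDT segment in either case.
 */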
static void vgetcpu_cpu_init(void *arg)
{
        int cpu = smp_processor_id();
        struct desc_struct d = { };
        unsigned long node = 0;
#ifdef CONFIG_NUMA
        node = cpu_to_node(cpu);
#endif
        if (static_cpu_has(X86_FEATURE_RDTSCP))
                write_rdtscp_aux((node << 12) | cpu);

        /*
         * Store cpu number in limit so that it can be loaded
         * quickly in user space in vgetcpu. (12 bits for the CPU
         * and 8 bits for the node)
         */
        d.limit0 = cpu | ((node & 0xf) << 12);
        d.limit = node >> 4;
        d.type = 5;             /* RO data, expand down, accessed */
        d.dpl = 3;              /* Visible to user code */
        d.s = 1;                /* Not a system segment */
        d.p = 1;                /* Present */
        d.d = 1;                /* 32-bit */

        write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}

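/* CPU hotplug callback: initialize vgetcpu state on the CPU coming online. */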
static int vgetcpu_online(unsigned int cpu)
{
        return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);
}

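/*
 * Boot-time setup: patch the 64-bit (and x32) vdso images and register
 * the hotplug callback that initializes vgetcpu state on each CPU.
 */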
static int __init init_vdso(void)
{
        init_vdso_image(&vdso_image_64);

#ifdef CONFIG_X86_X32_ABI
        init_vdso_image(&vdso_image_x32);
#endif

        /* notifier priority > KVM */
        return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE,
                                 "x86/vdso/vma:online", vgetcpu_online, NULL);
}
subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */