/*
 *  mm/userfaultfd.c
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 *  This work is licensed under the terms of the GNU GPL, version 2. See
 *  the COPYING file in the top-level directory.
 */

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
#include "internal.h"

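/*
 * Install one freshly allocated anonymous page at dst_addr, filled with
 * PAGE_SIZE bytes copied from src_addr.  The copy is done through a
 * kmap_atomic() mapping, so it must not fault: if it does, the page is
 * handed back to the caller through *pagep and -EFAULT is returned, and
 * the caller redoes the copy with mmap_sem released before calling here
 * again with the same page.  The pte is only set if it is still
 * pte_none(), otherwise -EEXIST is returned.
 */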
static int mcopy_atomic_pte(struct mm_struct *dst_mm,
                            pmd_t *dst_pmd,
                            struct vm_area_struct *dst_vma,
                            unsigned long dst_addr,
                            unsigned long src_addr,
                            struct page **pagep)
{
        struct mem_cgroup *memcg;
        pte_t _dst_pte, *dst_pte;
        spinlock_t *ptl;
        void *page_kaddr;
        int ret;
        struct page *page;

        if (!*pagep) {
                ret = -ENOMEM;
                page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
                if (!page)
                        goto out;

                page_kaddr = kmap_atomic(page);
                ret = copy_from_user(page_kaddr,
                                     (const void __user *) src_addr,
                                     PAGE_SIZE);
                kunmap_atomic(page_kaddr);

                /* fallback to copy_from_user outside mmap_sem */
                if (unlikely(ret)) {
                        ret = -EFAULT;
                        *pagep = page;
                        /* don't free the page */
                        goto out;
                }
        } else {
                page = *pagep;
                *pagep = NULL;
        }

        /*
         * The memory barrier inside __SetPageUptodate makes sure that
         * preceding stores to the page contents become visible before
         * the set_pte_at() write.
         */
        __SetPageUptodate(page);

        ret = -ENOMEM;
        if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
                goto out_release;

        _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
        if (dst_vma->vm_flags & VM_WRITE)
                _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));

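        /*
         * Map and lock the destination pte: the new page is only
         * installed if the pte is still none, so an already mapped page
         * is never silently overwritten; otherwise everything is backed
         * out and -EEXIST is returned.
         */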
        ret = -EEXIST;
        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
        if (!pte_none(*dst_pte))
                goto out_release_uncharge_unlock;

        inc_mm_counter(dst_mm, MM_ANONPAGES);
        page_add_new_anon_rmap(page, dst_vma, dst_addr);
        mem_cgroup_commit_charge(page, memcg, false);
        lru_cache_add_active_or_unevictable(page, dst_vma);

        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache(dst_vma, dst_addr, dst_pte);

        pte_unmap_unlock(dst_pte, ptl);
        ret = 0;
out:
        return ret;
out_release_uncharge_unlock:
        pte_unmap_unlock(dst_pte, ptl);
        mem_cgroup_cancel_charge(page, memcg);
out_release:
        page_cache_release(page);
        goto out;
}

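/*
 * Fill dst_addr with the zero page: a special, read-only pte pointing at
 * the zero pfn is installed, again only if the pte is still none.  No
 * page is allocated and no memcg charge is taken.
 */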
static int mfill_zeropage_pte(struct mm_struct *dst_mm,
                              pmd_t *dst_pmd,
                              struct vm_area_struct *dst_vma,
                              unsigned long dst_addr)
{
        pte_t _dst_pte, *dst_pte;
        spinlock_t *ptl;
        int ret;

        _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
                                         dst_vma->vm_page_prot));
        ret = -EEXIST;
        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
        if (!pte_none(*dst_pte))
                goto out_unlock;
        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(dst_vma, dst_addr, dst_pte);
        ret = 0;
out_unlock:
        pte_unmap_unlock(dst_pte, ptl);
        return ret;
}

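/*
 * Walk (and if necessary allocate) the page table levels down to the pmd
 * covering @address.  Returns NULL if a pud or pmd could not be
 * allocated.  The pte level is left to the caller.
 */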
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd = NULL;

        pgd = pgd_offset(mm, address);
        pud = pud_alloc(mm, pgd, address);
        if (pud)
                /*
                 * Note that this doesn't necessarily run because the
                 * pmd was missing: *pmd may already be established,
                 * and it may even be a trans_huge_pmd.
                 */
                pmd = pmd_alloc(mm, pud, address);
        return pmd;
}

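/*
 * __mcopy_atomic() is the common backend for the UFFDIO_COPY and
 * UFFDIO_ZEROPAGE userfaultfd ioctls.  It validates the destination
 * range, takes mmap_sem for reading, checks that the range lies within a
 * single private, userfaultfd-registered anonymous vma, and then fills
 * it one page at a time, either by copying from src_start or with the
 * zero page.  It returns the number of bytes copied, or an error if
 * nothing was copied.
 */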
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
                                              unsigned long dst_start,
                                              unsigned long src_start,
                                              unsigned long len,
                                              bool zeropage)
{
        struct vm_area_struct *dst_vma;
        ssize_t err;
        pmd_t *dst_pmd;
        unsigned long src_addr, dst_addr;
        long copied;
        struct page *page;

        /*
         * Sanitize the command parameters:
         */
        BUG_ON(dst_start & ~PAGE_MASK);
        BUG_ON(len & ~PAGE_MASK);

        /* Does the address range wrap, or is the span zero-sized? */
        BUG_ON(src_start + len <= src_start);
        BUG_ON(dst_start + len <= dst_start);

        src_addr = src_start;
        dst_addr = dst_start;
        copied = 0;
        page = NULL;
retry:
        down_read(&dst_mm->mmap_sem);

        /*
         * Make sure the vma is not shared, and that the dst range is
         * both valid and fully within a single existing vma.
         */
        err = -EINVAL;
        dst_vma = find_vma(dst_mm, dst_start);
        if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
                goto out_unlock;
        if (dst_start < dst_vma->vm_start ||
            dst_start + len > dst_vma->vm_end)
                goto out_unlock;

        /*
         * Be strict and only allow __mcopy_atomic on userfaultfd
         * registered ranges to prevent userland errors going
         * unnoticed. As far as the VM consistency is concerned, it
         * would be perfectly safe to remove this check, but there's
         * no useful usage for __mcopy_atomic outside of userfaultfd
         * registered ranges. This is after all why these are ioctls
         * belonging to the userfaultfd and not syscalls.
         */
        if (!dst_vma->vm_userfaultfd_ctx.ctx)
                goto out_unlock;

        /*
         * FIXME: only allow copying on anonymous vmas; tmpfs should
         * be added.
         */
        if (dst_vma->vm_ops)
                goto out_unlock;

        /*
         * Ensure the dst_vma has an anon_vma, or this page
         * would get a NULL anon_vma when moved into the
         * dst_vma.
         */
        err = -ENOMEM;
        if (unlikely(anon_vma_prepare(dst_vma)))
                goto out_unlock;

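        /*
         * Fill the range one page at a time.  For each destination
         * address the pmd (and, if needed, the pte page table) is
         * allocated first; transparent huge pmds are never overwritten
         * and instead stop the loop.
         */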
        while (src_addr < src_start + len) {
                pmd_t dst_pmdval;

                BUG_ON(dst_addr >= dst_start + len);

                dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
                if (unlikely(!dst_pmd)) {
                        err = -ENOMEM;
                        break;
                }

                dst_pmdval = pmd_read_atomic(dst_pmd);
                /*
                 * If the dst_pmd is mapped as THP don't
                 * override it and just be strict.
                 */
                if (unlikely(pmd_trans_huge(dst_pmdval))) {
                        err = -EEXIST;
                        break;
                }
                if (unlikely(pmd_none(dst_pmdval)) &&
                    unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd,
                                         dst_addr))) {
                        err = -ENOMEM;
                        break;
                }
                /* If a huge pmd materialized from under us, fail */
                if (unlikely(pmd_trans_huge(*dst_pmd))) {
                        err = -EFAULT;
                        break;
                }

                BUG_ON(pmd_none(*dst_pmd));
                BUG_ON(pmd_trans_huge(*dst_pmd));

                if (!zeropage)
                        err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
                                               dst_addr, src_addr, &page);
                else
                        err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
                                                 dst_addr);

                cond_resched();

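                /*
                 * -EFAULT from mcopy_atomic_pte() means the atomic
                 * copy_from_user() faulted on the source: drop mmap_sem,
                 * redo the copy with a sleeping kmap() so the source
                 * page can be faulted in, then retry the whole operation
                 * with the already filled page.
                 */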
                if (unlikely(err == -EFAULT)) {
                        void *page_kaddr;

                        up_read(&dst_mm->mmap_sem);
                        BUG_ON(!page);

                        page_kaddr = kmap(page);
                        err = copy_from_user(page_kaddr,
                                             (const void __user *) src_addr,
                                             PAGE_SIZE);
                        kunmap(page);
                        if (unlikely(err)) {
                                err = -EFAULT;
                                goto out;
                        }
                        goto retry;
                } else
                        BUG_ON(page);

                if (!err) {
                        dst_addr += PAGE_SIZE;
                        src_addr += PAGE_SIZE;
                        copied += PAGE_SIZE;

                        if (fatal_signal_pending(current))
                                err = -EINTR;
                }
                if (err)
                        break;
        }

out_unlock:
        up_read(&dst_mm->mmap_sem);
out:
        if (page)
                page_cache_release(page);
        BUG_ON(copied < 0);
        BUG_ON(err > 0);
        BUG_ON(!copied && !err);
        return copied ? copied : err;
}

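/*
 * mcopy_atomic() backs the userfaultfd UFFDIO_COPY ioctl: it fills
 * [dst_start, dst_start + len) in dst_mm with data copied from src_start
 * in the current process, without ever overwriting already mapped pages.
 */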
ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
                     unsigned long src_start, unsigned long len)
{
        return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
}

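/*
 * mfill_zeropage() backs the userfaultfd UFFDIO_ZEROPAGE ioctl: it maps
 * the zero page over [start, start + len) in dst_mm, again without
 * overwriting already mapped pages.
 */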
ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
                       unsigned long len)
{
        return __mcopy_atomic(dst_mm, start, 0, len, true);
}