linux/mm/memfd.c
<<
>>
Prefs
   1/*
   2 * memfd_create system call and file sealing support
   3 *
   4 * Code was originally included in shmem.c, and broken out to facilitate
   5 * use by hugetlbfs as well as tmpfs.
   6 *
   7 * This file is released under the GPL.
   8 */
   9
  10#include <linux/fs.h>
  11#include <linux/vfs.h>
  12#include <linux/pagemap.h>
  13#include <linux/file.h>
  14#include <linux/mm.h>
  15#include <linux/sched/signal.h>
  16#include <linux/khugepaged.h>
  17#include <linux/syscalls.h>
  18#include <linux/hugetlb.h>
  19#include <linux/shmem_fs.h>
  20#include <linux/memfd.h>
  21#include <uapi/linux/memfd.h>
  22
  23/*
  24 * We need a tag: a new tag would expand every xa_node by 8 bytes,
  25 * so reuse a tag which we firmly believe is never set or cleared on tmpfs
  26 * or hugetlbfs because they are memory only filesystems.
  27 */
  28#define MEMFD_TAG_PINNED        PAGECACHE_TAG_TOWRITE
  29#define LAST_SCAN               4       /* about 150ms max */
  30
  31static void memfd_tag_pins(struct xa_state *xas)
  32{
  33        struct page *page;
  34        unsigned int tagged = 0;
  35
  36        lru_add_drain();
  37
  38        xas_lock_irq(xas);
  39        xas_for_each(xas, page, ULONG_MAX) {
  40                if (xa_is_value(page))
  41                        continue;
  42                if (page_count(page) - page_mapcount(page) > 1)
  43                        xas_set_mark(xas, MEMFD_TAG_PINNED);
  44
  45                if (++tagged % XA_CHECK_SCHED)
  46                        continue;
  47
  48                xas_pause(xas);
  49                xas_unlock_irq(xas);
  50                cond_resched();
  51                xas_lock_irq(xas);
  52        }
  53        xas_unlock_irq(xas);
  54}
  55
  56/*
  57 * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
  58 * via get_user_pages(), drivers might have some pending I/O without any active
  59 * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
  60 * and see whether it has an elevated ref-count. If so, we tag them and wait for
  61 * them to be dropped.
  62 * The caller must guarantee that no new user will acquire writable references
  63 * to those pages to avoid races.
  64 */
  65static int memfd_wait_for_pins(struct address_space *mapping)
  66{
  67        XA_STATE(xas, &mapping->i_pages, 0);
  68        struct page *page;
  69        int error, scan;
  70
  71        memfd_tag_pins(&xas);
  72
  73        error = 0;
  74        for (scan = 0; scan <= LAST_SCAN; scan++) {
  75                unsigned int tagged = 0;
  76
  77                if (!xas_marked(&xas, MEMFD_TAG_PINNED))
  78                        break;
  79
  80                if (!scan)
  81                        lru_add_drain_all();
  82                else if (schedule_timeout_killable((HZ << scan) / 200))
  83                        scan = LAST_SCAN;
  84
  85                xas_set(&xas, 0);
  86                xas_lock_irq(&xas);
  87                xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
  88                        bool clear = true;
  89                        if (xa_is_value(page))
  90                                continue;
  91                        if (page_count(page) - page_mapcount(page) != 1) {
  92                                /*
  93                                 * On the last scan, we clean up all those tags
  94                                 * we inserted; but make a note that we still
  95                                 * found pages pinned.
  96                                 */
  97                                if (scan == LAST_SCAN)
  98                                        error = -EBUSY;
  99                                else
 100                                        clear = false;
 101                        }
 102                        if (clear)
 103                                xas_clear_mark(&xas, MEMFD_TAG_PINNED);
 104                        if (++tagged % XA_CHECK_SCHED)
 105                                continue;
 106
 107                        xas_pause(&xas);
 108                        xas_unlock_irq(&xas);
 109                        cond_resched();
 110                        xas_lock_irq(&xas);
 111                }
 112                xas_unlock_irq(&xas);
 113        }
 114
 115        return error;
 116}
 117
 118static unsigned int *memfd_file_seals_ptr(struct file *file)
 119{
 120        if (shmem_file(file))
 121                return &SHMEM_I(file_inode(file))->seals;
 122
 123#ifdef CONFIG_HUGETLBFS
 124        if (is_file_hugepages(file))
 125                return &HUGETLBFS_I(file_inode(file))->seals;
 126#endif
 127
 128        return NULL;
 129}
 130
 131#define F_ALL_SEALS (F_SEAL_SEAL | \
 132                     F_SEAL_SHRINK | \
 133                     F_SEAL_GROW | \
 134                     F_SEAL_WRITE | \
 135                     F_SEAL_FUTURE_WRITE)
 136
 137static int memfd_add_seals(struct file *file, unsigned int seals)
 138{
 139        struct inode *inode = file_inode(file);
 140        unsigned int *file_seals;
 141        int error;
 142
 143        /*
 144         * SEALING
 145         * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
 146         * but restrict access to a specific subset of file operations. Seals
 147         * can only be added, but never removed. This way, mutually untrusted
 148         * parties can share common memory regions with a well-defined policy.
 149         * A malicious peer can thus never perform unwanted operations on a
 150         * shared object.
 151         *
 152         * Seals are only supported on special tmpfs or hugetlbfs files and
 153         * always affect the whole underlying inode. Once a seal is set, it
 154         * may prevent some kinds of access to the file. Currently, the
 155         * following seals are defined:
 156         *   SEAL_SEAL: Prevent further seals from being set on this file
 157         *   SEAL_SHRINK: Prevent the file from shrinking
 158         *   SEAL_GROW: Prevent the file from growing
 159         *   SEAL_WRITE: Prevent write access to the file
 160         *
 161         * As we don't require any trust relationship between two parties, we
 162         * must prevent seals from being removed. Therefore, sealing a file
 163         * only adds a given set of seals to the file, it never touches
 164         * existing seals. Furthermore, the "setting seals"-operation can be
 165         * sealed itself, which basically prevents any further seal from being
 166         * added.
 167         *
 168         * Semantics of sealing are only defined on volatile files. Only
 169         * anonymous tmpfs and hugetlbfs files support sealing. More
 170         * importantly, seals are never written to disk. Therefore, there's
 171         * no plan to support it on other file types.
 172         */
 173
 174        if (!(file->f_mode & FMODE_WRITE))
 175                return -EPERM;
 176        if (seals & ~(unsigned int)F_ALL_SEALS)
 177                return -EINVAL;
 178
 179        inode_lock(inode);
 180
 181        file_seals = memfd_file_seals_ptr(file);
 182        if (!file_seals) {
 183                error = -EINVAL;
 184                goto unlock;
 185        }
 186
 187        if (*file_seals & F_SEAL_SEAL) {
 188                error = -EPERM;
 189                goto unlock;
 190        }
 191
 192        if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
 193                error = mapping_deny_writable(file->f_mapping);
 194                if (error)
 195                        goto unlock;
 196
 197                error = memfd_wait_for_pins(file->f_mapping);
 198                if (error) {
 199                        mapping_allow_writable(file->f_mapping);
 200                        goto unlock;
 201                }
 202        }
 203
 204        *file_seals |= seals;
 205        error = 0;
 206
 207unlock:
 208        inode_unlock(inode);
 209        return error;
 210}
 211
 212static int memfd_get_seals(struct file *file)
 213{
 214        unsigned int *seals = memfd_file_seals_ptr(file);
 215
 216        return seals ? *seals : -EINVAL;
 217}
 218
 219long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 220{
 221        long error;
 222
 223        switch (cmd) {
 224        case F_ADD_SEALS:
 225                /* disallow upper 32bit */
 226                if (arg > UINT_MAX)
 227                        return -EINVAL;
 228
 229                error = memfd_add_seals(file, arg);
 230                break;
 231        case F_GET_SEALS:
 232                error = memfd_get_seals(file);
 233                break;
 234        default:
 235                error = -EINVAL;
 236                break;
 237        }
 238
 239        return error;
 240}
 241
 242#define MFD_NAME_PREFIX "memfd:"
 243#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
 244#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
 245
 246#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
 247
 248SYSCALL_DEFINE2(memfd_create,
 249                const char __user *, uname,
 250                unsigned int, flags)
 251{
 252        unsigned int *file_seals;
 253        struct file *file;
 254        int fd, error;
 255        char *name;
 256        long len;
 257
 258        if (!(flags & MFD_HUGETLB)) {
 259                if (flags & ~(unsigned int)MFD_ALL_FLAGS)
 260                        return -EINVAL;
 261        } else {
 262                /* Allow huge page size encoding in flags. */
 263                if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
 264                                (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
 265                        return -EINVAL;
 266        }
 267
 268        /* length includes terminating zero */
 269        len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
 270        if (len <= 0)
 271                return -EFAULT;
 272        if (len > MFD_NAME_MAX_LEN + 1)
 273                return -EINVAL;
 274
 275        name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
 276        if (!name)
 277                return -ENOMEM;
 278
 279        strcpy(name, MFD_NAME_PREFIX);
 280        if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
 281                error = -EFAULT;
 282                goto err_name;
 283        }
 284
 285        /* terminating-zero may have changed after strnlen_user() returned */
 286        if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
 287                error = -EFAULT;
 288                goto err_name;
 289        }
 290
 291        fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
 292        if (fd < 0) {
 293                error = fd;
 294                goto err_name;
 295        }
 296
 297        if (flags & MFD_HUGETLB) {
 298                struct user_struct *user = NULL;
 299
 300                file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
 301                                        HUGETLB_ANONHUGE_INODE,
 302                                        (flags >> MFD_HUGE_SHIFT) &
 303                                        MFD_HUGE_MASK);
 304        } else
 305                file = shmem_file_setup(name, 0, VM_NORESERVE);
 306        if (IS_ERR(file)) {
 307                error = PTR_ERR(file);
 308                goto err_fd;
 309        }
 310        file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
 311        file->f_flags |= O_LARGEFILE;
 312
 313        if (flags & MFD_ALLOW_SEALING) {
 314                file_seals = memfd_file_seals_ptr(file);
 315                *file_seals &= ~F_SEAL_SEAL;
 316        }
 317
 318        fd_install(fd, file);
 319        kfree(name);
 320        return fd;
 321
 322err_fd:
 323        put_unused_fd(fd);
 324err_name:
 325        kfree(name);
 326        return error;
 327}
 328