linux/mm/memfd.c
/*
 * memfd_create system call and file sealing support
 *
 * Code was originally included in shmem.c, and broken out to facilitate
 * use by hugetlbfs as well as tmpfs.
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/khugepaged.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
#include <uapi/linux/memfd.h>

/*
 * We need a tag: a new tag would expand every xa_node by 8 bytes,
 * so reuse a tag which we firmly believe is never set or cleared on tmpfs
 * or hugetlbfs because they are memory only filesystems.
 */
#define MEMFD_TAG_PINNED        PAGECACHE_TAG_TOWRITE
#define LAST_SCAN               4       /* about 150ms max */

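/*
 * Walk the page cache and tag (with MEMFD_TAG_PINNED) every page whose
 * reference count exceeds what the page cache and its mappings account
 * for, i.e. pages with extra pins such as those taken via
 * get_user_pages().  The xarray lock is dropped periodically so that
 * long scans do not hog the CPU.
 */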
static void memfd_tag_pins(struct xa_state *xas)
{
        struct page *page;
        int latency = 0;
        int cache_count;

        lru_add_drain();

        xas_lock_irq(xas);
        xas_for_each(xas, page, ULONG_MAX) {
                cache_count = 1;
                if (!xa_is_value(page) &&
                    PageTransHuge(page) && !PageHuge(page))
                        cache_count = HPAGE_PMD_NR;

                if (!xa_is_value(page) &&
                    page_count(page) - total_mapcount(page) != cache_count)
                        xas_set_mark(xas, MEMFD_TAG_PINNED);
                if (cache_count != 1)
                        xas_set(xas, page->index + cache_count);

                latency += cache_count;
                if (latency < XA_CHECK_SCHED)
                        continue;
                latency = 0;

                xas_pause(xas);
                xas_unlock_irq(xas);
                cond_resched();
                xas_lock_irq(xas);
        }
        xas_unlock_irq(xas);
}

/*
 * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
 * via get_user_pages(), drivers might have some pending I/O without any active
 * user-space mappings (e.g., direct-IO, AIO). Therefore, we look at all pages
 * and see whether they have an elevated ref-count. If so, we tag them and wait
 * for the references to be dropped.
 * The caller must guarantee that no new user will acquire writable references
 * to those pages to avoid races.
 */
static int memfd_wait_for_pins(struct address_space *mapping)
{
        XA_STATE(xas, &mapping->i_pages, 0);
        struct page *page;
        int error, scan;

        memfd_tag_pins(&xas);

        error = 0;
        for (scan = 0; scan <= LAST_SCAN; scan++) {
                int latency = 0;
                int cache_count;

                if (!xas_marked(&xas, MEMFD_TAG_PINNED))
                        break;

                if (!scan)
                        lru_add_drain_all();
                else if (schedule_timeout_killable((HZ << scan) / 200))
                        scan = LAST_SCAN;

                xas_set(&xas, 0);
                xas_lock_irq(&xas);
                xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
                        bool clear = true;

                        cache_count = 1;
                        if (!xa_is_value(page) &&
                            PageTransHuge(page) && !PageHuge(page))
                                cache_count = HPAGE_PMD_NR;

                        if (!xa_is_value(page) && cache_count !=
                            page_count(page) - total_mapcount(page)) {
                                /*
                                 * On the last scan, we clean up all those tags
                                 * we inserted; but make a note that we still
                                 * found pages pinned.
                                 */
                                if (scan == LAST_SCAN)
                                        error = -EBUSY;
                                else
                                        clear = false;
                        }
                        if (clear)
                                xas_clear_mark(&xas, MEMFD_TAG_PINNED);

                        latency += cache_count;
                        if (latency < XA_CHECK_SCHED)
                                continue;
                        latency = 0;

                        xas_pause(&xas);
                        xas_unlock_irq(&xas);
                        cond_resched();
                        xas_lock_irq(&xas);
                }
                xas_unlock_irq(&xas);
        }

        return error;
}

static unsigned int *memfd_file_seals_ptr(struct file *file)
{
        if (shmem_file(file))
                return &SHMEM_I(file_inode(file))->seals;

#ifdef CONFIG_HUGETLBFS
        if (is_file_hugepages(file))
                return &HUGETLBFS_I(file_inode(file))->seals;
#endif

        return NULL;
}

#define F_ALL_SEALS (F_SEAL_SEAL | \
                     F_SEAL_SHRINK | \
                     F_SEAL_GROW | \
                     F_SEAL_WRITE | \
                     F_SEAL_FUTURE_WRITE)

static int memfd_add_seals(struct file *file, unsigned int seals)
{
        struct inode *inode = file_inode(file);
        unsigned int *file_seals;
        int error;

        /*
         * SEALING
         * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
         * but restrict access to a specific subset of file operations. Seals
         * can only be added, but never removed. This way, mutually untrusted
         * parties can share common memory regions with a well-defined policy.
         * A malicious peer can thus never perform unwanted operations on a
         * shared object.
         *
         * Seals are only supported on special tmpfs or hugetlbfs files and
         * always affect the whole underlying inode. Once a seal is set, it
         * may prevent some kinds of access to the file. Currently, the
         * following seals are defined:
         *   SEAL_SEAL: Prevent further seals from being set on this file
         *   SEAL_SHRINK: Prevent the file from shrinking
         *   SEAL_GROW: Prevent the file from growing
         *   SEAL_WRITE: Prevent write access to the file
         *
         * As we don't require any trust relationship between two parties, we
         * must prevent seals from being removed. Therefore, sealing a file
         * only adds a given set of seals to the file, it never touches
         * existing seals. Furthermore, the "setting seals"-operation can be
         * sealed itself, which basically prevents any further seal from being
         * added.
         *
         * Semantics of sealing are only defined on volatile files. Only
         * anonymous tmpfs and hugetlbfs files support sealing. More
         * importantly, seals are never written to disk. Therefore, there's
         * no plan to support it on other file types.
         */

        if (!(file->f_mode & FMODE_WRITE))
                return -EPERM;
        if (seals & ~(unsigned int)F_ALL_SEALS)
                return -EINVAL;

        inode_lock(inode);

        file_seals = memfd_file_seals_ptr(file);
        if (!file_seals) {
                error = -EINVAL;
                goto unlock;
        }

        if (*file_seals & F_SEAL_SEAL) {
                error = -EPERM;
                goto unlock;
        }

        if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
                error = mapping_deny_writable(file->f_mapping);
                if (error)
                        goto unlock;

                error = memfd_wait_for_pins(file->f_mapping);
                if (error) {
                        mapping_allow_writable(file->f_mapping);
                        goto unlock;
                }
        }

        *file_seals |= seals;
        error = 0;

unlock:
        inode_unlock(inode);
        return error;
}
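
/*
 * A minimal user-space sketch (not part of this kernel file) of how the
 * sealing path above is typically exercised through fcntl().  It assumes a
 * memfd created with MFD_ALLOW_SEALING and an open, writable descriptor;
 * the helper name seal_readonly() is illustrative only.
 *
 *      #define _GNU_SOURCE
 *      #include <fcntl.h>       // F_ADD_SEALS, F_GET_SEALS, F_SEAL_*
 *      #include <stdio.h>
 *
 *      static int seal_readonly(int fd)
 *      {
 *              int seals;
 *
 *              // Add seals; memfd_add_seals() above returns -EPERM once
 *              // F_SEAL_SEAL is in place or the fd lacks write access, and
 *              // -EBUSY while writable mappings or page pins remain.
 *              if (fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW |
 *                                         F_SEAL_WRITE | F_SEAL_SEAL) == -1) {
 *                      perror("F_ADD_SEALS");
 *                      return -1;
 *              }
 *
 *              // Read back the seal set; this maps to memfd_get_seals().
 *              seals = fcntl(fd, F_GET_SEALS);
 *              if (seals == -1) {
 *                      perror("F_GET_SEALS");
 *                      return -1;
 *              }
 *              printf("seals now: %#x\n", seals);
 *              return 0;
 *      }
 */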

static int memfd_get_seals(struct file *file)
{
        unsigned int *seals = memfd_file_seals_ptr(file);

        return seals ? *seals : -EINVAL;
}

long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
        long error;

        switch (cmd) {
        case F_ADD_SEALS:
                /* disallow upper 32bit */
                if (arg > UINT_MAX)
                        return -EINVAL;

                error = memfd_add_seals(file, arg);
                break;
        case F_GET_SEALS:
                error = memfd_get_seals(file);
                break;
        default:
                error = -EINVAL;
                break;
        }

        return error;
}

#define MFD_NAME_PREFIX "memfd:"
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)

#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)

SYSCALL_DEFINE2(memfd_create,
                const char __user *, uname,
                unsigned int, flags)
{
        unsigned int *file_seals;
        struct file *file;
        int fd, error;
        char *name;
        long len;

        if (!(flags & MFD_HUGETLB)) {
                if (flags & ~(unsigned int)MFD_ALL_FLAGS)
                        return -EINVAL;
        } else {
                /* Allow huge page size encoding in flags. */
                if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
                                (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
                        return -EINVAL;
        }

        /* length includes terminating zero */
        len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
        if (len <= 0)
                return -EFAULT;
        if (len > MFD_NAME_MAX_LEN + 1)
                return -EINVAL;

        name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
        if (!name)
                return -ENOMEM;

        strcpy(name, MFD_NAME_PREFIX);
        if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
                error = -EFAULT;
                goto err_name;
        }

        /* terminating-zero may have changed after strnlen_user() returned */
        if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
                error = -EFAULT;
                goto err_name;
        }

        fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
        if (fd < 0) {
                error = fd;
                goto err_name;
        }

        if (flags & MFD_HUGETLB) {
                file = hugetlb_file_setup(name, 0, VM_NORESERVE,
                                        HUGETLB_ANONHUGE_INODE,
                                        (flags >> MFD_HUGE_SHIFT) &
                                        MFD_HUGE_MASK);
        } else
                file = shmem_file_setup(name, 0, VM_NORESERVE);
        if (IS_ERR(file)) {
                error = PTR_ERR(file);
                goto err_fd;
        }
        file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
        file->f_flags |= O_LARGEFILE;

        if (flags & MFD_ALLOW_SEALING) {
                file_seals = memfd_file_seals_ptr(file);
                *file_seals &= ~F_SEAL_SEAL;
        }

        fd_install(fd, file);
        kfree(name);
        return fd;

err_fd:
        put_unused_fd(fd);
err_name:
        kfree(name);
        return error;
}
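
/*
 * A minimal user-space sketch (not part of this kernel file) of the
 * memfd_create() entry point above: the kernel prefixes the supplied name
 * with "memfd:" (visible through /proc/self/fd/<fd>), honors MFD_CLOEXEC by
 * allocating the descriptor with O_CLOEXEC, and leaves F_SEAL_SEAL set
 * unless MFD_ALLOW_SEALING is requested.  The glibc (>= 2.27) wrapper
 * memfd_create() is assumed here.
 *
 *      #define _GNU_SOURCE
 *      #include <sys/mman.h>    // memfd_create(), MFD_* flags
 *      #include <string.h>
 *      #include <stdio.h>
 *      #include <stdlib.h>
 *      #include <unistd.h>
 *
 *      int main(void)
 *      {
 *              static const char msg[] = "hello memfd";
 *              void *addr;
 *              int fd;
 *
 *              fd = memfd_create("example", MFD_CLOEXEC | MFD_ALLOW_SEALING);
 *              if (fd == -1) {
 *                      perror("memfd_create");
 *                      exit(EXIT_FAILURE);
 *              }
 *
 *              // The anonymous file starts at size zero; grow it first.
 *              if (ftruncate(fd, sizeof(msg)) == -1) {
 *                      perror("ftruncate");
 *                      exit(EXIT_FAILURE);
 *              }
 *
 *              addr = mmap(NULL, sizeof(msg), PROT_READ | PROT_WRITE,
 *                          MAP_SHARED, fd, 0);
 *              if (addr == MAP_FAILED) {
 *                      perror("mmap");
 *                      exit(EXIT_FAILURE);
 *              }
 *              memcpy(addr, msg, sizeof(msg));
 *
 *              // Once the writable mapping is gone, the fd can be handed to
 *              // another process and sealed with fcntl(F_ADD_SEALS) as
 *              // sketched after memfd_add_seals() above.
 *              munmap(addr, sizeof(msg));
 *              close(fd);
 *              return 0;
 *      }
 */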