1
2
3
4
5
6
7
8
9
10#include <linux/fs.h>
11#include <linux/vfs.h>
12#include <linux/pagemap.h>
13#include <linux/file.h>
14#include <linux/mm.h>
15#include <linux/sched/signal.h>
16#include <linux/khugepaged.h>
17#include <linux/syscalls.h>
18#include <linux/hugetlb.h>
19#include <linux/shmem_fs.h>
20#include <linux/memfd.h>
21#include <uapi/linux/memfd.h>
22
23
24
25
26
27
/*
 * XArray mark used in this file to flag page-cache entries that looked
 * pinned.  It aliases the existing PAGECACHE_TAG_TOWRITE mark instead of
 * introducing a new one.
 */
#define MEMFD_TAG_PINNED	PAGECACHE_TAG_TOWRITE

/* Index of the final pass of the pin-wait loop; passes run 0..LAST_SCAN. */
#define LAST_SCAN		4
30
31static void memfd_tag_pins(struct xa_state *xas)
32{
33 struct page *page;
34 unsigned int tagged = 0;
35
36 lru_add_drain();
37
38 xas_lock_irq(xas);
39 xas_for_each(xas, page, ULONG_MAX) {
40 if (xa_is_value(page))
41 continue;
42 page = find_subpage(page, xas->xa_index);
43 if (page_count(page) - page_mapcount(page) > 1)
44 xas_set_mark(xas, MEMFD_TAG_PINNED);
45
46 if (++tagged % XA_CHECK_SCHED)
47 continue;
48
49 xas_pause(xas);
50 xas_unlock_irq(xas);
51 cond_resched();
52 xas_lock_irq(xas);
53 }
54 xas_unlock_irq(xas);
55}
56
57
58
59
60
61
62
63
64
65
66static int memfd_wait_for_pins(struct address_space *mapping)
67{
68 XA_STATE(xas, &mapping->i_pages, 0);
69 struct page *page;
70 int error, scan;
71
72 memfd_tag_pins(&xas);
73
74 error = 0;
75 for (scan = 0; scan <= LAST_SCAN; scan++) {
76 unsigned int tagged = 0;
77
78 if (!xas_marked(&xas, MEMFD_TAG_PINNED))
79 break;
80
81 if (!scan)
82 lru_add_drain_all();
83 else if (schedule_timeout_killable((HZ << scan) / 200))
84 scan = LAST_SCAN;
85
86 xas_set(&xas, 0);
87 xas_lock_irq(&xas);
88 xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
89 bool clear = true;
90 if (xa_is_value(page))
91 continue;
92 page = find_subpage(page, xas.xa_index);
93 if (page_count(page) - page_mapcount(page) != 1) {
94
95
96
97
98
99 if (scan == LAST_SCAN)
100 error = -EBUSY;
101 else
102 clear = false;
103 }
104 if (clear)
105 xas_clear_mark(&xas, MEMFD_TAG_PINNED);
106 if (++tagged % XA_CHECK_SCHED)
107 continue;
108
109 xas_pause(&xas);
110 xas_unlock_irq(&xas);
111 cond_resched();
112 xas_lock_irq(&xas);
113 }
114 xas_unlock_irq(&xas);
115 }
116
117 return error;
118}
119
120static unsigned int *memfd_file_seals_ptr(struct file *file)
121{
122 if (shmem_file(file))
123 return &SHMEM_I(file_inode(file))->seals;
124
125#ifdef CONFIG_HUGETLBFS
126 if (is_file_hugepages(file))
127 return &HUGETLBFS_I(file_inode(file))->seals;
128#endif
129
130 return NULL;
131}
132
/* Every seal bit accepted by memfd_add_seals(); other bits yield -EINVAL. */
#define F_ALL_SEALS \
	(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
137
138static int memfd_add_seals(struct file *file, unsigned int seals)
139{
140 struct inode *inode = file_inode(file);
141 unsigned int *file_seals;
142 int error;
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175 if (!(file->f_mode & FMODE_WRITE))
176 return -EPERM;
177 if (seals & ~(unsigned int)F_ALL_SEALS)
178 return -EINVAL;
179
180 inode_lock(inode);
181
182 file_seals = memfd_file_seals_ptr(file);
183 if (!file_seals) {
184 error = -EINVAL;
185 goto unlock;
186 }
187
188 if (*file_seals & F_SEAL_SEAL) {
189 error = -EPERM;
190 goto unlock;
191 }
192
193 if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
194 error = mapping_deny_writable(file->f_mapping);
195 if (error)
196 goto unlock;
197
198 error = memfd_wait_for_pins(file->f_mapping);
199 if (error) {
200 mapping_allow_writable(file->f_mapping);
201 goto unlock;
202 }
203 }
204
205 *file_seals |= seals;
206 error = 0;
207
208unlock:
209 inode_unlock(inode);
210 return error;
211}
212
213static int memfd_get_seals(struct file *file)
214{
215 unsigned int *seals = memfd_file_seals_ptr(file);
216
217 return seals ? *seals : -EINVAL;
218}
219
220long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
221{
222 long error;
223
224 switch (cmd) {
225 case F_ADD_SEALS:
226
227 if (arg > UINT_MAX)
228 return -EINVAL;
229
230 error = memfd_add_seals(file, arg);
231 break;
232 case F_GET_SEALS:
233 error = memfd_get_seals(file);
234 break;
235 default:
236 error = -EINVAL;
237 break;
238 }
239
240 return error;
241}
242
/* Prefix placed in front of the user-supplied name of every memfd file. */
#define MFD_NAME_PREFIX		"memfd:"
/* Length of that prefix, not counting its terminating NUL. */
#define MFD_NAME_PREFIX_LEN	(sizeof(MFD_NAME_PREFIX) - 1)
/* Longest user-supplied name such that prefix plus name fits in NAME_MAX. */
#define MFD_NAME_MAX_LEN	(NAME_MAX - MFD_NAME_PREFIX_LEN)

/* Flag bits memfd_create() accepts, not counting the huge page size field. */
#define MFD_ALL_FLAGS		(MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
248
249SYSCALL_DEFINE2(memfd_create,
250 const char __user *, uname,
251 unsigned int, flags)
252{
253 unsigned int *file_seals;
254 struct file *file;
255 int fd, error;
256 char *name;
257 long len;
258
259 if (!(flags & MFD_HUGETLB)) {
260 if (flags & ~(unsigned int)MFD_ALL_FLAGS)
261 return -EINVAL;
262 } else {
263
264 if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
265 (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
266 return -EINVAL;
267 }
268
269
270 len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
271 if (len <= 0)
272 return -EFAULT;
273 if (len > MFD_NAME_MAX_LEN + 1)
274 return -EINVAL;
275
276 name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
277 if (!name)
278 return -ENOMEM;
279
280 strcpy(name, MFD_NAME_PREFIX);
281 if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
282 error = -EFAULT;
283 goto err_name;
284 }
285
286
287 if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
288 error = -EFAULT;
289 goto err_name;
290 }
291
292 fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
293 if (fd < 0) {
294 error = fd;
295 goto err_name;
296 }
297
298 if (flags & MFD_HUGETLB) {
299 struct user_struct *user = NULL;
300
301 file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
302 HUGETLB_ANONHUGE_INODE,
303 (flags >> MFD_HUGE_SHIFT) &
304 MFD_HUGE_MASK);
305 } else
306 file = shmem_file_setup(name, 0, VM_NORESERVE);
307 if (IS_ERR(file)) {
308 error = PTR_ERR(file);
309 goto err_fd;
310 }
311 file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
312 file->f_flags |= O_LARGEFILE;
313
314 if (flags & MFD_ALLOW_SEALING) {
315 file_seals = memfd_file_seals_ptr(file);
316 *file_seals &= ~F_SEAL_SEAL;
317 }
318
319 fd_install(fd, file);
320 kfree(name);
321 return fd;
322
323err_fd:
324 put_unused_fd(fd);
325err_name:
326 kfree(name);
327 return error;
328}
329