1
2
3
4
5
6
7
8
9
10#include <linux/fs.h>
11#include <linux/vfs.h>
12#include <linux/pagemap.h>
13#include <linux/file.h>
14#include <linux/mm.h>
15#include <linux/sched/signal.h>
16#include <linux/khugepaged.h>
17#include <linux/syscalls.h>
18#include <linux/hugetlb.h>
19#include <linux/shmem_fs.h>
20#include <linux/memfd.h>
21#include <uapi/linux/memfd.h>
22
23
24
25
26
27
/*
 * Number of the final pass in memfd_wait_for_pins(); passes are numbered
 * 0..LAST_SCAN inclusive.
 */
#define LAST_SCAN 4

/*
 * XArray mark used to remember entries that looked pinned. It aliases the
 * existing PAGECACHE_TAG_TOWRITE mark rather than introducing a new one.
 * NOTE(review): presumably safe because nothing else sets TOWRITE on these
 * mappings - confirm against the filesystems that support sealing.
 */
#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
30
31static void memfd_tag_pins(struct xa_state *xas)
32{
33 struct page *page;
34 int latency = 0;
35 int cache_count;
36
37 lru_add_drain();
38
39 xas_lock_irq(xas);
40 xas_for_each(xas, page, ULONG_MAX) {
41 cache_count = 1;
42 if (!xa_is_value(page) &&
43 PageTransHuge(page) && !PageHuge(page))
44 cache_count = HPAGE_PMD_NR;
45
46 if (!xa_is_value(page) &&
47 page_count(page) - total_mapcount(page) != cache_count)
48 xas_set_mark(xas, MEMFD_TAG_PINNED);
49 if (cache_count != 1)
50 xas_set(xas, page->index + cache_count);
51
52 latency += cache_count;
53 if (latency < XA_CHECK_SCHED)
54 continue;
55 latency = 0;
56
57 xas_pause(xas);
58 xas_unlock_irq(xas);
59 cond_resched();
60 xas_lock_irq(xas);
61 }
62 xas_unlock_irq(xas);
63}
64
65
66
67
68
69
70
71
72
73
74static int memfd_wait_for_pins(struct address_space *mapping)
75{
76 XA_STATE(xas, &mapping->i_pages, 0);
77 struct page *page;
78 int error, scan;
79
80 memfd_tag_pins(&xas);
81
82 error = 0;
83 for (scan = 0; scan <= LAST_SCAN; scan++) {
84 int latency = 0;
85 int cache_count;
86
87 if (!xas_marked(&xas, MEMFD_TAG_PINNED))
88 break;
89
90 if (!scan)
91 lru_add_drain_all();
92 else if (schedule_timeout_killable((HZ << scan) / 200))
93 scan = LAST_SCAN;
94
95 xas_set(&xas, 0);
96 xas_lock_irq(&xas);
97 xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
98 bool clear = true;
99
100 cache_count = 1;
101 if (!xa_is_value(page) &&
102 PageTransHuge(page) && !PageHuge(page))
103 cache_count = HPAGE_PMD_NR;
104
105 if (!xa_is_value(page) && cache_count !=
106 page_count(page) - total_mapcount(page)) {
107
108
109
110
111
112 if (scan == LAST_SCAN)
113 error = -EBUSY;
114 else
115 clear = false;
116 }
117 if (clear)
118 xas_clear_mark(&xas, MEMFD_TAG_PINNED);
119
120 latency += cache_count;
121 if (latency < XA_CHECK_SCHED)
122 continue;
123 latency = 0;
124
125 xas_pause(&xas);
126 xas_unlock_irq(&xas);
127 cond_resched();
128 xas_lock_irq(&xas);
129 }
130 xas_unlock_irq(&xas);
131 }
132
133 return error;
134}
135
136static unsigned int *memfd_file_seals_ptr(struct file *file)
137{
138 if (shmem_file(file))
139 return &SHMEM_I(file_inode(file))->seals;
140
141#ifdef CONFIG_HUGETLBFS
142 if (is_file_hugepages(file))
143 return &HUGETLBFS_I(file_inode(file))->seals;
144#endif
145
146 return NULL;
147}
148
/* Every seal bit that memfd_add_seals() accepts; anything else is -EINVAL. */
#define F_ALL_SEALS	(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | \
			 F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)
154
155static int memfd_add_seals(struct file *file, unsigned int seals)
156{
157 struct inode *inode = file_inode(file);
158 unsigned int *file_seals;
159 int error;
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192 if (!(file->f_mode & FMODE_WRITE))
193 return -EPERM;
194 if (seals & ~(unsigned int)F_ALL_SEALS)
195 return -EINVAL;
196
197 inode_lock(inode);
198
199 file_seals = memfd_file_seals_ptr(file);
200 if (!file_seals) {
201 error = -EINVAL;
202 goto unlock;
203 }
204
205 if (*file_seals & F_SEAL_SEAL) {
206 error = -EPERM;
207 goto unlock;
208 }
209
210 if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
211 error = mapping_deny_writable(file->f_mapping);
212 if (error)
213 goto unlock;
214
215 error = memfd_wait_for_pins(file->f_mapping);
216 if (error) {
217 mapping_allow_writable(file->f_mapping);
218 goto unlock;
219 }
220 }
221
222 *file_seals |= seals;
223 error = 0;
224
225unlock:
226 inode_unlock(inode);
227 return error;
228}
229
230static int memfd_get_seals(struct file *file)
231{
232 unsigned int *seals = memfd_file_seals_ptr(file);
233
234 return seals ? *seals : -EINVAL;
235}
236
237long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
238{
239 long error;
240
241 switch (cmd) {
242 case F_ADD_SEALS:
243
244 if (arg > UINT_MAX)
245 return -EINVAL;
246
247 error = memfd_add_seals(file, arg);
248 break;
249 case F_GET_SEALS:
250 error = memfd_get_seals(file);
251 break;
252 default:
253 error = -EINVAL;
254 break;
255 }
256
257 return error;
258}
259
/* Every memfd name is stored with this prefix in front of the user string. */
#define MFD_NAME_PREFIX "memfd:"
/* Length of the prefix, not counting its terminating NUL. */
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
/* Longest user-supplied name that still fits in NAME_MAX with the prefix. */
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)

/* Flags accepted by memfd_create() regardless of MFD_HUGETLB. */
#define MFD_ALL_FLAGS	(MFD_CLOEXEC | \
			 MFD_ALLOW_SEALING | \
			 MFD_HUGETLB)
265
266SYSCALL_DEFINE2(memfd_create,
267 const char __user *, uname,
268 unsigned int, flags)
269{
270 unsigned int *file_seals;
271 struct file *file;
272 int fd, error;
273 char *name;
274 long len;
275
276 if (!(flags & MFD_HUGETLB)) {
277 if (flags & ~(unsigned int)MFD_ALL_FLAGS)
278 return -EINVAL;
279 } else {
280
281 if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
282 (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
283 return -EINVAL;
284 }
285
286
287 len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
288 if (len <= 0)
289 return -EFAULT;
290 if (len > MFD_NAME_MAX_LEN + 1)
291 return -EINVAL;
292
293 name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
294 if (!name)
295 return -ENOMEM;
296
297 strcpy(name, MFD_NAME_PREFIX);
298 if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
299 error = -EFAULT;
300 goto err_name;
301 }
302
303
304 if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
305 error = -EFAULT;
306 goto err_name;
307 }
308
309 fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
310 if (fd < 0) {
311 error = fd;
312 goto err_name;
313 }
314
315 if (flags & MFD_HUGETLB) {
316 file = hugetlb_file_setup(name, 0, VM_NORESERVE,
317 HUGETLB_ANONHUGE_INODE,
318 (flags >> MFD_HUGE_SHIFT) &
319 MFD_HUGE_MASK);
320 } else
321 file = shmem_file_setup(name, 0, VM_NORESERVE);
322 if (IS_ERR(file)) {
323 error = PTR_ERR(file);
324 goto err_fd;
325 }
326 file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
327 file->f_flags |= O_LARGEFILE;
328
329 if (flags & MFD_ALLOW_SEALING) {
330 file_seals = memfd_file_seals_ptr(file);
331 *file_seals &= ~F_SEAL_SEAL;
332 }
333
334 fd_install(fd, file);
335 kfree(name);
336 return fd;
337
338err_fd:
339 put_unused_fd(fd);
340err_name:
341 kfree(name);
342 return error;
343}
344