1
2
3
4
5
6
7
8
9
10#include <linux/fs.h>
11#include <linux/vfs.h>
12#include <linux/pagemap.h>
13#include <linux/file.h>
14#include <linux/mm.h>
15#include <linux/sched/signal.h>
16#include <linux/khugepaged.h>
17#include <linux/syscalls.h>
18#include <linux/hugetlb.h>
19#include <linux/shmem_fs.h>
20#include <linux/memfd.h>
21#include <uapi/linux/memfd.h>
22
23
24
25
26
27
28#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
29#define LAST_SCAN 4
30
/*
 * Tag every page in the mapping that appears to have extra references
 * (i.e. may be pinned by get_user_pages()/direct-IO) with
 * MEMFD_TAG_PINNED, so that memfd_wait_for_pins() can re-check just
 * those entries. Called with no locks held; takes xas lock internally.
 */
static void memfd_tag_pins(struct xa_state *xas)
{
	struct page *page;
	unsigned int tagged = 0;

	/* Drain per-CPU pagevecs so transient LRU references don't
	 * inflate page_count() and cause spurious tagging. */
	lru_add_drain();

	xas_lock_irq(xas);
	xas_for_each(xas, page, ULONG_MAX) {
		if (xa_is_value(page))
			continue;	/* swap entry / shadow, not a page */
		/* refcount minus mapcount > 1 means someone besides the
		 * page cache and user mappings holds a reference. */
		if (page_count(page) - page_mapcount(page) > 1)
			xas_set_mark(xas, MEMFD_TAG_PINNED);

		if (++tagged % XA_CHECK_SCHED)
			continue;

		/* Every XA_CHECK_SCHED entries, drop the irq-disabled lock
		 * and reschedule to bound latency; xas_pause() makes the
		 * walk restartable after the lock is retaken. */
		xas_pause(xas);
		xas_unlock_irq(xas);
		cond_resched();
		xas_lock_irq(xas);
	}
	xas_unlock_irq(xas);
}
55
56
57
58
59
60
61
62
63
64
/*
 * Setting SEAL_WRITE requires us to verify there's no pending writer.
 * However, via get_user_pages(), drivers might have some pending I/O
 * without any active user-space mappings (e.g. direct-IO, AIO).
 * Therefore, we look at all pages and check whether they have an
 * additional reference beyond their mappings. We cannot wait forever
 * for a page to become unpinned, so we make several scans with
 * increasing backoff and give up with -EBUSY after LAST_SCAN attempts.
 *
 * Returns 0 when no pinned pages remain, -EBUSY otherwise.
 */
static int memfd_wait_for_pins(struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, 0);
	struct page *page;
	int error, scan;

	memfd_tag_pins(&xas);

	error = 0;
	for (scan = 0; scan <= LAST_SCAN; scan++) {
		unsigned int tagged = 0;

		/* Nothing tagged as pinned any more: we are done. */
		if (!xas_marked(&xas, MEMFD_TAG_PINNED))
			break;

		if (!scan)
			lru_add_drain_all();
		else if (schedule_timeout_killable((HZ << scan) / 200))
			scan = LAST_SCAN;	/* killed: force final scan */

		xas_set(&xas, 0);
		xas_lock_irq(&xas);
		xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
			bool clear = true;
			if (xa_is_value(page))
				continue;
			if (page_count(page) - page_mapcount(page) != 1) {
				/*
				 * On the last scan, we clean up all those
				 * tags we inserted; but make a note that
				 * we still found pages pinned.
				 */
				if (scan == LAST_SCAN)
					error = -EBUSY;
				else
					clear = false;
			}
			if (clear)
				xas_clear_mark(&xas, MEMFD_TAG_PINNED);
			if (++tagged % XA_CHECK_SCHED)
				continue;

			/* Periodically drop the irq-off lock, as in
			 * memfd_tag_pins(), to bound latency. */
			xas_pause(&xas);
			xas_unlock_irq(&xas);
			cond_resched();
			xas_lock_irq(&xas);
		}
		xas_unlock_irq(&xas);
	}

	return error;
}
117
118static unsigned int *memfd_file_seals_ptr(struct file *file)
119{
120 if (shmem_file(file))
121 return &SHMEM_I(file_inode(file))->seals;
122
123#ifdef CONFIG_HUGETLBFS
124 if (is_file_hugepages(file))
125 return &HUGETLBFS_I(file_inode(file))->seals;
126#endif
127
128 return NULL;
129}
130
131#define F_ALL_SEALS (F_SEAL_SEAL | \
132 F_SEAL_SHRINK | \
133 F_SEAL_GROW | \
134 F_SEAL_WRITE | \
135 F_SEAL_FUTURE_WRITE)
136
static int memfd_add_seals(struct file *file, unsigned int seals)
{
	struct inode *inode = file_inode(file);
	unsigned int *file_seals;
	int error;

	/*
	 * SEALING
	 * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
	 * but restrict access to a specific subset of file operations. Seals
	 * can only be added, but never removed. This way, mutually untrusted
	 * parties can share common memory regions with a well-defined policy.
	 * A malicious peer can thus never perform unwanted operations on a
	 * shared object.
	 *
	 * Sealing is only supported on special shmem or hugetlbfs files and
	 * can affect multiple params like size, permissions and the like.
	 * We currently support the following seals:
	 *   SEAL_SEAL:   prevent further seals from being set on this file
	 *   SEAL_SHRINK: prevent the file from shrinking
	 *   SEAL_GROW:   prevent the file from growing
	 *   SEAL_WRITE:  prevent write access to the file
	 *   SEAL_FUTURE_WRITE: prevent new write mappings of the file
	 *
	 * As we don't require any trust relationship between two parties, we
	 * must prevent seals from being removed. Therefore, sealing a file
	 * only adds a given set of seals to the file, it never touches
	 * existing seals. Furthermore, the "sealing" operation itself can be
	 * sealed with SEAL_SEAL, too.
	 *
	 * Semantics of sealing are only defined on volatile files. Only
	 * anonymous shmem or hugetlbfs files support sealing. More
	 * importantly, seals are never written to disk. Therefore, there's
	 * no plan to support it on other file types.
	 */

	/* Sealing requires write access so we can refuse further writers. */
	if (!(file->f_mode & FMODE_WRITE))
		return -EPERM;
	if (seals & ~(unsigned int)F_ALL_SEALS)
		return -EINVAL;

	inode_lock(inode);

	file_seals = memfd_file_seals_ptr(file);
	if (!file_seals) {
		error = -EINVAL;	/* file type does not support sealing */
		goto unlock;
	}

	if (*file_seals & F_SEAL_SEAL) {
		error = -EPERM;		/* further sealing has been sealed off */
		goto unlock;
	}

	if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
		/* Block new writable mappings, then verify there are no
		 * outstanding writers or pinned pages (e.g. AIO/GUP).
		 * Roll back if pins cannot be drained. */
		error = mapping_deny_writable(file->f_mapping);
		if (error)
			goto unlock;

		error = memfd_wait_for_pins(file->f_mapping);
		if (error) {
			mapping_allow_writable(file->f_mapping);
			goto unlock;
		}
	}

	*file_seals |= seals;
	error = 0;

unlock:
	inode_unlock(inode);
	return error;
}
211
212static int memfd_get_seals(struct file *file)
213{
214 unsigned int *seals = memfd_file_seals_ptr(file);
215
216 return seals ? *seals : -EINVAL;
217}
218
219long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
220{
221 long error;
222
223 switch (cmd) {
224 case F_ADD_SEALS:
225
226 if (arg > UINT_MAX)
227 return -EINVAL;
228
229 error = memfd_add_seals(file, arg);
230 break;
231 case F_GET_SEALS:
232 error = memfd_get_seals(file);
233 break;
234 default:
235 error = -EINVAL;
236 break;
237 }
238
239 return error;
240}
241
242#define MFD_NAME_PREFIX "memfd:"
243#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
244#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
245
246#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
247
248SYSCALL_DEFINE2(memfd_create,
249 const char __user *, uname,
250 unsigned int, flags)
251{
252 unsigned int *file_seals;
253 struct file *file;
254 int fd, error;
255 char *name;
256 long len;
257
258 if (!(flags & MFD_HUGETLB)) {
259 if (flags & ~(unsigned int)MFD_ALL_FLAGS)
260 return -EINVAL;
261 } else {
262
263 if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
264 (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
265 return -EINVAL;
266 }
267
268
269 len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
270 if (len <= 0)
271 return -EFAULT;
272 if (len > MFD_NAME_MAX_LEN + 1)
273 return -EINVAL;
274
275 name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
276 if (!name)
277 return -ENOMEM;
278
279 strcpy(name, MFD_NAME_PREFIX);
280 if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
281 error = -EFAULT;
282 goto err_name;
283 }
284
285
286 if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
287 error = -EFAULT;
288 goto err_name;
289 }
290
291 fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
292 if (fd < 0) {
293 error = fd;
294 goto err_name;
295 }
296
297 if (flags & MFD_HUGETLB) {
298 struct user_struct *user = NULL;
299
300 file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
301 HUGETLB_ANONHUGE_INODE,
302 (flags >> MFD_HUGE_SHIFT) &
303 MFD_HUGE_MASK);
304 } else
305 file = shmem_file_setup(name, 0, VM_NORESERVE);
306 if (IS_ERR(file)) {
307 error = PTR_ERR(file);
308 goto err_fd;
309 }
310 file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
311 file->f_flags |= O_LARGEFILE;
312
313 if (flags & MFD_ALLOW_SEALING) {
314 file_seals = memfd_file_seals_ptr(file);
315 *file_seals &= ~F_SEAL_SEAL;
316 }
317
318 fd_install(fd, file);
319 kfree(name);
320 return fd;
321
322err_fd:
323 put_unused_fd(fd);
324err_name:
325 kfree(name);
326 return error;
327}
328