1
2
3
4
5
6
7
8#include <linux/mman.h>
9#include <linux/pagemap.h>
10#include <linux/syscalls.h>
11#include <linux/mempolicy.h>
12#include <linux/page-isolation.h>
13#include <linux/hugetlb.h>
14#include <linux/falloc.h>
15#include <linux/sched.h>
16#include <linux/ksm.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/blkdev.h>
20#include <linux/swap.h>
21#include <linux/swapops.h>
22
23
24
25
26
27
/*
 * Tell the caller whether @behavior requires mmap_sem to be taken for
 * writing.
 *
 * MADV_REMOVE, MADV_WILLNEED and MADV_DONTNEED yield 0; every other
 * behavior value yields 1.
 */
static int madvise_need_mmap_write(int behavior)
{
	if (behavior == MADV_REMOVE ||
	    behavior == MADV_WILLNEED ||
	    behavior == MADV_DONTNEED)
		return 0;

	/* Anything else is treated as needing the write side. */
	return 1;
}
40
41
42
43
44
45static long madvise_behavior(struct vm_area_struct *vma,
46 struct vm_area_struct **prev,
47 unsigned long start, unsigned long end, int behavior)
48{
49 struct mm_struct *mm = vma->vm_mm;
50 int error = 0;
51 pgoff_t pgoff;
52 unsigned long new_flags = vma->vm_flags;
53
54 switch (behavior) {
55 case MADV_NORMAL:
56 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
57 break;
58 case MADV_SEQUENTIAL:
59 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
60 break;
61 case MADV_RANDOM:
62 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
63 break;
64 case MADV_DONTFORK:
65 new_flags |= VM_DONTCOPY;
66 break;
67 case MADV_DOFORK:
68 if (vma->vm_flags & VM_IO) {
69 error = -EINVAL;
70 goto out;
71 }
72 new_flags &= ~VM_DONTCOPY;
73 break;
74 case MADV_DONTDUMP:
75 new_flags |= VM_DONTDUMP;
76 break;
77 case MADV_DODUMP:
78 if (new_flags & VM_SPECIAL) {
79 error = -EINVAL;
80 goto out;
81 }
82 new_flags &= ~VM_DONTDUMP;
83 break;
84 case MADV_MERGEABLE:
85 case MADV_UNMERGEABLE:
86 error = ksm_madvise(vma, start, end, behavior, &new_flags);
87 if (error)
88 goto out;
89 break;
90 case MADV_HUGEPAGE:
91 case MADV_NOHUGEPAGE:
92 error = hugepage_madvise(vma, &new_flags, behavior);
93 if (error)
94 goto out;
95 break;
96 }
97
98 if (new_flags == vma->vm_flags) {
99 *prev = vma;
100 goto out;
101 }
102
103 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
104 *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
105 vma->vm_file, pgoff, vma_policy(vma));
106 if (*prev) {
107 vma = *prev;
108 goto success;
109 }
110
111 *prev = vma;
112
113 if (start != vma->vm_start) {
114 error = split_vma(mm, vma, start, 1);
115 if (error)
116 goto out;
117 }
118
119 if (end != vma->vm_end) {
120 error = split_vma(mm, vma, end, 0);
121 if (error)
122 goto out;
123 }
124
125success:
126
127
128
129 vma->vm_flags = new_flags;
130
131out:
132 if (error == -ENOMEM)
133 error = -EAGAIN;
134 return error;
135}
136
137#ifdef CONFIG_SWAP
138static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
139 unsigned long end, struct mm_walk *walk)
140{
141 pte_t *orig_pte;
142 struct vm_area_struct *vma = walk->private;
143 unsigned long index;
144
145 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
146 return 0;
147
148 for (index = start; index != end; index += PAGE_SIZE) {
149 pte_t pte;
150 swp_entry_t entry;
151 struct page *page;
152 spinlock_t *ptl;
153
154 orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
155 pte = *(orig_pte + ((index - start) / PAGE_SIZE));
156 pte_unmap_unlock(orig_pte, ptl);
157
158 if (pte_present(pte) || pte_none(pte) || pte_file(pte))
159 continue;
160 entry = pte_to_swp_entry(pte);
161 if (unlikely(non_swap_entry(entry)))
162 continue;
163
164 page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
165 vma, index);
166 if (page)
167 page_cache_release(page);
168 }
169
170 return 0;
171}
172
173static void force_swapin_readahead(struct vm_area_struct *vma,
174 unsigned long start, unsigned long end)
175{
176 struct mm_walk walk = {
177 .mm = vma->vm_mm,
178 .pmd_entry = swapin_walk_pmd_entry,
179 .private = vma,
180 };
181
182 walk_page_range(start, end, &walk);
183
184 lru_add_drain();
185}
186
187static void force_shm_swapin_readahead(struct vm_area_struct *vma,
188 unsigned long start, unsigned long end,
189 struct address_space *mapping)
190{
191 pgoff_t index;
192 struct page *page;
193 swp_entry_t swap;
194
195 for (; start < end; start += PAGE_SIZE) {
196 index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
197
198 page = find_get_entry(mapping, index);
199 if (!radix_tree_exceptional_entry(page)) {
200 if (page)
201 page_cache_release(page);
202 continue;
203 }
204 swap = radix_to_swp_entry(page);
205 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
206 NULL, 0);
207 if (page)
208 page_cache_release(page);
209 }
210
211 lru_add_drain();
212}
213#endif
214
215
216
217
218static long madvise_willneed(struct vm_area_struct *vma,
219 struct vm_area_struct **prev,
220 unsigned long start, unsigned long end)
221{
222 struct file *file = vma->vm_file;
223
224#ifdef CONFIG_SWAP
225 if (!file || mapping_cap_swap_backed(file->f_mapping)) {
226 *prev = vma;
227 if (!file)
228 force_swapin_readahead(vma, start, end);
229 else
230 force_shm_swapin_readahead(vma, start, end,
231 file->f_mapping);
232 return 0;
233 }
234#endif
235
236 if (!file)
237 return -EBADF;
238
239 if (file->f_mapping->a_ops->get_xip_mem) {
240
241 return 0;
242 }
243
244 *prev = vma;
245 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
246 if (end > vma->vm_end)
247 end = vma->vm_end;
248 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
249
250 force_page_cache_readahead(file->f_mapping, file, start, end - start);
251 return 0;
252}
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273static long madvise_dontneed(struct vm_area_struct *vma,
274 struct vm_area_struct **prev,
275 unsigned long start, unsigned long end)
276{
277 *prev = vma;
278 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
279 return -EINVAL;
280
281 if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
282 struct zap_details details = {
283 .nonlinear_vma = vma,
284 .last_index = ULONG_MAX,
285 };
286 zap_page_range(vma, start, end - start, &details);
287 } else
288 zap_page_range(vma, start, end - start, NULL);
289 return 0;
290}
291
292
293
294
295
296
297
298
299static long madvise_remove(struct vm_area_struct *vma,
300 struct vm_area_struct **prev,
301 unsigned long start, unsigned long end)
302{
303 loff_t offset;
304 int error;
305 struct file *f;
306
307 *prev = NULL;
308
309 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
310 return -EINVAL;
311
312 f = vma->vm_file;
313
314 if (!f || !f->f_mapping || !f->f_mapping->host) {
315 return -EINVAL;
316 }
317
318 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
319 return -EACCES;
320
321 offset = (loff_t)(start - vma->vm_start)
322 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
323
324
325
326
327
328
329
330 get_file(f);
331 up_read(¤t->mm->mmap_sem);
332 error = do_fallocate(f,
333 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
334 offset, end - start);
335 fput(f);
336 down_read(¤t->mm->mmap_sem);
337 return error;
338}
339
340#ifdef CONFIG_MEMORY_FAILURE
341
342
343
344static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
345{
346 struct page *p;
347 if (!capable(CAP_SYS_ADMIN))
348 return -EPERM;
349 for (; start < end; start += PAGE_SIZE <<
350 compound_order(compound_head(p))) {
351 int ret;
352
353 ret = get_user_pages_fast(start, 1, 0, &p);
354 if (ret != 1)
355 return ret;
356
357 if (PageHWPoison(p)) {
358 put_page(p);
359 continue;
360 }
361 if (bhv == MADV_SOFT_OFFLINE) {
362 pr_info("Soft offlining page %#lx at %#lx\n",
363 page_to_pfn(p), start);
364 ret = soft_offline_page(p, MF_COUNT_INCREASED);
365 if (ret)
366 return ret;
367 continue;
368 }
369 pr_info("Injecting memory failure for page %#lx at %#lx\n",
370 page_to_pfn(p), start);
371
372 memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
373 }
374 return 0;
375}
376#endif
377
378static long
379madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
380 unsigned long start, unsigned long end, int behavior)
381{
382 switch (behavior) {
383 case MADV_REMOVE:
384 return madvise_remove(vma, prev, start, end);
385 case MADV_WILLNEED:
386 return madvise_willneed(vma, prev, start, end);
387 case MADV_DONTNEED:
388 return madvise_dontneed(vma, prev, start, end);
389 default:
390 return madvise_behavior(vma, prev, start, end, behavior);
391 }
392}
393
/*
 * Return 1 if @behavior is one of the advice values handled by the
 * madvise() vma loop, 0 otherwise.
 *
 * MADV_MERGEABLE/MADV_UNMERGEABLE are only accepted with CONFIG_KSM, and
 * MADV_HUGEPAGE/MADV_NOHUGEPAGE only with CONFIG_TRANSPARENT_HUGEPAGE.
 */
static int
madvise_behavior_valid(int behavior)
{
	int valid;

	switch (behavior) {
	case MADV_NORMAL:
	case MADV_RANDOM:
	case MADV_SEQUENTIAL:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_REMOVE:
	case MADV_DONTFORK:
	case MADV_DOFORK:
	case MADV_DONTDUMP:
	case MADV_DODUMP:
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
		valid = 1;
		break;
	default:
		valid = 0;
		break;
	}

	return valid;
}
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
466{
467 unsigned long end, tmp;
468 struct vm_area_struct *vma, *prev;
469 int unmapped_error = 0;
470 int error = -EINVAL;
471 int write;
472 size_t len;
473 struct blk_plug plug;
474
475#ifdef CONFIG_MEMORY_FAILURE
476 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
477 return madvise_hwpoison(behavior, start, start+len_in);
478#endif
479 if (!madvise_behavior_valid(behavior))
480 return error;
481
482 if (start & ~PAGE_MASK)
483 return error;
484 len = (len_in + ~PAGE_MASK) & PAGE_MASK;
485
486
487 if (len_in && !len)
488 return error;
489
490 end = start + len;
491 if (end < start)
492 return error;
493
494 error = 0;
495 if (end == start)
496 return error;
497
498 write = madvise_need_mmap_write(behavior);
499 if (write)
500 down_write(¤t->mm->mmap_sem);
501 else
502 down_read(¤t->mm->mmap_sem);
503
504
505
506
507
508
509 vma = find_vma_prev(current->mm, start, &prev);
510 if (vma && start > vma->vm_start)
511 prev = vma;
512
513 blk_start_plug(&plug);
514 for (;;) {
515
516 error = -ENOMEM;
517 if (!vma)
518 goto out;
519
520
521 if (start < vma->vm_start) {
522 unmapped_error = -ENOMEM;
523 start = vma->vm_start;
524 if (start >= end)
525 goto out;
526 }
527
528
529 tmp = vma->vm_end;
530 if (end < tmp)
531 tmp = end;
532
533
534 error = madvise_vma(vma, &prev, start, tmp, behavior);
535 if (error)
536 goto out;
537 start = tmp;
538 if (prev && start < prev->vm_end)
539 start = prev->vm_end;
540 error = unmapped_error;
541 if (start >= end)
542 goto out;
543 if (prev)
544 vma = prev->vm_next;
545 else
546 vma = find_vma(current->mm, start);
547 }
548out:
549 blk_finish_plug(&plug);
550 if (write)
551 up_write(¤t->mm->mmap_sem);
552 else
553 up_read(¤t->mm->mmap_sem);
554
555 return error;
556}
557