1
2
3
4
5
6
7
8#include <linux/mman.h>
9#include <linux/pagemap.h>
10#include <linux/syscalls.h>
11#include <linux/mempolicy.h>
12#include <linux/page-isolation.h>
13#include <linux/hugetlb.h>
14#include <linux/falloc.h>
15#include <linux/sched.h>
16#include <linux/ksm.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/blkdev.h>
20#include <linux/swap.h>
21#include <linux/swapops.h>
22
23
24
25
26
27
/*
 * Report whether the given madvise behavior needs mmap_sem held for
 * writing.  MADV_REMOVE, MADV_WILLNEED and MADV_DONTNEED yield 0; every
 * other behavior yields 1.  The madvise syscall uses the result to pick
 * between down_read() and down_write().
 */
static int madvise_need_mmap_write(int behavior)
{
	if (behavior == MADV_REMOVE ||
	    behavior == MADV_WILLNEED ||
	    behavior == MADV_DONTNEED)
		return 0;

	/* Anything else is treated as needing the write side of the lock. */
	return 1;
}
40
41
42
43
44
45static long madvise_behavior(struct vm_area_struct * vma,
46 struct vm_area_struct **prev,
47 unsigned long start, unsigned long end, int behavior)
48{
49 struct mm_struct * mm = vma->vm_mm;
50 int error = 0;
51 pgoff_t pgoff;
52 unsigned long new_flags = vma->vm_flags;
53
54 switch (behavior) {
55 case MADV_NORMAL:
56 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
57 break;
58 case MADV_SEQUENTIAL:
59 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
60 break;
61 case MADV_RANDOM:
62 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
63 break;
64 case MADV_DONTFORK:
65 new_flags |= VM_DONTCOPY;
66 break;
67 case MADV_DOFORK:
68 if (vma->vm_flags & VM_IO) {
69 error = -EINVAL;
70 goto out;
71 }
72 new_flags &= ~VM_DONTCOPY;
73 break;
74 case MADV_DONTDUMP:
75 new_flags |= VM_DONTDUMP;
76 break;
77 case MADV_DODUMP:
78 if (new_flags & VM_SPECIAL) {
79 error = -EINVAL;
80 goto out;
81 }
82 new_flags &= ~VM_DONTDUMP;
83 break;
84 case MADV_MERGEABLE:
85 case MADV_UNMERGEABLE:
86 error = ksm_madvise(vma, start, end, behavior, &new_flags);
87 if (error)
88 goto out;
89 break;
90 case MADV_HUGEPAGE:
91 case MADV_NOHUGEPAGE:
92 error = hugepage_madvise(vma, &new_flags, behavior);
93 if (error)
94 goto out;
95 break;
96 }
97
98 if (new_flags == vma->vm_flags) {
99 *prev = vma;
100 goto out;
101 }
102
103 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
104 *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
105 vma->vm_file, pgoff, vma_policy(vma));
106 if (*prev) {
107 vma = *prev;
108 goto success;
109 }
110
111 *prev = vma;
112
113 if (start != vma->vm_start) {
114 error = split_vma(mm, vma, start, 1);
115 if (error)
116 goto out;
117 }
118
119 if (end != vma->vm_end) {
120 error = split_vma(mm, vma, end, 0);
121 if (error)
122 goto out;
123 }
124
125success:
126
127
128
129 vma->vm_flags = new_flags;
130
131out:
132 if (error == -ENOMEM)
133 error = -EAGAIN;
134 return error;
135}
136
137#ifdef CONFIG_SWAP
138static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
139 unsigned long end, struct mm_walk *walk)
140{
141 pte_t *orig_pte;
142 struct vm_area_struct *vma = walk->private;
143 unsigned long index;
144
145 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
146 return 0;
147
148 for (index = start; index != end; index += PAGE_SIZE) {
149 pte_t pte;
150 swp_entry_t entry;
151 struct page *page;
152 spinlock_t *ptl;
153
154 orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
155 pte = *(orig_pte + ((index - start) / PAGE_SIZE));
156 pte_unmap_unlock(orig_pte, ptl);
157
158 if (pte_present(pte) || pte_none(pte) || pte_file(pte))
159 continue;
160 entry = pte_to_swp_entry(pte);
161 if (unlikely(non_swap_entry(entry)))
162 continue;
163
164 page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
165 vma, index);
166 if (page)
167 page_cache_release(page);
168 }
169
170 return 0;
171}
172
173static void force_swapin_readahead(struct vm_area_struct *vma,
174 unsigned long start, unsigned long end)
175{
176 struct mm_walk walk = {
177 .mm = vma->vm_mm,
178 .pmd_entry = swapin_walk_pmd_entry,
179 .private = vma,
180 };
181
182 walk_page_range(start, end, &walk);
183
184 lru_add_drain();
185}
186
187static void force_shm_swapin_readahead(struct vm_area_struct *vma,
188 unsigned long start, unsigned long end,
189 struct address_space *mapping)
190{
191 pgoff_t index;
192 struct page *page;
193 swp_entry_t swap;
194
195 for (; start < end; start += PAGE_SIZE) {
196 index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
197
198 page = find_get_page(mapping, index);
199 if (!radix_tree_exceptional_entry(page)) {
200 if (page)
201 page_cache_release(page);
202 continue;
203 }
204 swap = radix_to_swp_entry(page);
205 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
206 NULL, 0);
207 if (page)
208 page_cache_release(page);
209 }
210
211 lru_add_drain();
212}
213#endif
214
215
216
217
218static long madvise_willneed(struct vm_area_struct * vma,
219 struct vm_area_struct ** prev,
220 unsigned long start, unsigned long end)
221{
222 struct file *file = vma->vm_file;
223
224#ifdef CONFIG_SWAP
225 if (!file || mapping_cap_swap_backed(file->f_mapping)) {
226 *prev = vma;
227 if (!file)
228 force_swapin_readahead(vma, start, end);
229 else
230 force_shm_swapin_readahead(vma, start, end,
231 file->f_mapping);
232 return 0;
233 }
234#endif
235
236 if (!file)
237 return -EBADF;
238
239 if (file->f_mapping->a_ops->get_xip_mem) {
240
241 return 0;
242 }
243
244 *prev = vma;
245 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
246 if (end > vma->vm_end)
247 end = vma->vm_end;
248 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
249
250 force_page_cache_readahead(file->f_mapping, file, start, end - start);
251 return 0;
252}
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273static long madvise_dontneed(struct vm_area_struct * vma,
274 struct vm_area_struct ** prev,
275 unsigned long start, unsigned long end)
276{
277 *prev = vma;
278 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
279 return -EINVAL;
280
281 if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
282 struct zap_details details = {
283 .nonlinear_vma = vma,
284 .last_index = ULONG_MAX,
285 };
286 zap_page_range(vma, start, end - start, &details);
287 } else
288 zap_page_range(vma, start, end - start, NULL);
289 return 0;
290}
291
292
293
294
295
296
297
298
299static long madvise_remove(struct vm_area_struct *vma,
300 struct vm_area_struct **prev,
301 unsigned long start, unsigned long end)
302{
303 loff_t offset;
304 int error;
305 struct file *f;
306
307 *prev = NULL;
308
309 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
310 return -EINVAL;
311
312 f = vma->vm_file;
313
314 if (!f || !f->f_mapping || !f->f_mapping->host) {
315 return -EINVAL;
316 }
317
318 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
319 return -EACCES;
320
321 offset = (loff_t)(start - vma->vm_start)
322 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
323
324
325
326
327
328
329
330 get_file(f);
331 up_read(¤t->mm->mmap_sem);
332 error = do_fallocate(f,
333 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
334 offset, end - start);
335 fput(f);
336 down_read(¤t->mm->mmap_sem);
337 return error;
338}
339
340#ifdef CONFIG_MEMORY_FAILURE
341
342
343
344static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
345{
346 int ret = 0;
347
348 if (!capable(CAP_SYS_ADMIN))
349 return -EPERM;
350 for (; start < end; start += PAGE_SIZE) {
351 struct page *p;
352 int ret = get_user_pages_fast(start, 1, 0, &p);
353 if (ret != 1)
354 return ret;
355 if (bhv == MADV_SOFT_OFFLINE) {
356 printk(KERN_INFO "Soft offlining page %lx at %lx\n",
357 page_to_pfn(p), start);
358 ret = soft_offline_page(p, MF_COUNT_INCREASED);
359 if (ret)
360 break;
361 continue;
362 }
363 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
364 page_to_pfn(p), start);
365
366 memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
367 }
368 return ret;
369}
370#endif
371
372static long
373madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
374 unsigned long start, unsigned long end, int behavior)
375{
376 switch (behavior) {
377 case MADV_REMOVE:
378 return madvise_remove(vma, prev, start, end);
379 case MADV_WILLNEED:
380 return madvise_willneed(vma, prev, start, end);
381 case MADV_DONTNEED:
382 return madvise_dontneed(vma, prev, start, end);
383 default:
384 return madvise_behavior(vma, prev, start, end, behavior);
385 }
386}
387
/*
 * Return 1 when @behavior is an advice value accepted by the madvise
 * syscall's main path, 0 otherwise.  The KSM and transparent-hugepage
 * values are only accepted when the corresponding config option is set.
 */
static int
madvise_behavior_valid(int behavior)
{
	int valid = 0;

	switch (behavior) {
	case MADV_NORMAL:
	case MADV_RANDOM:
	case MADV_SEQUENTIAL:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_REMOVE:
	case MADV_DONTFORK:
	case MADV_DOFORK:
	case MADV_DONTDUMP:
	case MADV_DODUMP:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
		valid = 1;
		break;
	default:
		break;
	}

	return valid;
}
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
460{
461 unsigned long end, tmp;
462 struct vm_area_struct * vma, *prev;
463 int unmapped_error = 0;
464 int error = -EINVAL;
465 int write;
466 size_t len;
467 struct blk_plug plug;
468
469#ifdef CONFIG_MEMORY_FAILURE
470 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
471 return madvise_hwpoison(behavior, start, start+len_in);
472#endif
473 if (!madvise_behavior_valid(behavior))
474 return error;
475
476 if (start & ~PAGE_MASK)
477 return error;
478 len = (len_in + ~PAGE_MASK) & PAGE_MASK;
479
480
481 if (len_in && !len)
482 return error;
483
484 end = start + len;
485 if (end < start)
486 return error;
487
488 error = 0;
489 if (end == start)
490 return error;
491
492 write = madvise_need_mmap_write(behavior);
493 if (write)
494 down_write(¤t->mm->mmap_sem);
495 else
496 down_read(¤t->mm->mmap_sem);
497
498
499
500
501
502
503 vma = find_vma_prev(current->mm, start, &prev);
504 if (vma && start > vma->vm_start)
505 prev = vma;
506
507 blk_start_plug(&plug);
508 for (;;) {
509
510 error = -ENOMEM;
511 if (!vma)
512 goto out;
513
514
515 if (start < vma->vm_start) {
516 unmapped_error = -ENOMEM;
517 start = vma->vm_start;
518 if (start >= end)
519 goto out;
520 }
521
522
523 tmp = vma->vm_end;
524 if (end < tmp)
525 tmp = end;
526
527
528 error = madvise_vma(vma, &prev, start, tmp, behavior);
529 if (error)
530 goto out;
531 start = tmp;
532 if (prev && start < prev->vm_end)
533 start = prev->vm_end;
534 error = unmapped_error;
535 if (start >= end)
536 goto out;
537 if (prev)
538 vma = prev->vm_next;
539 else
540 vma = find_vma(current->mm, start);
541 }
542out:
543 blk_finish_plug(&plug);
544 if (write)
545 up_write(¤t->mm->mmap_sem);
546 else
547 up_read(¤t->mm->mmap_sem);
548
549 return error;
550}
551