/*
 *	linux/mm/mlock.c
 *
 *  (C) Copyright 1995 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 */

#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>

#include "internal.h"

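/*
 * Return 1 if the caller may mlock memory: either the RLIMIT_MEMLOCK
 * limit is non-zero or the task has CAP_IPC_LOCK.
 */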
int can_do_mlock(void)
{
	if (rlimit(RLIMIT_MEMLOCK) != 0)
		return 1;
	if (capable(CAP_IPC_LOCK))
		return 1;
	return 0;
}
EXPORT_SYMBOL(can_do_mlock);

/*
 * Mlocked pages are marked with PageMlocked() flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
 * The unevictable list is an LRU sibling list to the [in]active lists.
 * PageUnevictable is set to indicate the unevictable state.
 *
 * When lazy mlocking via vmscan, it is important to ensure that the
 * vma's VM_LOCKED status is not concurrently being modified, otherwise we
 * may have mlocked a page that is being munlocked.  So lazy mlock must take
 * the mmap_sem for read, and verify that the vma really is locked
 * (see mm/rmap.c).
 */

/*
 *  LRU accounting for clear_page_mlock()
 */
void clear_page_mlock(struct page *page)
{
	if (!TestClearPageMlocked(page))
		return;

	mod_zone_page_state(page_zone(page), NR_MLOCK,
			    -hpage_nr_pages(page));
	count_vm_event(UNEVICTABLE_PGCLEARED);
	if (!isolate_lru_page(page)) {
		putback_lru_page(page);
	} else {
		/*
		 * We lost the race. the page already moved to evictable list.
		 */
		if (PageUnevictable(page))
			count_vm_event(UNEVICTABLE_PGSTRANDED);
	}
}

/*
 * Mark page as mlocked if not already.
 * If page on LRU list, drain the LRU, then move it to unevictable list.
 */
void mlock_vma_page(struct page *page)
{
	/* Serialize with page migration */
	BUG_ON(!PageLocked(page));

	if (!TestSetPageMlocked(page)) {
		mod_zone_page_state(page_zone(page), NR_MLOCK,
				    hpage_nr_pages(page));
		count_vm_event(UNEVICTABLE_PGMLOCKED);
		if (!isolate_lru_page(page))
			putback_lru_page(page);
	}
}

/*
 * Isolate a page from LRU with optional get_page() pin.
 * Assumes lru_lock already held and page already pinned.
 */
static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
{
	if (PageLRU(page)) {
		struct lruvec *lruvec;

		lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
		if (getpage)
			get_page(page);
		ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, page_lru(page));
		return true;
	}

	return false;
}

/*
 * Finish munlock after successful page isolation
 *
 * Page must be locked. This is a wrapper for try_to_munlock()
 * and putback_lru_page() with munlock accounting.
 */
static void __munlock_isolated_page(struct page *page)
{
	int ret = SWAP_AGAIN;

	/*
	 * Optimization: if the page was mapped just once, that's our mapping
	 * and we don't need to check all the other vmas.
	 */
	if (page_mapcount(page) > 1)
		ret = try_to_munlock(page);

	/* Did try_to_unlock() succeed or punt? */
	if (ret != SWAP_MLOCK)
		count_vm_event(UNEVICTABLE_PGMUNLOCKED);

	putback_lru_page(page);
}

/*
 * Accounting for page isolation fail during munlock
 *
 * Performs accounting when the isolation of a mlocked page has failed. There
 * is nothing else to do because it means some other task has already removed
 * the page from the LRU. putback_lru_page() will take care of removing the
 * page from the unevictable list, if necessary. vmscan [page_referenced()]
 * will move the page back to the unevictable list if some other vma has it
 * mlocked.
 */
static void __munlock_isolation_failed(struct page *page)
{
	if (PageUnevictable(page))
		__count_vm_event(UNEVICTABLE_PGSTRANDED);
	else
		__count_vm_event(UNEVICTABLE_PGMUNLOCKED);
}

/**
 * munlock_vma_page - munlock a vma page
 * @page - page to be unlocked, either a normal page or THP page head
 *
 * returns the size of the page as a page mask (0 for normal page,
 *         HPAGE_PMD_NR - 1 for THP head page)
 *
 * called from munlock()/munmap() path with page supposedly on the LRU.
 * When we munlock a page, because the vma where we found the page is being
 * munlock()ed or munmap()ed, we want to check whether other vmas hold the
 * page locked so that we can leave it on the unevictable lru list and not
 * bother vmscan with it.  However, to walk the page's rmap list in
 * try_to_munlock() we must isolate the page from the LRU.  If some other
 * task has removed the page from the LRU, we won't be able to do that.
 * So we clear the PageMlocked as we might not get another chance.  If we
 * can't isolate the page, we leave it for putback_lru_page() and vmscan
 * [page_referenced()/try_to_unmap()] to deal with.
 */
unsigned int munlock_vma_page(struct page *page)
{
	int nr_pages;
	struct zone *zone = page_zone(page);

	/* For try_to_munlock() and to serialize with page migration */
	BUG_ON(!PageLocked(page));

	/*
	 * Hold the zone lru_lock to stabilize the page's LRU state and
	 * hpage_nr_pages() while we clear PageMlocked and update the
	 * NR_MLOCK statistics.
	 */
	spin_lock_irq(&zone->lru_lock);

	nr_pages = hpage_nr_pages(page);
	if (!TestClearPageMlocked(page))
		goto unlock_out;

	__mod_zone_page_state(zone, NR_MLOCK, -nr_pages);

	if (__munlock_isolate_lru_page(page, true)) {
		spin_unlock_irq(&zone->lru_lock);
		__munlock_isolated_page(page);
		goto out;
	}
	__munlock_isolation_failed(page);

unlock_out:
	spin_unlock_irq(&zone->lru_lock);

out:
	return nr_pages - 1;
}

/*
 * convert get_user_pages() return value to posix mlock() error
 */
static int __mlock_posix_error_return(long retval)
{
	if (retval == -EFAULT)
		retval = -ENOMEM;
	else if (retval == -ENOMEM)
		retval = -EAGAIN;
	return retval;
}

/*
 * Prepare page for fast batched LRU putback via __putback_lru_fast()
 *
 * The fast path is available only for evictable pages with single mapping.
 * Then we can bypass the per-cpu pvec and get better performance.
 * when mapcount > 1 we need try_to_munlock() which can fail.
 * when !page_evictable(), we need the full redo logic of putback_lru_page to
 * avoid leaving evictable page in unevictable list.
 *
 * In case of success, @page is added to @pvec and @pgrescued is incremented
 * in case that the page was previously unevictable. @page is also unlocked.
 */
static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
		int *pgrescued)
{
	VM_BUG_ON(PageLRU(page));
	VM_BUG_ON(!PageLocked(page));

	if (page_mapcount(page) <= 1 && page_evictable(page)) {
		pagevec_add(pvec, page);
		if (TestClearPageUnevictable(page))
			(*pgrescued)++;
		unlock_page(page);
		return true;
	}

	return false;
}

/*
 * Putback multiple evictable pages to the LRU
 *
 * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of
 * the pages might have meanwhile become unevictable but that is OK.
 */
static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
{
	count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec));
	/*
	 *__pagevec_lru_add() calls release_pages() so we don't call
	 * put_page() explicitly
	 */
	__pagevec_lru_add(pvec);
	count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
}

/*
 * Munlock a batch of pages from the same zone
 *
 * The work is split to two main phases. First phase clears the Mlocked flag
 * and attempts to isolate the pages, all under a single zone lru lock.
 * The second phase finishes the munlock only for pages where isolation
 * succeeded.
 *
 * Note that the pagevec may be modified during the process.
 */
static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
{
	int i;
	int nr = pagevec_count(pvec);
	int delta_munlocked = -nr;
	struct pagevec pvec_putback;
	int pgrescued = 0;

	pagevec_init(&pvec_putback, 0);

	/* Phase 1: page isolation */
	spin_lock_irq(&zone->lru_lock);
	for (i = 0; i < nr; i++) {
		struct page *page = pvec->pages[i];

		if (TestClearPageMlocked(page)) {
			/*
			 * We already have pin from follow_page_mask()
			 * so we can spare the get_page() here.
			 */
			if (__munlock_isolate_lru_page(page, false))
				continue;
			else
				__munlock_isolation_failed(page);
		} else {
			delta_munlocked++;
		}

		/*
		 * We won't be munlocking this page in the next phase
		 * but we still need to release the follow_page_mask()
		 * pin. We cannot do it under lru_lock however. If it's
		 * the last pin, __page_cache_release() would deadlock.
		 */
		pagevec_add(&pvec_putback, pvec->pages[i]);
		pvec->pages[i] = NULL;
	}
	__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
	spin_unlock_irq(&zone->lru_lock);

	/* Now we can release pins of pages that we are not munlocking */
	pagevec_release(&pvec_putback);

	/* Phase 2: page munlock */
	for (i = 0; i < nr; i++) {
		struct page *page = pvec->pages[i];

		if (page) {
			lock_page(page);
			if (!__putback_lru_fast_prepare(page, &pvec_putback,
					&pgrescued)) {
				/*
				 * Slow path. We don't want to lose the last
				 * pin before unlock_page()
				 */
				get_page(page); /* for putback_lru_page() */
				__munlock_isolated_page(page);
				unlock_page(page);
				put_page(page); /* from follow_page_mask() */
			}
		}
	}

	/*
	 * Phase 3: page putback for pages that qualified for the fast path
	 * This will also call put_page() to return pin from follow_page_mask()
	 */
	if (pagevec_count(&pvec_putback))
		__putback_lru_fast(&pvec_putback, pgrescued);
}

/*
 * Fill up pagevec for __munlock_pagevec using pte walk
 *
 * The function expects that the struct page corresponding to @start address is
 * a non-THP page already pinned and in the @pvec, and that it belongs to @zone.
 *
 * The rest of @pvec is filled by subsequent pages within the same pmd and same
 * zone, as long as the pte's are present and vm_normal_page() succeeds. These
 * pages also get pinned.
 *
 * Returns the address of the next page that should be scanned. This equals
 * @start + PAGE_SIZE when no page could be added by the pte walk.
 */
static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
		struct vm_area_struct *vma, int zoneid, unsigned long start,
		unsigned long end)
{
	pte_t *pte;
	spinlock_t *ptl;

	/*
	 * Initialize pte walk starting at the already pinned page where we
	 * are sure that there is a pte, as it was pinned under the same
	 * mmap_sem write op.
	 */
	pte = get_locked_pte(vma->vm_mm, start, &ptl);
	/* Make sure we do not cross the page table boundary */
	end = pgd_addr_end(start, end);
	end = pud_addr_end(start, end);
	end = pmd_addr_end(start, end);

	/* The page next to the pinned page is the first we will try to get */
	start += PAGE_SIZE;
	while (start < end) {
		struct page *page = NULL;
		pte++;
		if (pte_present(*pte))
			page = vm_normal_page(vma, start, *pte);
		/*
		 * Break if page could not be obtained or the page's
		 * node+zone does not match
		 */
		if (!page || page_zone_id(page) != zoneid)
			break;

		get_page(page);
		/*
		 * Increase the address that will be returned *before* the
		 * eventual break due to pvec becoming full by adding the page
		 */
		start += PAGE_SIZE;
		if (pagevec_add(pvec, page) == 0)
			break;
	}
	pte_unmap_unlock(pte, ptl);
	return start;
}

/*
 * munlock_vma_pages_range() - munlock all pages in the vma range
 * @vma - vma containing range to be munlock()ed.
 * @start - start address in @vma of the range
 * @end - end of range in @vma.
 *
 *  For mremap(), munmap() and exit().
 *
 * Called with @vma VM_LOCKED.
 *
 * Returns with VM_LOCKED cleared.  Callers must be prepared to
 * deal with this.
 *
 * We don't save and restore VM_LOCKED here because pages are
 * still on lru.  In unmap path, pages might be scanned by reclaim
 * and re-mlocked by try_to_{munlock|unmap} before we unmap and
 * free them.  This will result in freeing mlocked pages.
 */
void munlock_vma_pages_range(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	vma->vm_flags &= VM_LOCKED_CLEAR_MASK;

	while (start < end) {
		struct page *page = NULL;
		unsigned int page_mask;
		unsigned long page_increm;
		struct pagevec pvec;
		struct zone *zone;
		int zoneid;

		pagevec_init(&pvec, 0);
		/*
		 * Although FOLL_DUMP is intended for get_dump_page(),
		 * it just so happens that its special treatment of the
		 * ZERO_PAGE (returning an error instead of doing get_page)
		 * suits munlock very well (and if somehow an abnormal page
		 * has sneaked into the range, we won't oops here: great).
		 */
		page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
				&page_mask);

		if (page && !IS_ERR(page)) {
			if (PageTransHuge(page)) {
				lock_page(page);
				/*
				 * Any THP page found by follow_page_mask() may
				 * have gotten split before reaching
				 * munlock_vma_page(), so we need to recompute
				 * the page_mask here.
				 */
				page_mask = munlock_vma_page(page);
				unlock_page(page);
				put_page(page); /* follow_page_mask() */
			} else {
				/*
				 * Non-huge pages are handled in batches via
				 * pagevec. The pin from follow_page_mask()
				 * prevents them from collapsing by THP.
				 */
				pagevec_add(&pvec, page);
				zone = page_zone(page);
				zoneid = page_zone_id(page);

				/*
				 * Try to fill the rest of pagevec using fast
				 * pte walk. This will also update start to
				 * the next page to process. Then munlock the
				 * pagevec.
				 */
				start = __munlock_pagevec_fill(&pvec, vma,
						zoneid, start, end);
				__munlock_pagevec(&pvec, zone);
				goto next;
			}
		}

		VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
		page_increm = 1 + page_mask;
		start += page_increm * PAGE_SIZE;
next:
		cond_resched();
	}
}

/*
 * mlock_fixup  - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op.  For all other vmas, merge or split as needed,
 * update mm->locked_vm, and either set the new flags (mlock) or munlock
 * the pages in the range.
 */
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
	unsigned long start, unsigned long end, vm_flags_t newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	pgoff_t pgoff;
	int nr_pages;
	int ret = 0;
	int lock = !!(newflags & VM_LOCKED);
	vm_flags_t old_flags = vma->vm_flags;

	if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
	    vma_is_dax(vma))
		/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
		goto out;

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	if (start != vma->vm_start) {
		ret = split_vma(mm, vma, start, 1);
		if (ret)
			goto out;
	}

	if (end != vma->vm_end) {
		ret = split_vma(mm, vma, end, 0);
		if (ret)
			goto out;
	}

success:
	/*
	 * Keep track of amount of locked VM.
	 */
	nr_pages = (end - start) >> PAGE_SHIFT;
	if (!lock)
		nr_pages = -nr_pages;
	else if (old_flags & VM_LOCKED)
		nr_pages = 0;
	mm->locked_vm += nr_pages;

	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 * It's okay if try_to_unmap_one unmaps a page just after we
	 * set VM_LOCKED, populate_vma_page_range will bring it back.
	 */
	if (lock)
		vma->vm_flags = newflags;
	else
		munlock_vma_pages_range(vma, start, end);

out:
	*prev = vma;
	return ret;
}

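/*
 * Walk all vmas covering [start, start + len) and apply the VM_LOCKED /
 * VM_LOCKONFAULT bits in @flags to each of them via mlock_fixup().
 * Returns -ENOMEM if the range is not fully covered by vmas.
 */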
static int apply_vma_lock_flags(unsigned long start, size_t len,
				vm_flags_t flags)
{
	unsigned long nstart, end, tmp;
	struct vm_area_struct * vma, * prev;
	int error;

	VM_BUG_ON(offset_in_page(start));
	VM_BUG_ON(len != PAGE_ALIGN(len));
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	vma = find_vma(current->mm, start);
	if (!vma || vma->vm_start > start)
		return -ENOMEM;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;

		newflags |= flags;

		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			break;
		nstart = tmp;
		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			break;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			break;
		}
	}
	return error;
}

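/*
 * Common implementation of mlock(2) and mlock2(2): check RLIMIT_MEMLOCK,
 * set the requested lock flags on the vmas covering the range, then fault
 * the pages in with __mm_populate().
 */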
static int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
	unsigned long locked;
	unsigned long lock_limit;
	int error = -ENOMEM;

	if (!can_do_mlock())
		return -EPERM;

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;
	locked = len >> PAGE_SHIFT;

	down_write(&current->mm->mmap_sem);

	locked += current->mm->locked_vm;

	/* check against resource limits */
	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
		error = apply_vma_lock_flags(start, len, flags);

	up_write(&current->mm->mmap_sem);
	if (error)
		return error;

	error = __mm_populate(start, len, 0);
	if (error)
		return __mlock_posix_error_return(error);
	return 0;
}

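/*
 * mlock(2): lock pages in [start, start + len) into memory and populate
 * them immediately.
 */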
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
	return do_mlock(start, len, VM_LOCKED);
}

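/*
 * mlock2(2): like mlock(2), but MLOCK_ONFAULT requests that pages be
 * locked only as they are faulted in (VM_LOCKONFAULT).
 */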
SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
{
	vm_flags_t vm_flags = VM_LOCKED;

	if (flags & ~MLOCK_ONFAULT)
		return -EINVAL;

	if (flags & MLOCK_ONFAULT)
		vm_flags |= VM_LOCKONFAULT;

	return do_mlock(start, len, vm_flags);
}

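/*
 * munlock(2): clear the lock flags on the vmas covering the range, which
 * also munlocks the pages themselves.
 */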
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
	int ret;

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	down_write(&current->mm->mmap_sem);
	ret = apply_vma_lock_flags(start, len, 0);
	up_write(&current->mm->mmap_sem);

	return ret;
}

/*
 * Take the MCL_* flags passed into the mlockall syscall, and convert them
 * into vma flags for each vma in the current process's address space.
 *
 * MCL_FUTURE is recorded in mm->def_flags so that it also applies to
 * mappings created later.  If MCL_CURRENT is set (alone or together with
 * MCL_FUTURE), the flags are applied to all existing vmas; MCL_ONFAULT
 * additionally sets VM_LOCKONFAULT.  Passing flags == 0 (munlockall)
 * clears the lock bits everywhere.
 */
static int apply_mlockall_flags(int flags)
{
	struct vm_area_struct * vma, * prev = NULL;
	vm_flags_t to_add = 0;

	current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
	if (flags & MCL_FUTURE) {
		current->mm->def_flags |= VM_LOCKED;

		if (flags & MCL_ONFAULT)
			current->mm->def_flags |= VM_LOCKONFAULT;

		if (!(flags & MCL_CURRENT))
			goto out;
	}

	if (flags & MCL_CURRENT) {
		to_add |= VM_LOCKED;
		if (flags & MCL_ONFAULT)
			to_add |= VM_LOCKONFAULT;
	}

	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
		vm_flags_t newflags;

		newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
		newflags |= to_add;

		/* Ignore errors */
		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
		cond_resched();
	}
out:
	return 0;
}

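/*
 * mlockall(2): lock all current and/or future mappings of the process,
 * subject to RLIMIT_MEMLOCK, then populate the existing ones if
 * MCL_CURRENT was requested.
 */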
SYSCALL_DEFINE1(mlockall, int, flags)
{
	unsigned long lock_limit;
	int ret;

	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
	    flags == MCL_ONFAULT)
		return -EINVAL;

	if (!can_do_mlock())
		return -EPERM;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;

	ret = -ENOMEM;
	down_write(&current->mm->mmap_sem);

	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
	    capable(CAP_IPC_LOCK))
		ret = apply_mlockall_flags(flags);
	up_write(&current->mm->mmap_sem);
	if (!ret && (flags & MCL_CURRENT))
		mm_populate(0, TASK_SIZE);

	return ret;
}

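/*
 * munlockall(2): clear the lock flags on every vma and on mm->def_flags.
 */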
SYSCALL_DEFINE0(munlockall)
{
	int ret;

	down_write(&current->mm->mmap_sem);
	ret = apply_mlockall_flags(0);
	up_write(&current->mm->mmap_sem);
	return ret;
}

/*
 * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the user_struct instead.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);

int user_shm_lock(size_t size, struct user_struct *user)
{
	unsigned long lock_limit, locked;
	int allowed = 0;

	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	lock_limit = rlimit(RLIMIT_MEMLOCK);
	if (lock_limit == RLIM_INFINITY)
		allowed = 1;
	lock_limit >>= PAGE_SHIFT;
	spin_lock(&shmlock_user_lock);
	if (!allowed &&
	    locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
		goto out;
	get_uid(user);
	user->locked_shm += locked;
	allowed = 1;
out:
	spin_unlock(&shmlock_user_lock);
	return allowed;
}

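/*
 * Undo the user_shm_lock() accounting and drop the uid reference taken
 * there.
 */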
void user_shm_unlock(size_t size, struct user_struct *user)
{
	spin_lock(&shmlock_user_lock);
	user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	spin_unlock(&shmlock_user_lock);
	free_uid(user);
}