1
2
3
4
5
6
7
8#include <linux/mm.h>
9#include <linux/hugetlb.h>
10#include <linux/mman.h>
11#include <linux/slab.h>
12#include <linux/kernel_stat.h>
13#include <linux/swap.h>
14#include <linux/vmalloc.h>
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/shmem_fs.h>
18#include <linux/blkdev.h>
19#include <linux/random.h>
20#include <linux/writeback.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/init.h>
24#include <linux/ksm.h>
25#include <linux/rmap.h>
26#include <linux/security.h>
27#include <linux/backing-dev.h>
28#include <linux/mutex.h>
29#include <linux/capability.h>
30#include <linux/syscalls.h>
31#include <linux/memcontrol.h>
32#include <linux/poll.h>
33#include <linux/oom.h>
34#include <linux/frontswap.h>
35#include <linux/swapfile.h>
36#include <linux/export.h>
37
38#include <asm/pgtable.h>
39#include <asm/tlbflush.h>
40#include <linux/swapops.h>
41#include <linux/page_cgroup.h>
42
43static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
44 unsigned char);
45static void free_swap_count_continuations(struct swap_info_struct *);
46static sector_t map_swap_entry(swp_entry_t, struct block_device**);
47
48DEFINE_SPINLOCK(swap_lock);
49static unsigned int nr_swapfiles;
50atomic_long_t nr_swap_pages;
51
52long total_swap_pages;
53static int least_priority;
54static atomic_t highest_priority_index = ATOMIC_INIT(-1);
55
56static const char Bad_file[] = "Bad swap file entry ";
57static const char Unused_file[] = "Unused swap file entry ";
58static const char Bad_offset[] = "Bad swap offset entry ";
59static const char Unused_offset[] = "Unused swap offset entry ";
60
61struct swap_list_t swap_list = {-1, -1};
62
63struct swap_info_struct *swap_info[MAX_SWAPFILES];
64
65static DEFINE_MUTEX(swapon_mutex);
66
67static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
68
69static atomic_t proc_poll_event = ATOMIC_INIT(0);
70
71static inline unsigned char swap_count(unsigned char ent)
72{
73 return ent & ~SWAP_HAS_CACHE;
74}
75
76
77static int
78__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
79{
80 swp_entry_t entry = swp_entry(si->type, offset);
81 struct page *page;
82 int ret = 0;
83
84 page = find_get_page(swap_address_space(entry), entry.val);
85 if (!page)
86 return 0;
87
88
89
90
91
92
93
94 if (trylock_page(page)) {
95 ret = try_to_free_swap(page);
96 unlock_page(page);
97 }
98 page_cache_release(page);
99 return ret;
100}
101
102
103
104
105
106static int discard_swap(struct swap_info_struct *si)
107{
108 struct swap_extent *se;
109 sector_t start_block;
110 sector_t nr_blocks;
111 int err = 0;
112
113
114 se = &si->first_swap_extent;
115 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
116 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
117 if (nr_blocks) {
118 err = blkdev_issue_discard(si->bdev, start_block,
119 nr_blocks, GFP_KERNEL, 0);
120 if (err)
121 return err;
122 cond_resched();
123 }
124
125 list_for_each_entry(se, &si->first_swap_extent.list, list) {
126 start_block = se->start_block << (PAGE_SHIFT - 9);
127 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
128
129 err = blkdev_issue_discard(si->bdev, start_block,
130 nr_blocks, GFP_KERNEL, 0);
131 if (err)
132 break;
133
134 cond_resched();
135 }
136 return err;
137}
138
139
140
141
142
143static void discard_swap_cluster(struct swap_info_struct *si,
144 pgoff_t start_page, pgoff_t nr_pages)
145{
146 struct swap_extent *se = si->curr_swap_extent;
147 int found_extent = 0;
148
149 while (nr_pages) {
150 struct list_head *lh;
151
152 if (se->start_page <= start_page &&
153 start_page < se->start_page + se->nr_pages) {
154 pgoff_t offset = start_page - se->start_page;
155 sector_t start_block = se->start_block + offset;
156 sector_t nr_blocks = se->nr_pages - offset;
157
158 if (nr_blocks > nr_pages)
159 nr_blocks = nr_pages;
160 start_page += nr_blocks;
161 nr_pages -= nr_blocks;
162
163 if (!found_extent++)
164 si->curr_swap_extent = se;
165
166 start_block <<= PAGE_SHIFT - 9;
167 nr_blocks <<= PAGE_SHIFT - 9;
168 if (blkdev_issue_discard(si->bdev, start_block,
169 nr_blocks, GFP_NOIO, 0))
170 break;
171 }
172
173 lh = se->list.next;
174 se = list_entry(lh, struct swap_extent, list);
175 }
176}
177
/*
 * Action routine passed to wait_on_bit() by scan_swap_map(): yield the
 * CPU and report success.  @word is not used.
 */
static int wait_for_discard(void *word)
{
	schedule();

	return 0;
}
183
184#define SWAPFILE_CLUSTER 256
185#define LATENCY_LIMIT 256
186
187static unsigned long scan_swap_map(struct swap_info_struct *si,
188 unsigned char usage)
189{
190 unsigned long offset;
191 unsigned long scan_base;
192 unsigned long last_in_cluster = 0;
193 int latency_ration = LATENCY_LIMIT;
194 int found_free_cluster = 0;
195
196
197
198
199
200
201
202
203
204
205
206
207 si->flags += SWP_SCANNING;
208 scan_base = offset = si->cluster_next;
209
210 if (unlikely(!si->cluster_nr--)) {
211 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
212 si->cluster_nr = SWAPFILE_CLUSTER - 1;
213 goto checks;
214 }
215 if (si->flags & SWP_PAGE_DISCARD) {
216
217
218
219
220
221
222
223 if (si->lowest_alloc)
224 goto checks;
225 si->lowest_alloc = si->max;
226 si->highest_alloc = 0;
227 }
228 spin_unlock(&si->lock);
229
230
231
232
233
234
235
236
237
238 if (!(si->flags & SWP_SOLIDSTATE))
239 scan_base = offset = si->lowest_bit;
240 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
241
242
243 for (; last_in_cluster <= si->highest_bit; offset++) {
244 if (si->swap_map[offset])
245 last_in_cluster = offset + SWAPFILE_CLUSTER;
246 else if (offset == last_in_cluster) {
247 spin_lock(&si->lock);
248 offset -= SWAPFILE_CLUSTER - 1;
249 si->cluster_next = offset;
250 si->cluster_nr = SWAPFILE_CLUSTER - 1;
251 found_free_cluster = 1;
252 goto checks;
253 }
254 if (unlikely(--latency_ration < 0)) {
255 cond_resched();
256 latency_ration = LATENCY_LIMIT;
257 }
258 }
259
260 offset = si->lowest_bit;
261 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
262
263
264 for (; last_in_cluster < scan_base; offset++) {
265 if (si->swap_map[offset])
266 last_in_cluster = offset + SWAPFILE_CLUSTER;
267 else if (offset == last_in_cluster) {
268 spin_lock(&si->lock);
269 offset -= SWAPFILE_CLUSTER - 1;
270 si->cluster_next = offset;
271 si->cluster_nr = SWAPFILE_CLUSTER - 1;
272 found_free_cluster = 1;
273 goto checks;
274 }
275 if (unlikely(--latency_ration < 0)) {
276 cond_resched();
277 latency_ration = LATENCY_LIMIT;
278 }
279 }
280
281 offset = scan_base;
282 spin_lock(&si->lock);
283 si->cluster_nr = SWAPFILE_CLUSTER - 1;
284 si->lowest_alloc = 0;
285 }
286
287checks:
288 if (!(si->flags & SWP_WRITEOK))
289 goto no_page;
290 if (!si->highest_bit)
291 goto no_page;
292 if (offset > si->highest_bit)
293 scan_base = offset = si->lowest_bit;
294
295
296 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
297 int swap_was_freed;
298 spin_unlock(&si->lock);
299 swap_was_freed = __try_to_reclaim_swap(si, offset);
300 spin_lock(&si->lock);
301
302 if (swap_was_freed)
303 goto checks;
304 goto scan;
305 }
306
307 if (si->swap_map[offset])
308 goto scan;
309
310 if (offset == si->lowest_bit)
311 si->lowest_bit++;
312 if (offset == si->highest_bit)
313 si->highest_bit--;
314 si->inuse_pages++;
315 if (si->inuse_pages == si->pages) {
316 si->lowest_bit = si->max;
317 si->highest_bit = 0;
318 }
319 si->swap_map[offset] = usage;
320 si->cluster_next = offset + 1;
321 si->flags -= SWP_SCANNING;
322
323 if (si->lowest_alloc) {
324
325
326
327
328 if (found_free_cluster) {
329
330
331
332
333
334
335
336 if (offset < si->highest_alloc &&
337 si->lowest_alloc <= last_in_cluster)
338 last_in_cluster = si->lowest_alloc - 1;
339 si->flags |= SWP_DISCARDING;
340 spin_unlock(&si->lock);
341
342 if (offset < last_in_cluster)
343 discard_swap_cluster(si, offset,
344 last_in_cluster - offset + 1);
345
346 spin_lock(&si->lock);
347 si->lowest_alloc = 0;
348 si->flags &= ~SWP_DISCARDING;
349
350 smp_mb();
351 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
352
353 } else if (si->flags & SWP_DISCARDING) {
354
355
356
357
358
359
360 spin_unlock(&si->lock);
361 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
362 wait_for_discard, TASK_UNINTERRUPTIBLE);
363 spin_lock(&si->lock);
364 } else {
365
366
367
368
369
370 if (offset < si->lowest_alloc)
371 si->lowest_alloc = offset;
372 if (offset > si->highest_alloc)
373 si->highest_alloc = offset;
374 }
375 }
376 return offset;
377
378scan:
379 spin_unlock(&si->lock);
380 while (++offset <= si->highest_bit) {
381 if (!si->swap_map[offset]) {
382 spin_lock(&si->lock);
383 goto checks;
384 }
385 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
386 spin_lock(&si->lock);
387 goto checks;
388 }
389 if (unlikely(--latency_ration < 0)) {
390 cond_resched();
391 latency_ration = LATENCY_LIMIT;
392 }
393 }
394 offset = si->lowest_bit;
395 while (++offset < scan_base) {
396 if (!si->swap_map[offset]) {
397 spin_lock(&si->lock);
398 goto checks;
399 }
400 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
401 spin_lock(&si->lock);
402 goto checks;
403 }
404 if (unlikely(--latency_ration < 0)) {
405 cond_resched();
406 latency_ration = LATENCY_LIMIT;
407 }
408 }
409 spin_lock(&si->lock);
410
411no_page:
412 si->flags -= SWP_SCANNING;
413 return 0;
414}
415
416swp_entry_t get_swap_page(void)
417{
418 struct swap_info_struct *si;
419 pgoff_t offset;
420 int type, next;
421 int wrapped = 0;
422 int hp_index;
423
424 spin_lock(&swap_lock);
425 if (atomic_long_read(&nr_swap_pages) <= 0)
426 goto noswap;
427 atomic_long_dec(&nr_swap_pages);
428
429 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
430 hp_index = atomic_xchg(&highest_priority_index, -1);
431
432
433
434
435
436
437
438
439
440
441
442
443 if (hp_index != -1 && hp_index != type &&
444 swap_info[type]->prio < swap_info[hp_index]->prio &&
445 (swap_info[hp_index]->flags & SWP_WRITEOK)) {
446 type = hp_index;
447 swap_list.next = type;
448 }
449
450 si = swap_info[type];
451 next = si->next;
452 if (next < 0 ||
453 (!wrapped && si->prio != swap_info[next]->prio)) {
454 next = swap_list.head;
455 wrapped++;
456 }
457
458 spin_lock(&si->lock);
459 if (!si->highest_bit) {
460 spin_unlock(&si->lock);
461 continue;
462 }
463 if (!(si->flags & SWP_WRITEOK)) {
464 spin_unlock(&si->lock);
465 continue;
466 }
467
468 swap_list.next = next;
469
470 spin_unlock(&swap_lock);
471
472 offset = scan_swap_map(si, SWAP_HAS_CACHE);
473 spin_unlock(&si->lock);
474 if (offset)
475 return swp_entry(type, offset);
476 spin_lock(&swap_lock);
477 next = swap_list.next;
478 }
479
480 atomic_long_inc(&nr_swap_pages);
481noswap:
482 spin_unlock(&swap_lock);
483 return (swp_entry_t) {0};
484}
485
486
487swp_entry_t get_swap_page_of_type(int type)
488{
489 struct swap_info_struct *si;
490 pgoff_t offset;
491
492 si = swap_info[type];
493 spin_lock(&si->lock);
494 if (si && (si->flags & SWP_WRITEOK)) {
495 atomic_long_dec(&nr_swap_pages);
496
497 offset = scan_swap_map(si, 1);
498 if (offset) {
499 spin_unlock(&si->lock);
500 return swp_entry(type, offset);
501 }
502 atomic_long_inc(&nr_swap_pages);
503 }
504 spin_unlock(&si->lock);
505 return (swp_entry_t) {0};
506}
507
508static struct swap_info_struct *swap_info_get(swp_entry_t entry)
509{
510 struct swap_info_struct *p;
511 unsigned long offset, type;
512
513 if (!entry.val)
514 goto out;
515 type = swp_type(entry);
516 if (type >= nr_swapfiles)
517 goto bad_nofile;
518 p = swap_info[type];
519 if (!(p->flags & SWP_USED))
520 goto bad_device;
521 offset = swp_offset(entry);
522 if (offset >= p->max)
523 goto bad_offset;
524 if (!p->swap_map[offset])
525 goto bad_free;
526 spin_lock(&p->lock);
527 return p;
528
529bad_free:
530 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
531 goto out;
532bad_offset:
533 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
534 goto out;
535bad_device:
536 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
537 goto out;
538bad_nofile:
539 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
540out:
541 return NULL;
542}
543
544
545
546
547
548
549
550
551static void set_highest_priority_index(int type)
552{
553 int old_hp_index, new_hp_index;
554
555 do {
556 old_hp_index = atomic_read(&highest_priority_index);
557 if (old_hp_index != -1 &&
558 swap_info[old_hp_index]->prio >= swap_info[type]->prio)
559 break;
560 new_hp_index = type;
561 } while (atomic_cmpxchg(&highest_priority_index,
562 old_hp_index, new_hp_index) != old_hp_index);
563}
564
565static unsigned char swap_entry_free(struct swap_info_struct *p,
566 swp_entry_t entry, unsigned char usage)
567{
568 unsigned long offset = swp_offset(entry);
569 unsigned char count;
570 unsigned char has_cache;
571
572 count = p->swap_map[offset];
573 has_cache = count & SWAP_HAS_CACHE;
574 count &= ~SWAP_HAS_CACHE;
575
576 if (usage == SWAP_HAS_CACHE) {
577 VM_BUG_ON(!has_cache);
578 has_cache = 0;
579 } else if (count == SWAP_MAP_SHMEM) {
580
581
582
583
584 count = 0;
585 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
586 if (count == COUNT_CONTINUED) {
587 if (swap_count_continued(p, offset, count))
588 count = SWAP_MAP_MAX | COUNT_CONTINUED;
589 else
590 count = SWAP_MAP_MAX;
591 } else
592 count--;
593 }
594
595 if (!count)
596 mem_cgroup_uncharge_swap(entry);
597
598 usage = count | has_cache;
599 p->swap_map[offset] = usage;
600
601
602 if (!usage) {
603 if (offset < p->lowest_bit)
604 p->lowest_bit = offset;
605 if (offset > p->highest_bit)
606 p->highest_bit = offset;
607 set_highest_priority_index(p->type);
608 atomic_long_inc(&nr_swap_pages);
609 p->inuse_pages--;
610 frontswap_invalidate_page(p->type, offset);
611 if (p->flags & SWP_BLKDEV) {
612 struct gendisk *disk = p->bdev->bd_disk;
613 if (disk->fops->swap_slot_free_notify)
614 disk->fops->swap_slot_free_notify(p->bdev,
615 offset);
616 }
617 }
618
619 return usage;
620}
621
622
623
624
625
626void swap_free(swp_entry_t entry)
627{
628 struct swap_info_struct *p;
629
630 p = swap_info_get(entry);
631 if (p) {
632 swap_entry_free(p, entry, 1);
633 spin_unlock(&p->lock);
634 }
635}
636
637
638
639
640void swapcache_free(swp_entry_t entry, struct page *page)
641{
642 struct swap_info_struct *p;
643 unsigned char count;
644
645 p = swap_info_get(entry);
646 if (p) {
647 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
648 if (page)
649 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
650 spin_unlock(&p->lock);
651 }
652}
653
654
655
656
657
658
659int page_swapcount(struct page *page)
660{
661 int count = 0;
662 struct swap_info_struct *p;
663 swp_entry_t entry;
664
665 entry.val = page_private(page);
666 p = swap_info_get(entry);
667 if (p) {
668 count = swap_count(p->swap_map[swp_offset(entry)]);
669 spin_unlock(&p->lock);
670 }
671 return count;
672}
673
674
675
676
677
678
679
/*
 * Decide whether the locked @page has at most one user, counting both
 * its mapcount and, for swap cache pages, its swap count.
 *
 * If the combined count is exactly one and the page is not under
 * writeback, it is also removed from the swap cache and marked dirty.
 * KSM pages always yield 0.  Returns non-zero when the count is <= 1.
 */
int reuse_swap_page(struct page *page)
{
	int users;

	VM_BUG_ON(!PageLocked(page));
	if (unlikely(PageKsm(page)))
		return 0;

	users = page_mapcount(page);
	if (users > 1 || !PageSwapCache(page))
		return users <= 1;

	users += page_swapcount(page);
	if (users == 1 && !PageWriteback(page)) {
		delete_from_swap_cache(page);
		SetPageDirty(page);
	}
	return users <= 1;
}
697
698
699
700
701
/*
 * Remove the locked @page from the swap cache if nothing else needs its
 * swap slot.
 *
 * Returns 1 if the page was removed (it is then marked dirty), 0 if the
 * page is not in the swap cache, is under writeback, still has a swap
 * count, or pm_suspended_storage() reports true.
 */
int try_to_free_swap(struct page *page)
{
	VM_BUG_ON(!PageLocked(page));

	/* The checks run in this order and stop at the first hit. */
	if (!PageSwapCache(page) ||
	    PageWriteback(page) ||
	    page_swapcount(page) ||
	    pm_suspended_storage())
		return 0;

	delete_from_swap_cache(page);
	SetPageDirty(page);
	return 1;
}
735
736
737
738
739
740int free_swap_and_cache(swp_entry_t entry)
741{
742 struct swap_info_struct *p;
743 struct page *page = NULL;
744
745 if (non_swap_entry(entry))
746 return 1;
747
748 p = swap_info_get(entry);
749 if (p) {
750 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
751 page = find_get_page(swap_address_space(entry),
752 entry.val);
753 if (page && !trylock_page(page)) {
754 page_cache_release(page);
755 page = NULL;
756 }
757 }
758 spin_unlock(&p->lock);
759 }
760 if (page) {
761
762
763
764
765 if (PageSwapCache(page) && !PageWriteback(page) &&
766 (!page_mapped(page) || vm_swap_full())) {
767 delete_from_swap_cache(page);
768 SetPageDirty(page);
769 }
770 unlock_page(page);
771 page_cache_release(page);
772 }
773 return p != NULL;
774}
775
776#ifdef CONFIG_HIBERNATION
777
778
779
780
781
782
783
784
785int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
786{
787 struct block_device *bdev = NULL;
788 int type;
789
790 if (device)
791 bdev = bdget(device);
792
793 spin_lock(&swap_lock);
794 for (type = 0; type < nr_swapfiles; type++) {
795 struct swap_info_struct *sis = swap_info[type];
796
797 if (!(sis->flags & SWP_WRITEOK))
798 continue;
799
800 if (!bdev) {
801 if (bdev_p)
802 *bdev_p = bdgrab(sis->bdev);
803
804 spin_unlock(&swap_lock);
805 return type;
806 }
807 if (bdev == sis->bdev) {
808 struct swap_extent *se = &sis->first_swap_extent;
809
810 if (se->start_block == offset) {
811 if (bdev_p)
812 *bdev_p = bdgrab(sis->bdev);
813
814 spin_unlock(&swap_lock);
815 bdput(bdev);
816 return type;
817 }
818 }
819 }
820 spin_unlock(&swap_lock);
821 if (bdev)
822 bdput(bdev);
823
824 return -ENODEV;
825}
826
827
828
829
830
831sector_t swapdev_block(int type, pgoff_t offset)
832{
833 struct block_device *bdev;
834
835 if ((unsigned int)type >= nr_swapfiles)
836 return 0;
837 if (!(swap_info[type]->flags & SWP_WRITEOK))
838 return 0;
839 return map_swap_entry(swp_entry(type, offset), &bdev);
840}
841
842
843
844
845
846
847
848unsigned int count_swap_pages(int type, int free)
849{
850 unsigned int n = 0;
851
852 spin_lock(&swap_lock);
853 if ((unsigned int)type < nr_swapfiles) {
854 struct swap_info_struct *sis = swap_info[type];
855
856 spin_lock(&sis->lock);
857 if (sis->flags & SWP_WRITEOK) {
858 n = sis->pages;
859 if (free)
860 n -= sis->inuse_pages;
861 }
862 spin_unlock(&sis->lock);
863 }
864 spin_unlock(&swap_lock);
865 return n;
866}
867#endif
868
869static inline int maybe_same_pte(pte_t pte, pte_t swp_pte)
870{
871#ifdef CONFIG_MEM_SOFT_DIRTY
872
873
874
875
876
877 pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte);
878 return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty);
879#else
880 return pte_same(pte, swp_pte);
881#endif
882}
883
884
885
886
887
888
889static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
890 unsigned long addr, swp_entry_t entry, struct page *page)
891{
892 struct page *swapcache;
893 struct mem_cgroup *memcg;
894 spinlock_t *ptl;
895 pte_t *pte;
896 int ret = 1;
897
898 swapcache = page;
899 page = ksm_might_need_to_copy(page, vma, addr);
900 if (unlikely(!page))
901 return -ENOMEM;
902
903 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
904 GFP_KERNEL, &memcg)) {
905 ret = -ENOMEM;
906 goto out_nolock;
907 }
908
909 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
910 if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
911 mem_cgroup_cancel_charge_swapin(memcg);
912 ret = 0;
913 goto out;
914 }
915
916 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
917 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
918 get_page(page);
919 set_pte_at(vma->vm_mm, addr, pte,
920 pte_mkold(mk_pte(page, vma->vm_page_prot)));
921 if (page == swapcache)
922 page_add_anon_rmap(page, vma, addr);
923 else
924 page_add_new_anon_rmap(page, vma, addr);
925 mem_cgroup_commit_charge_swapin(page, memcg);
926 swap_free(entry);
927
928
929
930
931 activate_page(page);
932out:
933 pte_unmap_unlock(pte, ptl);
934out_nolock:
935 if (page != swapcache) {
936 unlock_page(page);
937 put_page(page);
938 }
939 return ret;
940}
941
942static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
943 unsigned long addr, unsigned long end,
944 swp_entry_t entry, struct page *page)
945{
946 pte_t swp_pte = swp_entry_to_pte(entry);
947 pte_t *pte;
948 int ret = 0;
949
950
951
952
953
954
955
956
957
958
959 pte = pte_offset_map(pmd, addr);
960 do {
961
962
963
964
965 if (unlikely(maybe_same_pte(*pte, swp_pte))) {
966 pte_unmap(pte);
967 ret = unuse_pte(vma, pmd, addr, entry, page);
968 if (ret)
969 goto out;
970 pte = pte_offset_map(pmd, addr);
971 }
972 } while (pte++, addr += PAGE_SIZE, addr != end);
973 pte_unmap(pte - 1);
974out:
975 return ret;
976}
977
978static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
979 unsigned long addr, unsigned long end,
980 swp_entry_t entry, struct page *page)
981{
982 pmd_t *pmd;
983 unsigned long next;
984 int ret;
985
986 pmd = pmd_offset(pud, addr);
987 do {
988 next = pmd_addr_end(addr, end);
989 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
990 continue;
991 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
992 if (ret)
993 return ret;
994 } while (pmd++, addr = next, addr != end);
995 return 0;
996}
997
998static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
999 unsigned long addr, unsigned long end,
1000 swp_entry_t entry, struct page *page)
1001{
1002 pud_t *pud;
1003 unsigned long next;
1004 int ret;
1005
1006 pud = pud_offset(pgd, addr);
1007 do {
1008 next = pud_addr_end(addr, end);
1009 if (pud_none_or_clear_bad(pud))
1010 continue;
1011 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
1012 if (ret)
1013 return ret;
1014 } while (pud++, addr = next, addr != end);
1015 return 0;
1016}
1017
1018static int unuse_vma(struct vm_area_struct *vma,
1019 swp_entry_t entry, struct page *page)
1020{
1021 pgd_t *pgd;
1022 unsigned long addr, end, next;
1023 int ret;
1024
1025 if (page_anon_vma(page)) {
1026 addr = page_address_in_vma(page, vma);
1027 if (addr == -EFAULT)
1028 return 0;
1029 else
1030 end = addr + PAGE_SIZE;
1031 } else {
1032 addr = vma->vm_start;
1033 end = vma->vm_end;
1034 }
1035
1036 pgd = pgd_offset(vma->vm_mm, addr);
1037 do {
1038 next = pgd_addr_end(addr, end);
1039 if (pgd_none_or_clear_bad(pgd))
1040 continue;
1041 ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
1042 if (ret)
1043 return ret;
1044 } while (pgd++, addr = next, addr != end);
1045 return 0;
1046}
1047
1048static int unuse_mm(struct mm_struct *mm,
1049 swp_entry_t entry, struct page *page)
1050{
1051 struct vm_area_struct *vma;
1052 int ret = 0;
1053
1054 if (!down_read_trylock(&mm->mmap_sem)) {
1055
1056
1057
1058
1059 activate_page(page);
1060 unlock_page(page);
1061 down_read(&mm->mmap_sem);
1062 lock_page(page);
1063 }
1064 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1065 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1066 break;
1067 }
1068 up_read(&mm->mmap_sem);
1069 return (ret < 0)? ret: 0;
1070}
1071
1072
1073
1074
1075
1076
1077static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1078 unsigned int prev, bool frontswap)
1079{
1080 unsigned int max = si->max;
1081 unsigned int i = prev;
1082 unsigned char count;
1083
1084
1085
1086
1087
1088
1089
1090 for (;;) {
1091 if (++i >= max) {
1092 if (!prev) {
1093 i = 0;
1094 break;
1095 }
1096
1097
1098
1099
1100 max = prev + 1;
1101 prev = 0;
1102 i = 1;
1103 }
1104 if (frontswap) {
1105 if (frontswap_test(si, i))
1106 break;
1107 else
1108 continue;
1109 }
1110 count = si->swap_map[i];
1111 if (count && swap_count(count) != SWAP_MAP_BAD)
1112 break;
1113 }
1114 return i;
1115}
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125int try_to_unuse(unsigned int type, bool frontswap,
1126 unsigned long pages_to_unuse)
1127{
1128 struct swap_info_struct *si = swap_info[type];
1129 struct mm_struct *start_mm;
1130 unsigned char *swap_map;
1131 unsigned char swcount;
1132 struct page *page;
1133 swp_entry_t entry;
1134 unsigned int i = 0;
1135 int retval = 0;
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151 start_mm = &init_mm;
1152 atomic_inc(&init_mm.mm_users);
1153
1154
1155
1156
1157
1158
1159 while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
1160 if (signal_pending(current)) {
1161 retval = -EINTR;
1162 break;
1163 }
1164
1165
1166
1167
1168
1169
1170 swap_map = &si->swap_map[i];
1171 entry = swp_entry(type, i);
1172 page = read_swap_cache_async(entry,
1173 GFP_HIGHUSER_MOVABLE, NULL, 0);
1174 if (!page) {
1175
1176
1177
1178
1179
1180
1181 if (!*swap_map)
1182 continue;
1183 retval = -ENOMEM;
1184 break;
1185 }
1186
1187
1188
1189
1190 if (atomic_read(&start_mm->mm_users) == 1) {
1191 mmput(start_mm);
1192 start_mm = &init_mm;
1193 atomic_inc(&init_mm.mm_users);
1194 }
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204 wait_on_page_locked(page);
1205 wait_on_page_writeback(page);
1206 lock_page(page);
1207 wait_on_page_writeback(page);
1208
1209
1210
1211
1212 swcount = *swap_map;
1213 if (swap_count(swcount) == SWAP_MAP_SHMEM) {
1214 retval = shmem_unuse(entry, page);
1215
1216 if (retval < 0)
1217 break;
1218 continue;
1219 }
1220 if (swap_count(swcount) && start_mm != &init_mm)
1221 retval = unuse_mm(start_mm, entry, page);
1222
1223 if (swap_count(*swap_map)) {
1224 int set_start_mm = (*swap_map >= swcount);
1225 struct list_head *p = &start_mm->mmlist;
1226 struct mm_struct *new_start_mm = start_mm;
1227 struct mm_struct *prev_mm = start_mm;
1228 struct mm_struct *mm;
1229
1230 atomic_inc(&new_start_mm->mm_users);
1231 atomic_inc(&prev_mm->mm_users);
1232 spin_lock(&mmlist_lock);
1233 while (swap_count(*swap_map) && !retval &&
1234 (p = p->next) != &start_mm->mmlist) {
1235 mm = list_entry(p, struct mm_struct, mmlist);
1236 if (!atomic_inc_not_zero(&mm->mm_users))
1237 continue;
1238 spin_unlock(&mmlist_lock);
1239 mmput(prev_mm);
1240 prev_mm = mm;
1241
1242 cond_resched();
1243
1244 swcount = *swap_map;
1245 if (!swap_count(swcount))
1246 ;
1247 else if (mm == &init_mm)
1248 set_start_mm = 1;
1249 else
1250 retval = unuse_mm(mm, entry, page);
1251
1252 if (set_start_mm && *swap_map < swcount) {
1253 mmput(new_start_mm);
1254 atomic_inc(&mm->mm_users);
1255 new_start_mm = mm;
1256 set_start_mm = 0;
1257 }
1258 spin_lock(&mmlist_lock);
1259 }
1260 spin_unlock(&mmlist_lock);
1261 mmput(prev_mm);
1262 mmput(start_mm);
1263 start_mm = new_start_mm;
1264 }
1265 if (retval) {
1266 unlock_page(page);
1267 page_cache_release(page);
1268 break;
1269 }
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290 if (swap_count(*swap_map) &&
1291 PageDirty(page) && PageSwapCache(page)) {
1292 struct writeback_control wbc = {
1293 .sync_mode = WB_SYNC_NONE,
1294 };
1295
1296 swap_writepage(page, &wbc);
1297 lock_page(page);
1298 wait_on_page_writeback(page);
1299 }
1300
1301
1302
1303
1304
1305
1306
1307
1308 if (PageSwapCache(page) &&
1309 likely(page_private(page) == entry.val))
1310 delete_from_swap_cache(page);
1311
1312
1313
1314
1315
1316
1317 SetPageDirty(page);
1318 unlock_page(page);
1319 page_cache_release(page);
1320
1321
1322
1323
1324
1325 cond_resched();
1326 if (frontswap && pages_to_unuse > 0) {
1327 if (!--pages_to_unuse)
1328 break;
1329 }
1330 }
1331
1332 mmput(start_mm);
1333 return retval;
1334}
1335
1336
1337
1338
1339
1340
1341
1342static void drain_mmlist(void)
1343{
1344 struct list_head *p, *next;
1345 unsigned int type;
1346
1347 for (type = 0; type < nr_swapfiles; type++)
1348 if (swap_info[type]->inuse_pages)
1349 return;
1350 spin_lock(&mmlist_lock);
1351 list_for_each_safe(p, next, &init_mm.mmlist)
1352 list_del_init(p);
1353 spin_unlock(&mmlist_lock);
1354}
1355
1356
1357
1358
1359
1360
1361
1362static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1363{
1364 struct swap_info_struct *sis;
1365 struct swap_extent *start_se;
1366 struct swap_extent *se;
1367 pgoff_t offset;
1368
1369 sis = swap_info[swp_type(entry)];
1370 *bdev = sis->bdev;
1371
1372 offset = swp_offset(entry);
1373 start_se = sis->curr_swap_extent;
1374 se = start_se;
1375
1376 for ( ; ; ) {
1377 struct list_head *lh;
1378
1379 if (se->start_page <= offset &&
1380 offset < (se->start_page + se->nr_pages)) {
1381 return se->start_block + (offset - se->start_page);
1382 }
1383 lh = se->list.next;
1384 se = list_entry(lh, struct swap_extent, list);
1385 sis->curr_swap_extent = se;
1386 BUG_ON(se == start_se);
1387 }
1388}
1389
1390
1391
1392
1393sector_t map_swap_page(struct page *page, struct block_device **bdev)
1394{
1395 swp_entry_t entry;
1396 entry.val = page_private(page);
1397 return map_swap_entry(entry, bdev);
1398}
1399
1400
1401
1402
1403static void destroy_swap_extents(struct swap_info_struct *sis)
1404{
1405 while (!list_empty(&sis->first_swap_extent.list)) {
1406 struct swap_extent *se;
1407
1408 se = list_entry(sis->first_swap_extent.list.next,
1409 struct swap_extent, list);
1410 list_del(&se->list);
1411 kfree(se);
1412 }
1413
1414 if (sis->flags & SWP_FILE) {
1415 struct file *swap_file = sis->swap_file;
1416 struct address_space *mapping = swap_file->f_mapping;
1417
1418 sis->flags &= ~SWP_FILE;
1419 mapping->a_ops->swap_deactivate(swap_file);
1420 }
1421}
1422
1423
1424
1425
1426
1427
1428
1429int
1430add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1431 unsigned long nr_pages, sector_t start_block)
1432{
1433 struct swap_extent *se;
1434 struct swap_extent *new_se;
1435 struct list_head *lh;
1436
1437 if (start_page == 0) {
1438 se = &sis->first_swap_extent;
1439 sis->curr_swap_extent = se;
1440 se->start_page = 0;
1441 se->nr_pages = nr_pages;
1442 se->start_block = start_block;
1443 return 1;
1444 } else {
1445 lh = sis->first_swap_extent.list.prev;
1446 se = list_entry(lh, struct swap_extent, list);
1447 BUG_ON(se->start_page + se->nr_pages != start_page);
1448 if (se->start_block + se->nr_pages == start_block) {
1449
1450 se->nr_pages += nr_pages;
1451 return 0;
1452 }
1453 }
1454
1455
1456
1457
1458 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
1459 if (new_se == NULL)
1460 return -ENOMEM;
1461 new_se->start_page = start_page;
1462 new_se->nr_pages = nr_pages;
1463 new_se->start_block = start_block;
1464
1465 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1466 return 1;
1467}
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1501{
1502 struct file *swap_file = sis->swap_file;
1503 struct address_space *mapping = swap_file->f_mapping;
1504 struct inode *inode = mapping->host;
1505 int ret;
1506
1507 if (S_ISBLK(inode->i_mode)) {
1508 ret = add_swap_extent(sis, 0, sis->max, 0);
1509 *span = sis->pages;
1510 return ret;
1511 }
1512
1513 if (mapping->a_ops->swap_activate) {
1514 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
1515 if (!ret) {
1516 sis->flags |= SWP_FILE;
1517 ret = add_swap_extent(sis, 0, sis->max, 0);
1518 *span = sis->pages;
1519 }
1520 return ret;
1521 }
1522
1523 return generic_swapfile_activate(sis, swap_file, span);
1524}
1525
1526static void _enable_swap_info(struct swap_info_struct *p, int prio,
1527 unsigned char *swap_map)
1528{
1529 int i, prev;
1530
1531 if (prio >= 0)
1532 p->prio = prio;
1533 else
1534 p->prio = --least_priority;
1535 p->swap_map = swap_map;
1536 p->flags |= SWP_WRITEOK;
1537 atomic_long_add(p->pages, &nr_swap_pages);
1538 total_swap_pages += p->pages;
1539
1540
1541 prev = -1;
1542 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1543 if (p->prio >= swap_info[i]->prio)
1544 break;
1545 prev = i;
1546 }
1547 p->next = i;
1548 if (prev < 0)
1549 swap_list.head = swap_list.next = p->type;
1550 else
1551 swap_info[prev]->next = p->type;
1552}
1553
1554static void enable_swap_info(struct swap_info_struct *p, int prio,
1555 unsigned char *swap_map,
1556 unsigned long *frontswap_map)
1557{
1558 frontswap_init(p->type, frontswap_map);
1559 spin_lock(&swap_lock);
1560 spin_lock(&p->lock);
1561 _enable_swap_info(p, prio, swap_map);
1562 spin_unlock(&p->lock);
1563 spin_unlock(&swap_lock);
1564}
1565
1566static void reinsert_swap_info(struct swap_info_struct *p)
1567{
1568 spin_lock(&swap_lock);
1569 spin_lock(&p->lock);
1570 _enable_swap_info(p, p->prio, p->swap_map);
1571 spin_unlock(&p->lock);
1572 spin_unlock(&swap_lock);
1573}
1574
/*
 * swapoff(2): stop swapping to the area backed by @specialfile and release
 * everything that was set up for it.
 *
 * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, -EINVAL when no
 * write-enabled swap area matches the file, -ENOMEM when the memory check
 * fails, or the error from getname()/file_open_name()/try_to_unuse().
 */
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
	struct swap_info_struct *p = NULL;
	unsigned char *swap_map;
	unsigned long *frontswap_map;
	struct file *swap_file, *victim;
	struct address_space *mapping;
	struct inode *inode;
	struct filename *pathname;
	int i, type, prev;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	BUG_ON(!current->mm);

	pathname = getname(specialfile);
	if (IS_ERR(pathname))
		return PTR_ERR(pathname);

	victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
	err = PTR_ERR(victim);
	if (IS_ERR(victim))
		goto out;

	/*
	 * Walk the priority list for a write-enabled area whose file shares
	 * the victim's address_space; remember the predecessor for unlinking.
	 */
	mapping = victim->f_mapping;
	prev = -1;
	spin_lock(&swap_lock);
	for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
		p = swap_info[type];
		if (p->flags & SWP_WRITEOK) {
			if (p->swap_file->f_mapping == mapping)
				break;
		}
		prev = type;
	}
	if (type < 0) {
		err = -EINVAL;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	/*
	 * A zero return from the memory check is treated as success; the
	 * pages it accounted are given straight back.
	 */
	if (!security_vm_enough_memory_mm(current->mm, p->pages))
		vm_unacct_memory(p->pages);
	else {
		err = -ENOMEM;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	/* Unlink the area from the swap list. */
	if (prev < 0)
		swap_list.head = p->next;
	else
		swap_info[prev]->next = p->next;
	if (type == swap_list.next) {
		/* The allocation cursor pointed at us: restart from the head. */
		swap_list.next = swap_list.head;
	}
	spin_lock(&p->lock);
	if (p->prio < 0) {
		/*
		 * Auto-assigned (negative) priority: each following entry
		 * takes over the priority of the one before it, and
		 * least_priority moves back up by one.
		 */
		for (i = p->next; i >= 0; i = swap_info[i]->next)
			swap_info[i]->prio = p->prio--;
		least_priority++;
	}
	atomic_long_sub(p->pages, &nr_swap_pages);
	total_swap_pages -= p->pages;
	p->flags &= ~SWP_WRITEOK;
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);

	/* Mark this task as the OOM origin while pages are pulled back in. */
	set_current_oom_origin();
	err = try_to_unuse(type, false, 0);
	clear_current_oom_origin();

	if (err) {
		/* Could not empty the area: put it back into service. */
		reinsert_swap_info(p);
		goto out_dput;
	}

	destroy_swap_extents(p);
	if (p->flags & SWP_CONTINUED)
		free_swap_count_continuations(p);

	mutex_lock(&swapon_mutex);
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	drain_mmlist();

	/*
	 * While p->flags is still >= SWP_SCANNING, drop both locks and sleep
	 * for one tick before looking again.
	 * NOTE(review): presumably this waits for an in-flight allocation
	 * scan of the area to finish - confirm against the scanner.
	 */
	p->highest_bit = 0;
	while (p->flags >= SWP_SCANNING) {
		spin_unlock(&p->lock);
		spin_unlock(&swap_lock);
		schedule_timeout_uninterruptible(1);
		spin_lock(&swap_lock);
		spin_lock(&p->lock);
	}

	/* Detach everything from the slot under the locks, free it after. */
	swap_file = p->swap_file;
	p->swap_file = NULL;
	p->max = 0;
	swap_map = p->swap_map;
	p->swap_map = NULL;
	p->flags = 0;
	frontswap_map = frontswap_map_get(p);
	frontswap_map_set(p, NULL);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
	frontswap_invalidate_area(type);
	mutex_unlock(&swapon_mutex);
	vfree(swap_map);
	vfree(frontswap_map);
	/* Release the swap cgroup data that was set up for this type. */
	swap_cgroup_swapoff(type);

	inode = mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		/* Restore the block size saved at swapon and drop our claim. */
		struct block_device *bdev = I_BDEV(inode);
		set_blocksize(bdev, p->old_block_size);
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	} else {
		mutex_lock(&inode->i_mutex);
		inode->i_flags &= ~S_SWAPFILE;
		mutex_unlock(&inode->i_mutex);
	}
	filp_close(swap_file, NULL);
	err = 0;
	/* Tell pollers of /proc/swaps that the set of areas changed. */
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

out_dput:
	filp_close(victim, NULL);
out:
	putname(pathname);
	return err;
}
1711
1712#ifdef CONFIG_PROC_FS
1713static unsigned swaps_poll(struct file *file, poll_table *wait)
1714{
1715 struct seq_file *seq = file->private_data;
1716
1717 poll_wait(file, &proc_poll_wait, wait);
1718
1719 if (seq->poll_event != atomic_read(&proc_poll_event)) {
1720 seq->poll_event = atomic_read(&proc_poll_event);
1721 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1722 }
1723
1724 return POLLIN | POLLRDNORM;
1725}
1726
1727
1728static void *swap_start(struct seq_file *swap, loff_t *pos)
1729{
1730 struct swap_info_struct *si;
1731 int type;
1732 loff_t l = *pos;
1733
1734 mutex_lock(&swapon_mutex);
1735
1736 if (!l)
1737 return SEQ_START_TOKEN;
1738
1739 for (type = 0; type < nr_swapfiles; type++) {
1740 smp_rmb();
1741 si = swap_info[type];
1742 if (!(si->flags & SWP_USED) || !si->swap_map)
1743 continue;
1744 if (!--l)
1745 return si;
1746 }
1747
1748 return NULL;
1749}
1750
1751static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1752{
1753 struct swap_info_struct *si = v;
1754 int type;
1755
1756 if (v == SEQ_START_TOKEN)
1757 type = 0;
1758 else
1759 type = si->type + 1;
1760
1761 for (; type < nr_swapfiles; type++) {
1762 smp_rmb();
1763 si = swap_info[type];
1764 if (!(si->flags & SWP_USED) || !si->swap_map)
1765 continue;
1766 ++*pos;
1767 return si;
1768 }
1769
1770 return NULL;
1771}
1772
1773static void swap_stop(struct seq_file *swap, void *v)
1774{
1775 mutex_unlock(&swapon_mutex);
1776}
1777
1778static int swap_show(struct seq_file *swap, void *v)
1779{
1780 struct swap_info_struct *si = v;
1781 struct file *file;
1782 int len;
1783
1784 if (si == SEQ_START_TOKEN) {
1785 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1786 return 0;
1787 }
1788
1789 file = si->swap_file;
1790 len = seq_path(swap, &file->f_path, " \t\n\\");
1791 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1792 len < 40 ? 40 - len : 1, " ",
1793 S_ISBLK(file_inode(file)->i_mode) ?
1794 "partition" : "file\t",
1795 si->pages << (PAGE_SHIFT - 10),
1796 si->inuse_pages << (PAGE_SHIFT - 10),
1797 si->prio);
1798 return 0;
1799}
1800
1801static const struct seq_operations swaps_op = {
1802 .start = swap_start,
1803 .next = swap_next,
1804 .stop = swap_stop,
1805 .show = swap_show
1806};
1807
1808static int swaps_open(struct inode *inode, struct file *file)
1809{
1810 struct seq_file *seq;
1811 int ret;
1812
1813 ret = seq_open(file, &swaps_op);
1814 if (ret)
1815 return ret;
1816
1817 seq = file->private_data;
1818 seq->poll_event = atomic_read(&proc_poll_event);
1819 return 0;
1820}
1821
1822static const struct file_operations proc_swaps_operations = {
1823 .open = swaps_open,
1824 .read = seq_read,
1825 .llseek = seq_lseek,
1826 .release = seq_release,
1827 .poll = swaps_poll,
1828};
1829
1830static int __init procswaps_init(void)
1831{
1832 proc_create("swaps", 0, NULL, &proc_swaps_operations);
1833 return 0;
1834}
1835__initcall(procswaps_init);
1836#endif
1837
1838#ifdef MAX_SWAPFILES_CHECK
1839static int __init max_swapfiles_check(void)
1840{
1841 MAX_SWAPFILES_CHECK();
1842 return 0;
1843}
1844late_initcall(max_swapfiles_check);
1845#endif
1846
1847static struct swap_info_struct *alloc_swap_info(void)
1848{
1849 struct swap_info_struct *p;
1850 unsigned int type;
1851
1852 p = kzalloc(sizeof(*p), GFP_KERNEL);
1853 if (!p)
1854 return ERR_PTR(-ENOMEM);
1855
1856 spin_lock(&swap_lock);
1857 for (type = 0; type < nr_swapfiles; type++) {
1858 if (!(swap_info[type]->flags & SWP_USED))
1859 break;
1860 }
1861 if (type >= MAX_SWAPFILES) {
1862 spin_unlock(&swap_lock);
1863 kfree(p);
1864 return ERR_PTR(-EPERM);
1865 }
1866 if (type >= nr_swapfiles) {
1867 p->type = type;
1868 swap_info[type] = p;
1869
1870
1871
1872
1873
1874 smp_wmb();
1875 nr_swapfiles++;
1876 } else {
1877 kfree(p);
1878 p = swap_info[type];
1879
1880
1881
1882
1883 }
1884 INIT_LIST_HEAD(&p->first_swap_extent.list);
1885 p->flags = SWP_USED;
1886 p->next = -1;
1887 spin_unlock(&swap_lock);
1888 spin_lock_init(&p->lock);
1889
1890 return p;
1891}
1892
1893static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
1894{
1895 int error;
1896
1897 if (S_ISBLK(inode->i_mode)) {
1898 p->bdev = bdgrab(I_BDEV(inode));
1899 error = blkdev_get(p->bdev,
1900 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1901 sys_swapon);
1902 if (error < 0) {
1903 p->bdev = NULL;
1904 return -EINVAL;
1905 }
1906 p->old_block_size = block_size(p->bdev);
1907 error = set_blocksize(p->bdev, PAGE_SIZE);
1908 if (error < 0)
1909 return error;
1910 p->flags |= SWP_BLKDEV;
1911 } else if (S_ISREG(inode->i_mode)) {
1912 p->bdev = inode->i_sb->s_bdev;
1913 mutex_lock(&inode->i_mutex);
1914 if (IS_SWAPFILE(inode))
1915 return -EBUSY;
1916 } else
1917 return -EINVAL;
1918
1919 return 0;
1920}
1921
1922static unsigned long read_swap_header(struct swap_info_struct *p,
1923 union swap_header *swap_header,
1924 struct inode *inode)
1925{
1926 int i;
1927 unsigned long maxpages;
1928 unsigned long swapfilepages;
1929
1930 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1931 printk(KERN_ERR "Unable to find swap-space signature\n");
1932 return 0;
1933 }
1934
1935
1936 if (swab32(swap_header->info.version) == 1) {
1937 swab32s(&swap_header->info.version);
1938 swab32s(&swap_header->info.last_page);
1939 swab32s(&swap_header->info.nr_badpages);
1940 for (i = 0; i < swap_header->info.nr_badpages; i++)
1941 swab32s(&swap_header->info.badpages[i]);
1942 }
1943
1944 if (swap_header->info.version != 1) {
1945 printk(KERN_WARNING
1946 "Unable to handle swap header version %d\n",
1947 swap_header->info.version);
1948 return 0;
1949 }
1950
1951 p->lowest_bit = 1;
1952 p->cluster_next = 1;
1953 p->cluster_nr = 0;
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969 maxpages = swp_offset(pte_to_swp_entry(
1970 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1971 if (maxpages > swap_header->info.last_page) {
1972 maxpages = swap_header->info.last_page + 1;
1973
1974 if ((unsigned int)maxpages == 0)
1975 maxpages = UINT_MAX;
1976 }
1977 p->highest_bit = maxpages - 1;
1978
1979 if (!maxpages)
1980 return 0;
1981 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1982 if (swapfilepages && maxpages > swapfilepages) {
1983 printk(KERN_WARNING
1984 "Swap area shorter than signature indicates\n");
1985 return 0;
1986 }
1987 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1988 return 0;
1989 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1990 return 0;
1991
1992 return maxpages;
1993}
1994
1995static int setup_swap_map_and_extents(struct swap_info_struct *p,
1996 union swap_header *swap_header,
1997 unsigned char *swap_map,
1998 unsigned long maxpages,
1999 sector_t *span)
2000{
2001 int i;
2002 unsigned int nr_good_pages;
2003 int nr_extents;
2004
2005 nr_good_pages = maxpages - 1;
2006
2007 for (i = 0; i < swap_header->info.nr_badpages; i++) {
2008 unsigned int page_nr = swap_header->info.badpages[i];
2009 if (page_nr == 0 || page_nr > swap_header->info.last_page)
2010 return -EINVAL;
2011 if (page_nr < maxpages) {
2012 swap_map[page_nr] = SWAP_MAP_BAD;
2013 nr_good_pages--;
2014 }
2015 }
2016
2017 if (nr_good_pages) {
2018 swap_map[0] = SWAP_MAP_BAD;
2019 p->max = maxpages;
2020 p->pages = nr_good_pages;
2021 nr_extents = setup_swap_extents(p, span);
2022 if (nr_extents < 0)
2023 return nr_extents;
2024 nr_good_pages = p->pages;
2025 }
2026 if (!nr_good_pages) {
2027 printk(KERN_WARNING "Empty swap-file\n");
2028 return -EINVAL;
2029 }
2030
2031 return nr_extents;
2032}
2033
2034
2035
2036
2037
2038static bool swap_discardable(struct swap_info_struct *si)
2039{
2040 struct request_queue *q = bdev_get_queue(si->bdev);
2041
2042 if (!q || !blk_queue_discard(q))
2043 return false;
2044
2045 return true;
2046}
2047
/*
 * swapon(2): start swapping to the file or block device named by
 * @specialfile.
 *
 * @swap_flags may carry SWAP_FLAG_PREFER with a priority, and the discard
 * flags; any bit outside SWAP_FLAGS_VALID is rejected.
 *
 * Returns 0 on success or a negative errno. On failure everything acquired
 * so far is released at the bad_swap label.
 */
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
	struct swap_info_struct *p;
	struct filename *name;
	struct file *swap_file = NULL;
	struct address_space *mapping;
	int i;
	int prio;
	int error;
	union swap_header *swap_header;
	int nr_extents;
	sector_t span;
	unsigned long maxpages;
	unsigned char *swap_map = NULL;
	unsigned long *frontswap_map = NULL;
	struct page *page = NULL;
	struct inode *inode = NULL;

	if (swap_flags & ~SWAP_FLAGS_VALID)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Reserve a swap_info slot before touching the file. */
	p = alloc_swap_info();
	if (IS_ERR(p))
		return PTR_ERR(p);

	name = getname(specialfile);
	if (IS_ERR(name)) {
		error = PTR_ERR(name);
		/* Cleared so the exit path does not putname() an error pointer. */
		name = NULL;
		goto bad_swap;
	}
	swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
	if (IS_ERR(swap_file)) {
		error = PTR_ERR(swap_file);
		swap_file = NULL;
		goto bad_swap;
	}

	p->swap_file = swap_file;
	mapping = swap_file->f_mapping;

	/* Refuse a file whose mapping already backs another swap area. */
	for (i = 0; i < nr_swapfiles; i++) {
		struct swap_info_struct *q = swap_info[i];

		if (q == p || !q->swap_file)
			continue;
		if (mapping == q->swap_file->f_mapping) {
			error = -EBUSY;
			goto bad_swap;
		}
	}

	/*
	 * inode stays NULL until this point, so the earlier error paths skip
	 * the inode-dependent cleanup. For a regular file claim_swapfile()
	 * returns with inode->i_mutex held, even when it fails with -EBUSY.
	 */
	inode = mapping->host;

	error = claim_swapfile(p, inode);
	if (unlikely(error))
		goto bad_swap;

	/* Read the swap header from the first page of the area. */
	if (!mapping->a_ops->readpage) {
		error = -EINVAL;
		goto bad_swap;
	}
	page = read_mapping_page(mapping, 0, swap_file);
	if (IS_ERR(page)) {
		error = PTR_ERR(page);
		goto bad_swap;
	}
	swap_header = kmap(page);

	maxpages = read_swap_header(p, swap_header, inode);
	if (unlikely(!maxpages)) {
		error = -EINVAL;
		goto bad_swap;
	}

	/* One zeroed usage-count byte per page slot. */
	swap_map = vzalloc(maxpages);
	if (!swap_map) {
		error = -ENOMEM;
		goto bad_swap;
	}

	error = swap_cgroup_swapon(p->type, maxpages);
	if (error)
		goto bad_swap;

	nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
		maxpages, &span);
	if (unlikely(nr_extents < 0)) {
		error = nr_extents;
		goto bad_swap;
	}
	/*
	 * The frontswap bitmap is optional: an allocation failure is not
	 * checked and simply leaves frontswap_map NULL.
	 */
	if (frontswap_enabled)
		frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));

	if (p->bdev) {
		if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
			/* Non-rotational device: start at a random offset. */
			p->flags |= SWP_SOLIDSTATE;
			p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
		}

		if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
			/*
			 * Start with both discard policies enabled, then drop
			 * the one the caller did not ask for: DISCARD_ONCE
			 * keeps only the area-wide discard, DISCARD_PAGES keeps
			 * only the per-page discard. With neither flag both
			 * stay set.
			 */
			p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
				     SWP_PAGE_DISCARD);

			if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
				p->flags &= ~SWP_PAGE_DISCARD;
			else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
				p->flags &= ~SWP_AREA_DISCARD;

			/* A failed area-wide discard is logged, not fatal. */
			if (p->flags & SWP_AREA_DISCARD) {
				int err = discard_swap(p);
				if (unlikely(err))
					printk(KERN_ERR
					       "swapon: discard_swap(%p): %d\n",
						p, err);
			}
		}
	}

	mutex_lock(&swapon_mutex);
	/* -1 makes enable_swap_info() pick the next lower auto priority. */
	prio = -1;
	if (swap_flags & SWAP_FLAG_PREFER)
		prio =
		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
	enable_swap_info(p, prio, swap_map, frontswap_map);

	printk(KERN_INFO "Adding %uk swap on %s. "
			"Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
		p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
		(p->flags & SWP_DISCARDABLE) ? "D" : "",
		(p->flags & SWP_AREA_DISCARD) ? "s" : "",
		(p->flags & SWP_PAGE_DISCARD) ? "c" : "",
		(frontswap_map) ? "FS" : "");

	mutex_unlock(&swapon_mutex);
	/* Tell pollers of /proc/swaps that the set of areas changed. */
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

	if (S_ISREG(inode->i_mode))
		inode->i_flags |= S_SWAPFILE;
	error = 0;
	goto out;
bad_swap:
	/* p->bdev is only set for a block device once it has been claimed. */
	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
		set_blocksize(p->bdev, p->old_block_size);
		blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	}
	destroy_swap_extents(p);
	swap_cgroup_swapoff(p->type);
	/* Hand the slot back: flags == 0 makes it reusable. */
	spin_lock(&swap_lock);
	p->swap_file = NULL;
	p->flags = 0;
	spin_unlock(&swap_lock);
	vfree(swap_map);
	if (swap_file) {
		if (inode && S_ISREG(inode->i_mode)) {
			mutex_unlock(&inode->i_mutex);
			/* Prevents the second unlock at the out label. */
			inode = NULL;
		}
		filp_close(swap_file, NULL);
	}
out:
	if (page && !IS_ERR(page)) {
		kunmap(page);
		page_cache_release(page);
	}
	if (name)
		putname(name);
	/* Success path for a regular file: drop the mutex taken on claim. */
	if (inode && S_ISREG(inode->i_mode))
		mutex_unlock(&inode->i_mutex);
	return error;
}
2243
2244void si_swapinfo(struct sysinfo *val)
2245{
2246 unsigned int type;
2247 unsigned long nr_to_be_unused = 0;
2248
2249 spin_lock(&swap_lock);
2250 for (type = 0; type < nr_swapfiles; type++) {
2251 struct swap_info_struct *si = swap_info[type];
2252
2253 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2254 nr_to_be_unused += si->inuse_pages;
2255 }
2256 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
2257 val->totalswap = total_swap_pages + nr_to_be_unused;
2258 spin_unlock(&swap_lock);
2259}
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
/*
 * Take an additional reference on swap @entry.
 *
 * @usage is either SWAP_HAS_CACHE, which sets the cache flag in the
 * swap_map byte, or an amount added to the count part of that byte.
 *
 * Returns 0 on success, or:
 *   -EINVAL  @entry is not a swap entry, its type or offset is out of
 *            range, or the stored count is above SWAP_MAP_MAX
 *   -EEXIST  (SWAP_HAS_CACHE only) the cache flag was already set
 *   -ENOENT  the entry is unused
 *   -ENOMEM  the count is at SWAP_MAP_MAX and swap_count_continued()
 *            could not carry the increment
 */
static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
	struct swap_info_struct *p;
	unsigned long offset, type;
	unsigned char count;
	unsigned char has_cache;
	int err = -EINVAL;

	if (non_swap_entry(entry))
		goto out;

	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_file;
	p = swap_info[type];
	offset = swp_offset(entry);

	spin_lock(&p->lock);
	if (unlikely(offset >= p->max))
		goto unlock_out;

	/* Split the map byte into the cache flag and the count part. */
	count = p->swap_map[offset];
	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;
	err = 0;

	if (usage == SWAP_HAS_CACHE) {

		/* Set the cache flag only if it is clear and the entry is in use. */
		if (!has_cache && count)
			has_cache = SWAP_HAS_CACHE;
		else if (has_cache)		/* flag was already set */
			err = -EEXIST;
		else				/* count is zero: no users left */
			err = -ENOENT;

	} else if (count || has_cache) {

		/* Below the maximum: add directly. At the maximum: try to carry. */
		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
			count += usage;
		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
			err = -EINVAL;
		else if (swap_count_continued(p, offset, count))
			count = COUNT_CONTINUED;
		else
			err = -ENOMEM;
	} else
		err = -ENOENT;			/* unused swap entry */

	/* Written back on the error paths too, with the unchanged values. */
	p->swap_map[offset] = count | has_cache;

unlock_out:
	spin_unlock(&p->lock);
out:
	return err;

bad_file:
	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
	goto out;
}
2332
2333
2334
2335
2336
2337void swap_shmem_alloc(swp_entry_t entry)
2338{
2339 __swap_duplicate(entry, SWAP_MAP_SHMEM);
2340}
2341
2342
2343
2344
2345
2346
2347
2348
2349int swap_duplicate(swp_entry_t entry)
2350{
2351 int err = 0;
2352
2353 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2354 err = add_swap_count_continuation(entry, GFP_ATOMIC);
2355 return err;
2356}
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366int swapcache_prepare(swp_entry_t entry)
2367{
2368 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2369}
2370
2371struct swap_info_struct *page_swap_info(struct page *page)
2372{
2373 swp_entry_t swap = { .val = page_private(page) };
2374 BUG_ON(!PageSwapCache(page));
2375 return swap_info[swp_type(swap)];
2376}
2377
2378
2379
2380
2381struct address_space *__page_file_mapping(struct page *page)
2382{
2383 VM_BUG_ON(!PageSwapCache(page));
2384 return page_swap_info(page)->swap_file->f_mapping;
2385}
2386EXPORT_SYMBOL_GPL(__page_file_mapping);
2387
2388pgoff_t __page_file_index(struct page *page)
2389{
2390 swp_entry_t swap = { .val = page_private(page) };
2391 VM_BUG_ON(!PageSwapCache(page));
2392 return swp_offset(swap);
2393}
2394EXPORT_SYMBOL_GPL(__page_file_index);
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
/*
 * Make sure a continuation page is available to hold the overflow of the
 * swap count for @entry.
 *
 * A page is allocated with @gfp_mask before any lock is taken. It is
 * appended to the continuation list of the swap_map page covering @entry
 * only if the count is at SWAP_MAP_MAX and every existing continuation for
 * this offset is full; otherwise it is freed again.
 *
 * Returns 0 on success, including when no continuation turned out to be
 * needed or the entry is no longer valid, and -ENOMEM if a continuation
 * was needed but the allocation had failed.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
	struct swap_info_struct *si;
	struct page *head;
	struct page *page;
	struct page *list_page;
	pgoff_t offset;
	unsigned char count;

	/*
	 * Allocate first; the result is only checked once we know a
	 * continuation is actually required.
	 */
	page = alloc_page(gfp_mask | __GFP_HIGHMEM);

	si = swap_info_get(entry);
	if (!si) {
		/*
		 * No valid swap area for this entry: nothing to extend.
		 * Report success and free the page.
		 */
		goto outer;
	}
	/*
	 * swap_info_get() is expected to have returned with si->lock held:
	 * every exit below this point unlocks it.
	 */
	offset = swp_offset(entry);
	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;

	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
		/*
		 * The count is not at its maximum, so no continuation is
		 * needed right now.
		 */
		goto out;
	}

	if (!page) {
		spin_unlock(&si->lock);
		return -ENOMEM;
	}

	/*
	 * swap_map is vmalloc'ed; continuation pages hang off the lru list
	 * of the page that holds this part of the map. From here on offset
	 * is the byte position within that page.
	 */
	head = vmalloc_to_page(si->swap_map + offset);
	offset &= ~PAGE_MASK;

	/*
	 * First continuation for this map page: initialise the list head
	 * and mark both the page and the swap area as continued.
	 */
	if (!page_private(head)) {
		BUG_ON(count & COUNT_CONTINUED);
		INIT_LIST_HEAD(&head->lru);
		set_page_private(head, SWP_CONTINUED);
		si->flags |= SWP_CONTINUED;
	}

	list_for_each_entry(list_page, &head->lru, lru) {
		unsigned char *map;

		/*
		 * The previous level does not continue onto this page, so
		 * this existing page is free to take the next carry.
		 */
		if (!(count & COUNT_CONTINUED))
			goto out;

		map = kmap_atomic(list_page) + offset;
		count = *map;
		kunmap_atomic(map);

		/*
		 * This continuation byte is not full yet, so there is room
		 * without adding another page.
		 */
		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
			goto out;
	}

	list_add_tail(&page->lru, &head->lru);
	page = NULL;			/* attached to the list: do not free */
out:
	spin_unlock(&si->lock);
outer:
	if (page)
		__free_page(page);
	return 0;
}
2503
2504
2505
2506
2507
2508
2509
2510
2511
/*
 * Carry an increment or decrement of a swap count into the continuation
 * pages of the swap_map page covering @offset.
 *
 * @count is the current swap_map count with SWAP_HAS_CACHE masked off:
 *   SWAP_MAP_MAX                     first increment beyond the map byte
 *   SWAP_MAP_MAX | COUNT_CONTINUED   increment with continuations in use
 *   COUNT_CONTINUED                  decrement borrowing from continuations
 *
 * Increment: returns true if the carry was stored, false if a further
 * continuation page has to be added first.
 * Decrement: returns true if continuation count remains afterwards, false
 * if the continuations are now empty.
 */
static bool swap_count_continued(struct swap_info_struct *si,
				 pgoff_t offset, unsigned char count)
{
	struct page *head;
	struct page *page;
	unsigned char *map;

	head = vmalloc_to_page(si->swap_map + offset);
	if (page_private(head) != SWP_CONTINUED) {
		BUG_ON(count & COUNT_CONTINUED);
		return false;		/* no continuation list yet */
	}

	/* From here on offset is the byte position inside each page. */
	offset &= ~PAGE_MASK;
	page = list_entry(head->lru.next, struct page, lru);
	map = kmap_atomic(page) + offset;

	if (count == SWAP_MAP_MAX)	/* first carry out of swap_map */
		goto init_map;		/* skip the SWAP_CONT_MAX checks */

	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
		/*
		 * Walk past levels that are full and themselves continued.
		 */
		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		if (*map == SWAP_CONT_MAX) {
			/* Full and last in use: move on to the next page. */
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			if (page == head)
				return false;	/* caller must add a page */
			map = kmap_atomic(page) + offset;
init_map:		*map = 0;		/* start this byte from zero */
		}
		*map += 1;
		kunmap_atomic(map);
		/* Reset every lower level to "zero, continued". */
		page = list_entry(page->lru.prev, struct page, lru);
		while (page != head) {
			map = kmap_atomic(page) + offset;
			*map = COUNT_CONTINUED;
			kunmap_atomic(map);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		return true;			/* incremented */

	} else {				/* decrementing */
		/*
		 * Find the first level that holds a non-zero count to
		 * borrow from.
		 */
		BUG_ON(count != COUNT_CONTINUED);
		while (*map == COUNT_CONTINUED) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		BUG_ON(*map == 0);
		*map -= 1;
		if (*map == 0)
			count = 0;
		kunmap_atomic(map);
		/*
		 * Refill the lower levels to their maximum. The level just
		 * below the borrowed one keeps COUNT_CONTINUED only if the
		 * borrowed byte is still non-zero; all others below it are
		 * marked continued.
		 */
		page = list_entry(page->lru.prev, struct page, lru);
		while (page != head) {
			map = kmap_atomic(page) + offset;
			*map = SWAP_CONT_MAX | count;
			count = COUNT_CONTINUED;
			kunmap_atomic(map);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		return count == COUNT_CONTINUED;
	}
}
2588
2589
2590
2591
2592
2593static void free_swap_count_continuations(struct swap_info_struct *si)
2594{
2595 pgoff_t offset;
2596
2597 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2598 struct page *head;
2599 head = vmalloc_to_page(si->swap_map + offset);
2600 if (page_private(head)) {
2601 struct list_head *this, *next;
2602 list_for_each_safe(this, next, &head->lru) {
2603 struct page *page;
2604 page = list_entry(this, struct page, lru);
2605 list_del(this);
2606 __free_page(page);
2607 }
2608 }
2609 }
2610}
2611