1
2
3
4
5
6
7
8#include <linux/mm.h>
9#include <linux/hugetlb.h>
10#include <linux/mman.h>
11#include <linux/slab.h>
12#include <linux/kernel_stat.h>
13#include <linux/swap.h>
14#include <linux/vmalloc.h>
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/shm.h>
18#include <linux/blkdev.h>
19#include <linux/random.h>
20#include <linux/writeback.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/init.h>
24#include <linux/module.h>
25#include <linux/ksm.h>
26#include <linux/rmap.h>
27#include <linux/security.h>
28#include <linux/backing-dev.h>
29#include <linux/mutex.h>
30#include <linux/capability.h>
31#include <linux/syscalls.h>
32#include <linux/memcontrol.h>
33#include <linux/poll.h>
34
35#include <asm/pgtable.h>
36#include <asm/tlbflush.h>
37#include <linux/swapops.h>
38#include <linux/page_cgroup.h>
39
40static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
41 unsigned char);
42static void free_swap_count_continuations(struct swap_info_struct *);
43static sector_t map_swap_entry(swp_entry_t, struct block_device**);
44
45static DEFINE_SPINLOCK(swap_lock);
46static unsigned int nr_swapfiles;
47long nr_swap_pages;
48long total_swap_pages;
49static int least_priority;
50
51static const char Bad_file[] = "Bad swap file entry ";
52static const char Unused_file[] = "Unused swap file entry ";
53static const char Bad_offset[] = "Bad swap offset entry ";
54static const char Unused_offset[] = "Unused swap offset entry ";
55
56static struct swap_list_t swap_list = {-1, -1};
57
58static struct swap_info_struct *swap_info[MAX_SWAPFILES];
59
60static DEFINE_MUTEX(swapon_mutex);
61
62static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
63
64static atomic_t proc_poll_event = ATOMIC_INIT(0);
65
66static inline unsigned char swap_count(unsigned char ent)
67{
68 return ent & ~SWAP_HAS_CACHE;
69}
70
71
72static int
73__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
74{
75 swp_entry_t entry = swp_entry(si->type, offset);
76 struct page *page;
77 int ret = 0;
78
79 page = find_get_page(&swapper_space, entry.val);
80 if (!page)
81 return 0;
82
83
84
85
86
87
88
89 if (trylock_page(page)) {
90 ret = try_to_free_swap(page);
91 unlock_page(page);
92 }
93 page_cache_release(page);
94 return ret;
95}
96
97
98
99
100
101static int discard_swap(struct swap_info_struct *si)
102{
103 struct swap_extent *se;
104 sector_t start_block;
105 sector_t nr_blocks;
106 int err = 0;
107
108
109 se = &si->first_swap_extent;
110 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
111 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
112 if (nr_blocks) {
113 err = blkdev_issue_discard(si->bdev, start_block,
114 nr_blocks, GFP_KERNEL, 0);
115 if (err)
116 return err;
117 cond_resched();
118 }
119
120 list_for_each_entry(se, &si->first_swap_extent.list, list) {
121 start_block = se->start_block << (PAGE_SHIFT - 9);
122 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
123
124 err = blkdev_issue_discard(si->bdev, start_block,
125 nr_blocks, GFP_KERNEL, 0);
126 if (err)
127 break;
128
129 cond_resched();
130 }
131 return err;
132}
133
134
135
136
137
138static void discard_swap_cluster(struct swap_info_struct *si,
139 pgoff_t start_page, pgoff_t nr_pages)
140{
141 struct swap_extent *se = si->curr_swap_extent;
142 int found_extent = 0;
143
144 while (nr_pages) {
145 struct list_head *lh;
146
147 if (se->start_page <= start_page &&
148 start_page < se->start_page + se->nr_pages) {
149 pgoff_t offset = start_page - se->start_page;
150 sector_t start_block = se->start_block + offset;
151 sector_t nr_blocks = se->nr_pages - offset;
152
153 if (nr_blocks > nr_pages)
154 nr_blocks = nr_pages;
155 start_page += nr_blocks;
156 nr_pages -= nr_blocks;
157
158 if (!found_extent++)
159 si->curr_swap_extent = se;
160
161 start_block <<= PAGE_SHIFT - 9;
162 nr_blocks <<= PAGE_SHIFT - 9;
163 if (blkdev_issue_discard(si->bdev, start_block,
164 nr_blocks, GFP_NOIO, 0))
165 break;
166 }
167
168 lh = se->list.next;
169 se = list_entry(lh, struct swap_extent, list);
170 }
171}
172
/*
 * Action callback handed to wait_on_bit() by scan_swap_map(): yield the CPU
 * once and report success.  @word is not used.
 */
static int wait_for_discard(void *word)
{
	schedule();
	return 0;
}
178
179#define SWAPFILE_CLUSTER 256
180#define LATENCY_LIMIT 256
181
182static unsigned long scan_swap_map(struct swap_info_struct *si,
183 unsigned char usage)
184{
185 unsigned long offset;
186 unsigned long scan_base;
187 unsigned long last_in_cluster = 0;
188 int latency_ration = LATENCY_LIMIT;
189 int found_free_cluster = 0;
190
191
192
193
194
195
196
197
198
199
200
201
202 si->flags += SWP_SCANNING;
203 scan_base = offset = si->cluster_next;
204
205 if (unlikely(!si->cluster_nr--)) {
206 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
207 si->cluster_nr = SWAPFILE_CLUSTER - 1;
208 goto checks;
209 }
210 if (si->flags & SWP_DISCARDABLE) {
211
212
213
214
215
216
217
218 if (si->lowest_alloc)
219 goto checks;
220 si->lowest_alloc = si->max;
221 si->highest_alloc = 0;
222 }
223 spin_unlock(&swap_lock);
224
225
226
227
228
229
230
231
232
233 if (!(si->flags & SWP_SOLIDSTATE))
234 scan_base = offset = si->lowest_bit;
235 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
236
237
238 for (; last_in_cluster <= si->highest_bit; offset++) {
239 if (si->swap_map[offset])
240 last_in_cluster = offset + SWAPFILE_CLUSTER;
241 else if (offset == last_in_cluster) {
242 spin_lock(&swap_lock);
243 offset -= SWAPFILE_CLUSTER - 1;
244 si->cluster_next = offset;
245 si->cluster_nr = SWAPFILE_CLUSTER - 1;
246 found_free_cluster = 1;
247 goto checks;
248 }
249 if (unlikely(--latency_ration < 0)) {
250 cond_resched();
251 latency_ration = LATENCY_LIMIT;
252 }
253 }
254
255 offset = si->lowest_bit;
256 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
257
258
259 for (; last_in_cluster < scan_base; offset++) {
260 if (si->swap_map[offset])
261 last_in_cluster = offset + SWAPFILE_CLUSTER;
262 else if (offset == last_in_cluster) {
263 spin_lock(&swap_lock);
264 offset -= SWAPFILE_CLUSTER - 1;
265 si->cluster_next = offset;
266 si->cluster_nr = SWAPFILE_CLUSTER - 1;
267 found_free_cluster = 1;
268 goto checks;
269 }
270 if (unlikely(--latency_ration < 0)) {
271 cond_resched();
272 latency_ration = LATENCY_LIMIT;
273 }
274 }
275
276 offset = scan_base;
277 spin_lock(&swap_lock);
278 si->cluster_nr = SWAPFILE_CLUSTER - 1;
279 si->lowest_alloc = 0;
280 }
281
282checks:
283 if (!(si->flags & SWP_WRITEOK))
284 goto no_page;
285 if (!si->highest_bit)
286 goto no_page;
287 if (offset > si->highest_bit)
288 scan_base = offset = si->lowest_bit;
289
290
291 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
292 int swap_was_freed;
293 spin_unlock(&swap_lock);
294 swap_was_freed = __try_to_reclaim_swap(si, offset);
295 spin_lock(&swap_lock);
296
297 if (swap_was_freed)
298 goto checks;
299 goto scan;
300 }
301
302 if (si->swap_map[offset])
303 goto scan;
304
305 if (offset == si->lowest_bit)
306 si->lowest_bit++;
307 if (offset == si->highest_bit)
308 si->highest_bit--;
309 si->inuse_pages++;
310 if (si->inuse_pages == si->pages) {
311 si->lowest_bit = si->max;
312 si->highest_bit = 0;
313 }
314 si->swap_map[offset] = usage;
315 si->cluster_next = offset + 1;
316 si->flags -= SWP_SCANNING;
317
318 if (si->lowest_alloc) {
319
320
321
322
323 if (found_free_cluster) {
324
325
326
327
328
329
330
331 if (offset < si->highest_alloc &&
332 si->lowest_alloc <= last_in_cluster)
333 last_in_cluster = si->lowest_alloc - 1;
334 si->flags |= SWP_DISCARDING;
335 spin_unlock(&swap_lock);
336
337 if (offset < last_in_cluster)
338 discard_swap_cluster(si, offset,
339 last_in_cluster - offset + 1);
340
341 spin_lock(&swap_lock);
342 si->lowest_alloc = 0;
343 si->flags &= ~SWP_DISCARDING;
344
345 smp_mb();
346 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
347
348 } else if (si->flags & SWP_DISCARDING) {
349
350
351
352
353
354
355 spin_unlock(&swap_lock);
356 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
357 wait_for_discard, TASK_UNINTERRUPTIBLE);
358 spin_lock(&swap_lock);
359 } else {
360
361
362
363
364
365 if (offset < si->lowest_alloc)
366 si->lowest_alloc = offset;
367 if (offset > si->highest_alloc)
368 si->highest_alloc = offset;
369 }
370 }
371 return offset;
372
373scan:
374 spin_unlock(&swap_lock);
375 while (++offset <= si->highest_bit) {
376 if (!si->swap_map[offset]) {
377 spin_lock(&swap_lock);
378 goto checks;
379 }
380 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
381 spin_lock(&swap_lock);
382 goto checks;
383 }
384 if (unlikely(--latency_ration < 0)) {
385 cond_resched();
386 latency_ration = LATENCY_LIMIT;
387 }
388 }
389 offset = si->lowest_bit;
390 while (++offset < scan_base) {
391 if (!si->swap_map[offset]) {
392 spin_lock(&swap_lock);
393 goto checks;
394 }
395 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
396 spin_lock(&swap_lock);
397 goto checks;
398 }
399 if (unlikely(--latency_ration < 0)) {
400 cond_resched();
401 latency_ration = LATENCY_LIMIT;
402 }
403 }
404 spin_lock(&swap_lock);
405
406no_page:
407 si->flags -= SWP_SCANNING;
408 return 0;
409}
410
411swp_entry_t get_swap_page(void)
412{
413 struct swap_info_struct *si;
414 pgoff_t offset;
415 int type, next;
416 int wrapped = 0;
417
418 spin_lock(&swap_lock);
419 if (nr_swap_pages <= 0)
420 goto noswap;
421 nr_swap_pages--;
422
423 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
424 si = swap_info[type];
425 next = si->next;
426 if (next < 0 ||
427 (!wrapped && si->prio != swap_info[next]->prio)) {
428 next = swap_list.head;
429 wrapped++;
430 }
431
432 if (!si->highest_bit)
433 continue;
434 if (!(si->flags & SWP_WRITEOK))
435 continue;
436
437 swap_list.next = next;
438
439 offset = scan_swap_map(si, SWAP_HAS_CACHE);
440 if (offset) {
441 spin_unlock(&swap_lock);
442 return swp_entry(type, offset);
443 }
444 next = swap_list.next;
445 }
446
447 nr_swap_pages++;
448noswap:
449 spin_unlock(&swap_lock);
450 return (swp_entry_t) {0};
451}
452
453
454swp_entry_t get_swap_page_of_type(int type)
455{
456 struct swap_info_struct *si;
457 pgoff_t offset;
458
459 spin_lock(&swap_lock);
460 si = swap_info[type];
461 if (si && (si->flags & SWP_WRITEOK)) {
462 nr_swap_pages--;
463
464 offset = scan_swap_map(si, 1);
465 if (offset) {
466 spin_unlock(&swap_lock);
467 return swp_entry(type, offset);
468 }
469 nr_swap_pages++;
470 }
471 spin_unlock(&swap_lock);
472 return (swp_entry_t) {0};
473}
474
475static struct swap_info_struct *swap_info_get(swp_entry_t entry)
476{
477 struct swap_info_struct *p;
478 unsigned long offset, type;
479
480 if (!entry.val)
481 goto out;
482 type = swp_type(entry);
483 if (type >= nr_swapfiles)
484 goto bad_nofile;
485 p = swap_info[type];
486 if (!(p->flags & SWP_USED))
487 goto bad_device;
488 offset = swp_offset(entry);
489 if (offset >= p->max)
490 goto bad_offset;
491 if (!p->swap_map[offset])
492 goto bad_free;
493 spin_lock(&swap_lock);
494 return p;
495
496bad_free:
497 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
498 goto out;
499bad_offset:
500 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
501 goto out;
502bad_device:
503 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
504 goto out;
505bad_nofile:
506 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
507out:
508 return NULL;
509}
510
511static unsigned char swap_entry_free(struct swap_info_struct *p,
512 swp_entry_t entry, unsigned char usage)
513{
514 unsigned long offset = swp_offset(entry);
515 unsigned char count;
516 unsigned char has_cache;
517
518 count = p->swap_map[offset];
519 has_cache = count & SWAP_HAS_CACHE;
520 count &= ~SWAP_HAS_CACHE;
521
522 if (usage == SWAP_HAS_CACHE) {
523 VM_BUG_ON(!has_cache);
524 has_cache = 0;
525 } else if (count == SWAP_MAP_SHMEM) {
526
527
528
529
530 count = 0;
531 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
532 if (count == COUNT_CONTINUED) {
533 if (swap_count_continued(p, offset, count))
534 count = SWAP_MAP_MAX | COUNT_CONTINUED;
535 else
536 count = SWAP_MAP_MAX;
537 } else
538 count--;
539 }
540
541 if (!count)
542 mem_cgroup_uncharge_swap(entry);
543
544 usage = count | has_cache;
545 p->swap_map[offset] = usage;
546
547
548 if (!usage) {
549 struct gendisk *disk = p->bdev->bd_disk;
550 if (offset < p->lowest_bit)
551 p->lowest_bit = offset;
552 if (offset > p->highest_bit)
553 p->highest_bit = offset;
554 if (swap_list.next >= 0 &&
555 p->prio > swap_info[swap_list.next]->prio)
556 swap_list.next = p->type;
557 nr_swap_pages++;
558 p->inuse_pages--;
559 if ((p->flags & SWP_BLKDEV) &&
560 disk->fops->swap_slot_free_notify)
561 disk->fops->swap_slot_free_notify(p->bdev, offset);
562 }
563
564 return usage;
565}
566
567
568
569
570
571void swap_free(swp_entry_t entry)
572{
573 struct swap_info_struct *p;
574
575 p = swap_info_get(entry);
576 if (p) {
577 swap_entry_free(p, entry, 1);
578 spin_unlock(&swap_lock);
579 }
580}
581
582
583
584
585void swapcache_free(swp_entry_t entry, struct page *page)
586{
587 struct swap_info_struct *p;
588 unsigned char count;
589
590 p = swap_info_get(entry);
591 if (p) {
592 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
593 if (page)
594 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
595 spin_unlock(&swap_lock);
596 }
597}
598
599
600
601
602
603
604static inline int page_swapcount(struct page *page)
605{
606 int count = 0;
607 struct swap_info_struct *p;
608 swp_entry_t entry;
609
610 entry.val = page_private(page);
611 p = swap_info_get(entry);
612 if (p) {
613 count = swap_count(p->swap_map[swp_offset(entry)]);
614 spin_unlock(&swap_lock);
615 }
616 return count;
617}
618
619
620
621
622
623
624
/*
 * Report whether the locked @page has at most one user, counting both page
 * mappings and, for swap-cache pages, swap references.
 *
 * When the page is in the swap cache with exactly one user and is not under
 * writeback, it is also removed from the swap cache and marked dirty.
 * KSM pages always yield 0.
 */
int reuse_swap_page(struct page *page)
{
	int users;

	VM_BUG_ON(!PageLocked(page));
	if (unlikely(PageKsm(page)))
		return 0;

	users = page_mapcount(page);
	if (users > 1 || !PageSwapCache(page))
		return users <= 1;

	users += page_swapcount(page);
	if (users == 1 && !PageWriteback(page)) {
		delete_from_swap_cache(page);
		SetPageDirty(page);
	}
	return users <= 1;
}
642
643
644
645
646
647int try_to_free_swap(struct page *page)
648{
649 VM_BUG_ON(!PageLocked(page));
650
651 if (!PageSwapCache(page))
652 return 0;
653 if (PageWriteback(page))
654 return 0;
655 if (page_swapcount(page))
656 return 0;
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673 if (!(gfp_allowed_mask & __GFP_IO))
674 return 0;
675
676 delete_from_swap_cache(page);
677 SetPageDirty(page);
678 return 1;
679}
680
681
682
683
684
685int free_swap_and_cache(swp_entry_t entry)
686{
687 struct swap_info_struct *p;
688 struct page *page = NULL;
689
690 if (non_swap_entry(entry))
691 return 1;
692
693 p = swap_info_get(entry);
694 if (p) {
695 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
696 page = find_get_page(&swapper_space, entry.val);
697 if (page && !trylock_page(page)) {
698 page_cache_release(page);
699 page = NULL;
700 }
701 }
702 spin_unlock(&swap_lock);
703 }
704 if (page) {
705
706
707
708
709 if (PageSwapCache(page) && !PageWriteback(page) &&
710 (!page_mapped(page) || vm_swap_full())) {
711 delete_from_swap_cache(page);
712 SetPageDirty(page);
713 }
714 unlock_page(page);
715 page_cache_release(page);
716 }
717 return p != NULL;
718}
719
720#ifdef CONFIG_CGROUP_MEM_RES_CTLR
721
722
723
724
725
726
727
728
729
730
731int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
732{
733 struct page *page;
734 struct swap_info_struct *p;
735 int count = 0;
736
737 page = find_get_page(&swapper_space, ent.val);
738 if (page)
739 count += page_mapcount(page);
740 p = swap_info_get(ent);
741 if (p) {
742 count += swap_count(p->swap_map[swp_offset(ent)]);
743 spin_unlock(&swap_lock);
744 }
745
746 *pagep = page;
747 return count;
748}
749#endif
750
751#ifdef CONFIG_HIBERNATION
752
753
754
755
756
757
758
759
760int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
761{
762 struct block_device *bdev = NULL;
763 int type;
764
765 if (device)
766 bdev = bdget(device);
767
768 spin_lock(&swap_lock);
769 for (type = 0; type < nr_swapfiles; type++) {
770 struct swap_info_struct *sis = swap_info[type];
771
772 if (!(sis->flags & SWP_WRITEOK))
773 continue;
774
775 if (!bdev) {
776 if (bdev_p)
777 *bdev_p = bdgrab(sis->bdev);
778
779 spin_unlock(&swap_lock);
780 return type;
781 }
782 if (bdev == sis->bdev) {
783 struct swap_extent *se = &sis->first_swap_extent;
784
785 if (se->start_block == offset) {
786 if (bdev_p)
787 *bdev_p = bdgrab(sis->bdev);
788
789 spin_unlock(&swap_lock);
790 bdput(bdev);
791 return type;
792 }
793 }
794 }
795 spin_unlock(&swap_lock);
796 if (bdev)
797 bdput(bdev);
798
799 return -ENODEV;
800}
801
802
803
804
805
806sector_t swapdev_block(int type, pgoff_t offset)
807{
808 struct block_device *bdev;
809
810 if ((unsigned int)type >= nr_swapfiles)
811 return 0;
812 if (!(swap_info[type]->flags & SWP_WRITEOK))
813 return 0;
814 return map_swap_entry(swp_entry(type, offset), &bdev);
815}
816
817
818
819
820
821
822
823unsigned int count_swap_pages(int type, int free)
824{
825 unsigned int n = 0;
826
827 spin_lock(&swap_lock);
828 if ((unsigned int)type < nr_swapfiles) {
829 struct swap_info_struct *sis = swap_info[type];
830
831 if (sis->flags & SWP_WRITEOK) {
832 n = sis->pages;
833 if (free)
834 n -= sis->inuse_pages;
835 }
836 }
837 spin_unlock(&swap_lock);
838 return n;
839}
840#endif
841
842
843
844
845
846
847static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
848 unsigned long addr, swp_entry_t entry, struct page *page)
849{
850 struct mem_cgroup *ptr;
851 spinlock_t *ptl;
852 pte_t *pte;
853 int ret = 1;
854
855 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
856 ret = -ENOMEM;
857 goto out_nolock;
858 }
859
860 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
861 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
862 if (ret > 0)
863 mem_cgroup_cancel_charge_swapin(ptr);
864 ret = 0;
865 goto out;
866 }
867
868 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
869 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
870 get_page(page);
871 set_pte_at(vma->vm_mm, addr, pte,
872 pte_mkold(mk_pte(page, vma->vm_page_prot)));
873 page_add_anon_rmap(page, vma, addr);
874 mem_cgroup_commit_charge_swapin(page, ptr);
875 swap_free(entry);
876
877
878
879
880 activate_page(page);
881out:
882 pte_unmap_unlock(pte, ptl);
883out_nolock:
884 return ret;
885}
886
887static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
888 unsigned long addr, unsigned long end,
889 swp_entry_t entry, struct page *page)
890{
891 pte_t swp_pte = swp_entry_to_pte(entry);
892 pte_t *pte;
893 int ret = 0;
894
895
896
897
898
899
900
901
902
903
904 pte = pte_offset_map(pmd, addr);
905 do {
906
907
908
909
910 if (unlikely(pte_same(*pte, swp_pte))) {
911 pte_unmap(pte);
912 ret = unuse_pte(vma, pmd, addr, entry, page);
913 if (ret)
914 goto out;
915 pte = pte_offset_map(pmd, addr);
916 }
917 } while (pte++, addr += PAGE_SIZE, addr != end);
918 pte_unmap(pte - 1);
919out:
920 return ret;
921}
922
923static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
924 unsigned long addr, unsigned long end,
925 swp_entry_t entry, struct page *page)
926{
927 pmd_t *pmd;
928 unsigned long next;
929 int ret;
930
931 pmd = pmd_offset(pud, addr);
932 do {
933 next = pmd_addr_end(addr, end);
934 if (unlikely(pmd_trans_huge(*pmd)))
935 continue;
936 if (pmd_none_or_clear_bad(pmd))
937 continue;
938 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
939 if (ret)
940 return ret;
941 } while (pmd++, addr = next, addr != end);
942 return 0;
943}
944
945static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
946 unsigned long addr, unsigned long end,
947 swp_entry_t entry, struct page *page)
948{
949 pud_t *pud;
950 unsigned long next;
951 int ret;
952
953 pud = pud_offset(pgd, addr);
954 do {
955 next = pud_addr_end(addr, end);
956 if (pud_none_or_clear_bad(pud))
957 continue;
958 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
959 if (ret)
960 return ret;
961 } while (pud++, addr = next, addr != end);
962 return 0;
963}
964
965static int unuse_vma(struct vm_area_struct *vma,
966 swp_entry_t entry, struct page *page)
967{
968 pgd_t *pgd;
969 unsigned long addr, end, next;
970 int ret;
971
972 if (page_anon_vma(page)) {
973 addr = page_address_in_vma(page, vma);
974 if (addr == -EFAULT)
975 return 0;
976 else
977 end = addr + PAGE_SIZE;
978 } else {
979 addr = vma->vm_start;
980 end = vma->vm_end;
981 }
982
983 pgd = pgd_offset(vma->vm_mm, addr);
984 do {
985 next = pgd_addr_end(addr, end);
986 if (pgd_none_or_clear_bad(pgd))
987 continue;
988 ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
989 if (ret)
990 return ret;
991 } while (pgd++, addr = next, addr != end);
992 return 0;
993}
994
995static int unuse_mm(struct mm_struct *mm,
996 swp_entry_t entry, struct page *page)
997{
998 struct vm_area_struct *vma;
999 int ret = 0;
1000
1001 if (!down_read_trylock(&mm->mmap_sem)) {
1002
1003
1004
1005
1006 activate_page(page);
1007 unlock_page(page);
1008 down_read(&mm->mmap_sem);
1009 lock_page(page);
1010 }
1011 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1012 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1013 break;
1014 }
1015 up_read(&mm->mmap_sem);
1016 return (ret < 0)? ret: 0;
1017}
1018
1019
1020
1021
1022
1023static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1024 unsigned int prev)
1025{
1026 unsigned int max = si->max;
1027 unsigned int i = prev;
1028 unsigned char count;
1029
1030
1031
1032
1033
1034
1035
1036 for (;;) {
1037 if (++i >= max) {
1038 if (!prev) {
1039 i = 0;
1040 break;
1041 }
1042
1043
1044
1045
1046 max = prev + 1;
1047 prev = 0;
1048 i = 1;
1049 }
1050 count = si->swap_map[i];
1051 if (count && swap_count(count) != SWAP_MAP_BAD)
1052 break;
1053 }
1054 return i;
1055}
1056
1057
1058
1059
1060
1061
1062static int try_to_unuse(unsigned int type)
1063{
1064 struct swap_info_struct *si = swap_info[type];
1065 struct mm_struct *start_mm;
1066 unsigned char *swap_map;
1067 unsigned char swcount;
1068 struct page *page;
1069 swp_entry_t entry;
1070 unsigned int i = 0;
1071 int retval = 0;
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087 start_mm = &init_mm;
1088 atomic_inc(&init_mm.mm_users);
1089
1090
1091
1092
1093
1094
1095 while ((i = find_next_to_unuse(si, i)) != 0) {
1096 if (signal_pending(current)) {
1097 retval = -EINTR;
1098 break;
1099 }
1100
1101
1102
1103
1104
1105
1106 swap_map = &si->swap_map[i];
1107 entry = swp_entry(type, i);
1108 page = read_swap_cache_async(entry,
1109 GFP_HIGHUSER_MOVABLE, NULL, 0);
1110 if (!page) {
1111
1112
1113
1114
1115
1116
1117 if (!*swap_map)
1118 continue;
1119 retval = -ENOMEM;
1120 break;
1121 }
1122
1123
1124
1125
1126 if (atomic_read(&start_mm->mm_users) == 1) {
1127 mmput(start_mm);
1128 start_mm = &init_mm;
1129 atomic_inc(&init_mm.mm_users);
1130 }
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140 wait_on_page_locked(page);
1141 wait_on_page_writeback(page);
1142 lock_page(page);
1143 wait_on_page_writeback(page);
1144
1145
1146
1147
1148 swcount = *swap_map;
1149 if (swap_count(swcount) == SWAP_MAP_SHMEM) {
1150 retval = shmem_unuse(entry, page);
1151
1152 if (retval < 0)
1153 break;
1154 continue;
1155 }
1156 if (swap_count(swcount) && start_mm != &init_mm)
1157 retval = unuse_mm(start_mm, entry, page);
1158
1159 if (swap_count(*swap_map)) {
1160 int set_start_mm = (*swap_map >= swcount);
1161 struct list_head *p = &start_mm->mmlist;
1162 struct mm_struct *new_start_mm = start_mm;
1163 struct mm_struct *prev_mm = start_mm;
1164 struct mm_struct *mm;
1165
1166 atomic_inc(&new_start_mm->mm_users);
1167 atomic_inc(&prev_mm->mm_users);
1168 spin_lock(&mmlist_lock);
1169 while (swap_count(*swap_map) && !retval &&
1170 (p = p->next) != &start_mm->mmlist) {
1171 mm = list_entry(p, struct mm_struct, mmlist);
1172 if (!atomic_inc_not_zero(&mm->mm_users))
1173 continue;
1174 spin_unlock(&mmlist_lock);
1175 mmput(prev_mm);
1176 prev_mm = mm;
1177
1178 cond_resched();
1179
1180 swcount = *swap_map;
1181 if (!swap_count(swcount))
1182 ;
1183 else if (mm == &init_mm)
1184 set_start_mm = 1;
1185 else
1186 retval = unuse_mm(mm, entry, page);
1187
1188 if (set_start_mm && *swap_map < swcount) {
1189 mmput(new_start_mm);
1190 atomic_inc(&mm->mm_users);
1191 new_start_mm = mm;
1192 set_start_mm = 0;
1193 }
1194 spin_lock(&mmlist_lock);
1195 }
1196 spin_unlock(&mmlist_lock);
1197 mmput(prev_mm);
1198 mmput(start_mm);
1199 start_mm = new_start_mm;
1200 }
1201 if (retval) {
1202 unlock_page(page);
1203 page_cache_release(page);
1204 break;
1205 }
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226 if (swap_count(*swap_map) &&
1227 PageDirty(page) && PageSwapCache(page)) {
1228 struct writeback_control wbc = {
1229 .sync_mode = WB_SYNC_NONE,
1230 };
1231
1232 swap_writepage(page, &wbc);
1233 lock_page(page);
1234 wait_on_page_writeback(page);
1235 }
1236
1237
1238
1239
1240
1241
1242
1243
1244 if (PageSwapCache(page) &&
1245 likely(page_private(page) == entry.val))
1246 delete_from_swap_cache(page);
1247
1248
1249
1250
1251
1252
1253 SetPageDirty(page);
1254 unlock_page(page);
1255 page_cache_release(page);
1256
1257
1258
1259
1260
1261 cond_resched();
1262 }
1263
1264 mmput(start_mm);
1265 return retval;
1266}
1267
1268
1269
1270
1271
1272
1273
1274static void drain_mmlist(void)
1275{
1276 struct list_head *p, *next;
1277 unsigned int type;
1278
1279 for (type = 0; type < nr_swapfiles; type++)
1280 if (swap_info[type]->inuse_pages)
1281 return;
1282 spin_lock(&mmlist_lock);
1283 list_for_each_safe(p, next, &init_mm.mmlist)
1284 list_del_init(p);
1285 spin_unlock(&mmlist_lock);
1286}
1287
1288
1289
1290
1291
1292
1293
1294static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1295{
1296 struct swap_info_struct *sis;
1297 struct swap_extent *start_se;
1298 struct swap_extent *se;
1299 pgoff_t offset;
1300
1301 sis = swap_info[swp_type(entry)];
1302 *bdev = sis->bdev;
1303
1304 offset = swp_offset(entry);
1305 start_se = sis->curr_swap_extent;
1306 se = start_se;
1307
1308 for ( ; ; ) {
1309 struct list_head *lh;
1310
1311 if (se->start_page <= offset &&
1312 offset < (se->start_page + se->nr_pages)) {
1313 return se->start_block + (offset - se->start_page);
1314 }
1315 lh = se->list.next;
1316 se = list_entry(lh, struct swap_extent, list);
1317 sis->curr_swap_extent = se;
1318 BUG_ON(se == start_se);
1319 }
1320}
1321
1322
1323
1324
1325sector_t map_swap_page(struct page *page, struct block_device **bdev)
1326{
1327 swp_entry_t entry;
1328 entry.val = page_private(page);
1329 return map_swap_entry(entry, bdev);
1330}
1331
1332
1333
1334
1335static void destroy_swap_extents(struct swap_info_struct *sis)
1336{
1337 while (!list_empty(&sis->first_swap_extent.list)) {
1338 struct swap_extent *se;
1339
1340 se = list_entry(sis->first_swap_extent.list.next,
1341 struct swap_extent, list);
1342 list_del(&se->list);
1343 kfree(se);
1344 }
1345}
1346
1347
1348
1349
1350
1351
1352
1353static int
1354add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1355 unsigned long nr_pages, sector_t start_block)
1356{
1357 struct swap_extent *se;
1358 struct swap_extent *new_se;
1359 struct list_head *lh;
1360
1361 if (start_page == 0) {
1362 se = &sis->first_swap_extent;
1363 sis->curr_swap_extent = se;
1364 se->start_page = 0;
1365 se->nr_pages = nr_pages;
1366 se->start_block = start_block;
1367 return 1;
1368 } else {
1369 lh = sis->first_swap_extent.list.prev;
1370 se = list_entry(lh, struct swap_extent, list);
1371 BUG_ON(se->start_page + se->nr_pages != start_page);
1372 if (se->start_block + se->nr_pages == start_block) {
1373
1374 se->nr_pages += nr_pages;
1375 return 0;
1376 }
1377 }
1378
1379
1380
1381
1382 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
1383 if (new_se == NULL)
1384 return -ENOMEM;
1385 new_se->start_page = start_page;
1386 new_se->nr_pages = nr_pages;
1387 new_se->start_block = start_block;
1388
1389 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1390 return 1;
1391}
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1425{
1426 struct inode *inode;
1427 unsigned blocks_per_page;
1428 unsigned long page_no;
1429 unsigned blkbits;
1430 sector_t probe_block;
1431 sector_t last_block;
1432 sector_t lowest_block = -1;
1433 sector_t highest_block = 0;
1434 int nr_extents = 0;
1435 int ret;
1436
1437 inode = sis->swap_file->f_mapping->host;
1438 if (S_ISBLK(inode->i_mode)) {
1439 ret = add_swap_extent(sis, 0, sis->max, 0);
1440 *span = sis->pages;
1441 goto out;
1442 }
1443
1444 blkbits = inode->i_blkbits;
1445 blocks_per_page = PAGE_SIZE >> blkbits;
1446
1447
1448
1449
1450
1451 probe_block = 0;
1452 page_no = 0;
1453 last_block = i_size_read(inode) >> blkbits;
1454 while ((probe_block + blocks_per_page) <= last_block &&
1455 page_no < sis->max) {
1456 unsigned block_in_page;
1457 sector_t first_block;
1458
1459 first_block = bmap(inode, probe_block);
1460 if (first_block == 0)
1461 goto bad_bmap;
1462
1463
1464
1465
1466 if (first_block & (blocks_per_page - 1)) {
1467 probe_block++;
1468 goto reprobe;
1469 }
1470
1471 for (block_in_page = 1; block_in_page < blocks_per_page;
1472 block_in_page++) {
1473 sector_t block;
1474
1475 block = bmap(inode, probe_block + block_in_page);
1476 if (block == 0)
1477 goto bad_bmap;
1478 if (block != first_block + block_in_page) {
1479
1480 probe_block++;
1481 goto reprobe;
1482 }
1483 }
1484
1485 first_block >>= (PAGE_SHIFT - blkbits);
1486 if (page_no) {
1487 if (first_block < lowest_block)
1488 lowest_block = first_block;
1489 if (first_block > highest_block)
1490 highest_block = first_block;
1491 }
1492
1493
1494
1495
1496 ret = add_swap_extent(sis, page_no, 1, first_block);
1497 if (ret < 0)
1498 goto out;
1499 nr_extents += ret;
1500 page_no++;
1501 probe_block += blocks_per_page;
1502reprobe:
1503 continue;
1504 }
1505 ret = nr_extents;
1506 *span = 1 + highest_block - lowest_block;
1507 if (page_no == 0)
1508 page_no = 1;
1509 sis->max = page_no;
1510 sis->pages = page_no - 1;
1511 sis->highest_bit = page_no - 1;
1512out:
1513 return ret;
1514bad_bmap:
1515 printk(KERN_ERR "swapon: swapfile has holes\n");
1516 ret = -EINVAL;
1517 goto out;
1518}
1519
1520static void enable_swap_info(struct swap_info_struct *p, int prio,
1521 unsigned char *swap_map)
1522{
1523 int i, prev;
1524
1525 spin_lock(&swap_lock);
1526 if (prio >= 0)
1527 p->prio = prio;
1528 else
1529 p->prio = --least_priority;
1530 p->swap_map = swap_map;
1531 p->flags |= SWP_WRITEOK;
1532 nr_swap_pages += p->pages;
1533 total_swap_pages += p->pages;
1534
1535
1536 prev = -1;
1537 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1538 if (p->prio >= swap_info[i]->prio)
1539 break;
1540 prev = i;
1541 }
1542 p->next = i;
1543 if (prev < 0)
1544 swap_list.head = swap_list.next = p->type;
1545 else
1546 swap_info[prev]->next = p->type;
1547 spin_unlock(&swap_lock);
1548}
1549
1550SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1551{
1552 struct swap_info_struct *p = NULL;
1553 unsigned char *swap_map;
1554 struct file *swap_file, *victim;
1555 struct address_space *mapping;
1556 struct inode *inode;
1557 char *pathname;
1558 int i, type, prev;
1559 int err;
1560
1561 if (!capable(CAP_SYS_ADMIN))
1562 return -EPERM;
1563
1564 pathname = getname(specialfile);
1565 err = PTR_ERR(pathname);
1566 if (IS_ERR(pathname))
1567 goto out;
1568
1569 victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
1570 putname(pathname);
1571 err = PTR_ERR(victim);
1572 if (IS_ERR(victim))
1573 goto out;
1574
1575 mapping = victim->f_mapping;
1576 prev = -1;
1577 spin_lock(&swap_lock);
1578 for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
1579 p = swap_info[type];
1580 if (p->flags & SWP_WRITEOK) {
1581 if (p->swap_file->f_mapping == mapping)
1582 break;
1583 }
1584 prev = type;
1585 }
1586 if (type < 0) {
1587 err = -EINVAL;
1588 spin_unlock(&swap_lock);
1589 goto out_dput;
1590 }
1591 if (!security_vm_enough_memory(p->pages))
1592 vm_unacct_memory(p->pages);
1593 else {
1594 err = -ENOMEM;
1595 spin_unlock(&swap_lock);
1596 goto out_dput;
1597 }
1598 if (prev < 0)
1599 swap_list.head = p->next;
1600 else
1601 swap_info[prev]->next = p->next;
1602 if (type == swap_list.next) {
1603
1604 swap_list.next = swap_list.head;
1605 }
1606 if (p->prio < 0) {
1607 for (i = p->next; i >= 0; i = swap_info[i]->next)
1608 swap_info[i]->prio = p->prio--;
1609 least_priority++;
1610 }
1611 nr_swap_pages -= p->pages;
1612 total_swap_pages -= p->pages;
1613 p->flags &= ~SWP_WRITEOK;
1614 spin_unlock(&swap_lock);
1615
1616 current->flags |= PF_OOM_ORIGIN;
1617 err = try_to_unuse(type);
1618 current->flags &= ~PF_OOM_ORIGIN;
1619
1620 if (err) {
1621
1622
1623
1624
1625
1626
1627
1628 enable_swap_info(p, p->prio, p->swap_map);
1629 goto out_dput;
1630 }
1631
1632 destroy_swap_extents(p);
1633 if (p->flags & SWP_CONTINUED)
1634 free_swap_count_continuations(p);
1635
1636 mutex_lock(&swapon_mutex);
1637 spin_lock(&swap_lock);
1638 drain_mmlist();
1639
1640
1641 p->highest_bit = 0;
1642 while (p->flags >= SWP_SCANNING) {
1643 spin_unlock(&swap_lock);
1644 schedule_timeout_uninterruptible(1);
1645 spin_lock(&swap_lock);
1646 }
1647
1648 swap_file = p->swap_file;
1649 p->swap_file = NULL;
1650 p->max = 0;
1651 swap_map = p->swap_map;
1652 p->swap_map = NULL;
1653 p->flags = 0;
1654 spin_unlock(&swap_lock);
1655 mutex_unlock(&swapon_mutex);
1656 vfree(swap_map);
1657
1658 swap_cgroup_swapoff(type);
1659
1660 inode = mapping->host;
1661 if (S_ISBLK(inode->i_mode)) {
1662 struct block_device *bdev = I_BDEV(inode);
1663 set_blocksize(bdev, p->old_block_size);
1664 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1665 } else {
1666 mutex_lock(&inode->i_mutex);
1667 inode->i_flags &= ~S_SWAPFILE;
1668 mutex_unlock(&inode->i_mutex);
1669 }
1670 filp_close(swap_file, NULL);
1671 err = 0;
1672 atomic_inc(&proc_poll_event);
1673 wake_up_interruptible(&proc_poll_wait);
1674
1675out_dput:
1676 filp_close(victim, NULL);
1677out:
1678 return err;
1679}
1680
1681#ifdef CONFIG_PROC_FS
1682struct proc_swaps {
1683 struct seq_file seq;
1684 int event;
1685};
1686
1687static unsigned swaps_poll(struct file *file, poll_table *wait)
1688{
1689 struct proc_swaps *s = file->private_data;
1690
1691 poll_wait(file, &proc_poll_wait, wait);
1692
1693 if (s->event != atomic_read(&proc_poll_event)) {
1694 s->event = atomic_read(&proc_poll_event);
1695 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1696 }
1697
1698 return POLLIN | POLLRDNORM;
1699}
1700
1701
1702static void *swap_start(struct seq_file *swap, loff_t *pos)
1703{
1704 struct swap_info_struct *si;
1705 int type;
1706 loff_t l = *pos;
1707
1708 mutex_lock(&swapon_mutex);
1709
1710 if (!l)
1711 return SEQ_START_TOKEN;
1712
1713 for (type = 0; type < nr_swapfiles; type++) {
1714 smp_rmb();
1715 si = swap_info[type];
1716 if (!(si->flags & SWP_USED) || !si->swap_map)
1717 continue;
1718 if (!--l)
1719 return si;
1720 }
1721
1722 return NULL;
1723}
1724
1725static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1726{
1727 struct swap_info_struct *si = v;
1728 int type;
1729
1730 if (v == SEQ_START_TOKEN)
1731 type = 0;
1732 else
1733 type = si->type + 1;
1734
1735 for (; type < nr_swapfiles; type++) {
1736 smp_rmb();
1737 si = swap_info[type];
1738 if (!(si->flags & SWP_USED) || !si->swap_map)
1739 continue;
1740 ++*pos;
1741 return si;
1742 }
1743
1744 return NULL;
1745}
1746
1747static void swap_stop(struct seq_file *swap, void *v)
1748{
1749 mutex_unlock(&swapon_mutex);
1750}
1751
1752static int swap_show(struct seq_file *swap, void *v)
1753{
1754 struct swap_info_struct *si = v;
1755 struct file *file;
1756 int len;
1757
1758 if (si == SEQ_START_TOKEN) {
1759 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1760 return 0;
1761 }
1762
1763 file = si->swap_file;
1764 len = seq_path(swap, &file->f_path, " \t\n\\");
1765 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1766 len < 40 ? 40 - len : 1, " ",
1767 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1768 "partition" : "file\t",
1769 si->pages << (PAGE_SHIFT - 10),
1770 si->inuse_pages << (PAGE_SHIFT - 10),
1771 si->prio);
1772 return 0;
1773}
1774
1775static const struct seq_operations swaps_op = {
1776 .start = swap_start,
1777 .next = swap_next,
1778 .stop = swap_stop,
1779 .show = swap_show
1780};
1781
1782static int swaps_open(struct inode *inode, struct file *file)
1783{
1784 struct proc_swaps *s;
1785 int ret;
1786
1787 s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
1788 if (!s)
1789 return -ENOMEM;
1790
1791 file->private_data = s;
1792
1793 ret = seq_open(file, &swaps_op);
1794 if (ret) {
1795 kfree(s);
1796 return ret;
1797 }
1798
1799 s->seq.private = s;
1800 s->event = atomic_read(&proc_poll_event);
1801 return ret;
1802}
1803
1804static const struct file_operations proc_swaps_operations = {
1805 .open = swaps_open,
1806 .read = seq_read,
1807 .llseek = seq_lseek,
1808 .release = seq_release,
1809 .poll = swaps_poll,
1810};
1811
1812static int __init procswaps_init(void)
1813{
1814 proc_create("swaps", 0, NULL, &proc_swaps_operations);
1815 return 0;
1816}
1817__initcall(procswaps_init);
1818#endif
1819
1820#ifdef MAX_SWAPFILES_CHECK
1821static int __init max_swapfiles_check(void)
1822{
1823 MAX_SWAPFILES_CHECK();
1824 return 0;
1825}
1826late_initcall(max_swapfiles_check);
1827#endif
1828
1829static struct swap_info_struct *alloc_swap_info(void)
1830{
1831 struct swap_info_struct *p;
1832 unsigned int type;
1833
1834 p = kzalloc(sizeof(*p), GFP_KERNEL);
1835 if (!p)
1836 return ERR_PTR(-ENOMEM);
1837
1838 spin_lock(&swap_lock);
1839 for (type = 0; type < nr_swapfiles; type++) {
1840 if (!(swap_info[type]->flags & SWP_USED))
1841 break;
1842 }
1843 if (type >= MAX_SWAPFILES) {
1844 spin_unlock(&swap_lock);
1845 kfree(p);
1846 return ERR_PTR(-EPERM);
1847 }
1848 if (type >= nr_swapfiles) {
1849 p->type = type;
1850 swap_info[type] = p;
1851
1852
1853
1854
1855
1856 smp_wmb();
1857 nr_swapfiles++;
1858 } else {
1859 kfree(p);
1860 p = swap_info[type];
1861
1862
1863
1864
1865 }
1866 INIT_LIST_HEAD(&p->first_swap_extent.list);
1867 p->flags = SWP_USED;
1868 p->next = -1;
1869 spin_unlock(&swap_lock);
1870
1871 return p;
1872}
1873
1874static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
1875{
1876 int error;
1877
1878 if (S_ISBLK(inode->i_mode)) {
1879 p->bdev = bdgrab(I_BDEV(inode));
1880 error = blkdev_get(p->bdev,
1881 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1882 sys_swapon);
1883 if (error < 0) {
1884 p->bdev = NULL;
1885 return -EINVAL;
1886 }
1887 p->old_block_size = block_size(p->bdev);
1888 error = set_blocksize(p->bdev, PAGE_SIZE);
1889 if (error < 0)
1890 return error;
1891 p->flags |= SWP_BLKDEV;
1892 } else if (S_ISREG(inode->i_mode)) {
1893 p->bdev = inode->i_sb->s_bdev;
1894 mutex_lock(&inode->i_mutex);
1895 if (IS_SWAPFILE(inode))
1896 return -EBUSY;
1897 } else
1898 return -EINVAL;
1899
1900 return 0;
1901}
1902
1903static unsigned long read_swap_header(struct swap_info_struct *p,
1904 union swap_header *swap_header,
1905 struct inode *inode)
1906{
1907 int i;
1908 unsigned long maxpages;
1909 unsigned long swapfilepages;
1910
1911 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1912 printk(KERN_ERR "Unable to find swap-space signature\n");
1913 return 0;
1914 }
1915
1916
1917 if (swab32(swap_header->info.version) == 1) {
1918 swab32s(&swap_header->info.version);
1919 swab32s(&swap_header->info.last_page);
1920 swab32s(&swap_header->info.nr_badpages);
1921 for (i = 0; i < swap_header->info.nr_badpages; i++)
1922 swab32s(&swap_header->info.badpages[i]);
1923 }
1924
1925 if (swap_header->info.version != 1) {
1926 printk(KERN_WARNING
1927 "Unable to handle swap header version %d\n",
1928 swap_header->info.version);
1929 return 0;
1930 }
1931
1932 p->lowest_bit = 1;
1933 p->cluster_next = 1;
1934 p->cluster_nr = 0;
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950 maxpages = swp_offset(pte_to_swp_entry(
1951 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1952 if (maxpages > swap_header->info.last_page) {
1953 maxpages = swap_header->info.last_page + 1;
1954
1955 if ((unsigned int)maxpages == 0)
1956 maxpages = UINT_MAX;
1957 }
1958 p->highest_bit = maxpages - 1;
1959
1960 if (!maxpages)
1961 return 0;
1962 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1963 if (swapfilepages && maxpages > swapfilepages) {
1964 printk(KERN_WARNING
1965 "Swap area shorter than signature indicates\n");
1966 return 0;
1967 }
1968 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1969 return 0;
1970 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1971 return 0;
1972
1973 return maxpages;
1974}
1975
1976static int setup_swap_map_and_extents(struct swap_info_struct *p,
1977 union swap_header *swap_header,
1978 unsigned char *swap_map,
1979 unsigned long maxpages,
1980 sector_t *span)
1981{
1982 int i;
1983 unsigned int nr_good_pages;
1984 int nr_extents;
1985
1986 nr_good_pages = maxpages - 1;
1987
1988 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1989 unsigned int page_nr = swap_header->info.badpages[i];
1990 if (page_nr == 0 || page_nr > swap_header->info.last_page)
1991 return -EINVAL;
1992 if (page_nr < maxpages) {
1993 swap_map[page_nr] = SWAP_MAP_BAD;
1994 nr_good_pages--;
1995 }
1996 }
1997
1998 if (nr_good_pages) {
1999 swap_map[0] = SWAP_MAP_BAD;
2000 p->max = maxpages;
2001 p->pages = nr_good_pages;
2002 nr_extents = setup_swap_extents(p, span);
2003 if (nr_extents < 0)
2004 return nr_extents;
2005 nr_good_pages = p->pages;
2006 }
2007 if (!nr_good_pages) {
2008 printk(KERN_WARNING "Empty swap-file\n");
2009 return -EINVAL;
2010 }
2011
2012 return nr_extents;
2013}
2014
2015SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2016{
2017 struct swap_info_struct *p;
2018 char *name;
2019 struct file *swap_file = NULL;
2020 struct address_space *mapping;
2021 int i;
2022 int prio;
2023 int error;
2024 union swap_header *swap_header;
2025 int nr_extents;
2026 sector_t span;
2027 unsigned long maxpages;
2028 unsigned char *swap_map = NULL;
2029 struct page *page = NULL;
2030 struct inode *inode = NULL;
2031
2032 if (!capable(CAP_SYS_ADMIN))
2033 return -EPERM;
2034
2035 p = alloc_swap_info();
2036 if (IS_ERR(p))
2037 return PTR_ERR(p);
2038
2039 name = getname(specialfile);
2040 if (IS_ERR(name)) {
2041 error = PTR_ERR(name);
2042 name = NULL;
2043 goto bad_swap;
2044 }
2045 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
2046 if (IS_ERR(swap_file)) {
2047 error = PTR_ERR(swap_file);
2048 swap_file = NULL;
2049 goto bad_swap;
2050 }
2051
2052 p->swap_file = swap_file;
2053 mapping = swap_file->f_mapping;
2054
2055 for (i = 0; i < nr_swapfiles; i++) {
2056 struct swap_info_struct *q = swap_info[i];
2057
2058 if (q == p || !q->swap_file)
2059 continue;
2060 if (mapping == q->swap_file->f_mapping) {
2061 error = -EBUSY;
2062 goto bad_swap;
2063 }
2064 }
2065
2066 inode = mapping->host;
2067
2068 error = claim_swapfile(p, inode);
2069 if (unlikely(error))
2070 goto bad_swap;
2071
2072
2073
2074
2075 if (!mapping->a_ops->readpage) {
2076 error = -EINVAL;
2077 goto bad_swap;
2078 }
2079 page = read_mapping_page(mapping, 0, swap_file);
2080 if (IS_ERR(page)) {
2081 error = PTR_ERR(page);
2082 goto bad_swap;
2083 }
2084 swap_header = kmap(page);
2085
2086 maxpages = read_swap_header(p, swap_header, inode);
2087 if (unlikely(!maxpages)) {
2088 error = -EINVAL;
2089 goto bad_swap;
2090 }
2091
2092
2093 swap_map = vzalloc(maxpages);
2094 if (!swap_map) {
2095 error = -ENOMEM;
2096 goto bad_swap;
2097 }
2098
2099 error = swap_cgroup_swapon(p->type, maxpages);
2100 if (error)
2101 goto bad_swap;
2102
2103 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2104 maxpages, &span);
2105 if (unlikely(nr_extents < 0)) {
2106 error = nr_extents;
2107 goto bad_swap;
2108 }
2109
2110 if (p->bdev) {
2111 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2112 p->flags |= SWP_SOLIDSTATE;
2113 p->cluster_next = 1 + (random32() % p->highest_bit);
2114 }
2115 if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
2116 p->flags |= SWP_DISCARDABLE;
2117 }
2118
2119 mutex_lock(&swapon_mutex);
2120 prio = -1;
2121 if (swap_flags & SWAP_FLAG_PREFER)
2122 prio =
2123 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2124 enable_swap_info(p, prio, swap_map);
2125
2126 printk(KERN_INFO "Adding %uk swap on %s. "
2127 "Priority:%d extents:%d across:%lluk %s%s\n",
2128 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2129 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2130 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2131 (p->flags & SWP_DISCARDABLE) ? "D" : "");
2132
2133 mutex_unlock(&swapon_mutex);
2134 atomic_inc(&proc_poll_event);
2135 wake_up_interruptible(&proc_poll_wait);
2136
2137 if (S_ISREG(inode->i_mode))
2138 inode->i_flags |= S_SWAPFILE;
2139 error = 0;
2140 goto out;
2141bad_swap:
2142 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2143 set_blocksize(p->bdev, p->old_block_size);
2144 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2145 }
2146 destroy_swap_extents(p);
2147 swap_cgroup_swapoff(p->type);
2148 spin_lock(&swap_lock);
2149 p->swap_file = NULL;
2150 p->flags = 0;
2151 spin_unlock(&swap_lock);
2152 vfree(swap_map);
2153 if (swap_file) {
2154 if (inode && S_ISREG(inode->i_mode)) {
2155 mutex_unlock(&inode->i_mutex);
2156 inode = NULL;
2157 }
2158 filp_close(swap_file, NULL);
2159 }
2160out:
2161 if (page && !IS_ERR(page)) {
2162 kunmap(page);
2163 page_cache_release(page);
2164 }
2165 if (name)
2166 putname(name);
2167 if (inode && S_ISREG(inode->i_mode))
2168 mutex_unlock(&inode->i_mutex);
2169 return error;
2170}
2171
2172void si_swapinfo(struct sysinfo *val)
2173{
2174 unsigned int type;
2175 unsigned long nr_to_be_unused = 0;
2176
2177 spin_lock(&swap_lock);
2178 for (type = 0; type < nr_swapfiles; type++) {
2179 struct swap_info_struct *si = swap_info[type];
2180
2181 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2182 nr_to_be_unused += si->inuse_pages;
2183 }
2184 val->freeswap = nr_swap_pages + nr_to_be_unused;
2185 val->totalswap = total_swap_pages + nr_to_be_unused;
2186 spin_unlock(&swap_lock);
2187}
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2201{
2202 struct swap_info_struct *p;
2203 unsigned long offset, type;
2204 unsigned char count;
2205 unsigned char has_cache;
2206 int err = -EINVAL;
2207
2208 if (non_swap_entry(entry))
2209 goto out;
2210
2211 type = swp_type(entry);
2212 if (type >= nr_swapfiles)
2213 goto bad_file;
2214 p = swap_info[type];
2215 offset = swp_offset(entry);
2216
2217 spin_lock(&swap_lock);
2218 if (unlikely(offset >= p->max))
2219 goto unlock_out;
2220
2221 count = p->swap_map[offset];
2222 has_cache = count & SWAP_HAS_CACHE;
2223 count &= ~SWAP_HAS_CACHE;
2224 err = 0;
2225
2226 if (usage == SWAP_HAS_CACHE) {
2227
2228
2229 if (!has_cache && count)
2230 has_cache = SWAP_HAS_CACHE;
2231 else if (has_cache)
2232 err = -EEXIST;
2233 else
2234 err = -ENOENT;
2235
2236 } else if (count || has_cache) {
2237
2238 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2239 count += usage;
2240 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2241 err = -EINVAL;
2242 else if (swap_count_continued(p, offset, count))
2243 count = COUNT_CONTINUED;
2244 else
2245 err = -ENOMEM;
2246 } else
2247 err = -ENOENT;
2248
2249 p->swap_map[offset] = count | has_cache;
2250
2251unlock_out:
2252 spin_unlock(&swap_lock);
2253out:
2254 return err;
2255
2256bad_file:
2257 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
2258 goto out;
2259}
2260
2261
2262
2263
2264
2265void swap_shmem_alloc(swp_entry_t entry)
2266{
2267 __swap_duplicate(entry, SWAP_MAP_SHMEM);
2268}
2269
2270
2271
2272
2273
2274
2275
2276
2277int swap_duplicate(swp_entry_t entry)
2278{
2279 int err = 0;
2280
2281 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2282 err = add_swap_count_continuation(entry, GFP_ATOMIC);
2283 return err;
2284}
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294int swapcache_prepare(swp_entry_t entry)
2295{
2296 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2297}
2298
2299
2300
2301
2302
2303int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2304{
2305 struct swap_info_struct *si;
2306 int our_page_cluster = page_cluster;
2307 pgoff_t target, toff;
2308 pgoff_t base, end;
2309 int nr_pages = 0;
2310
2311 if (!our_page_cluster)
2312 return 0;
2313
2314 si = swap_info[swp_type(entry)];
2315 target = swp_offset(entry);
2316 base = (target >> our_page_cluster) << our_page_cluster;
2317 end = base + (1 << our_page_cluster);
2318 if (!base)
2319 base++;
2320
2321 spin_lock(&swap_lock);
2322 if (end > si->max)
2323 end = si->max;
2324
2325
2326 for (toff = target; ++toff < end; nr_pages++) {
2327
2328 if (!si->swap_map[toff])
2329 break;
2330 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2331 break;
2332 }
2333
2334 for (toff = target; --toff >= base; nr_pages++) {
2335
2336 if (!si->swap_map[toff])
2337 break;
2338 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2339 break;
2340 }
2341 spin_unlock(&swap_lock);
2342
2343
2344
2345
2346
2347 *offset = ++toff;
2348 return nr_pages? ++nr_pages: 0;
2349}
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2367{
2368 struct swap_info_struct *si;
2369 struct page *head;
2370 struct page *page;
2371 struct page *list_page;
2372 pgoff_t offset;
2373 unsigned char count;
2374
2375
2376
2377
2378
2379 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
2380
2381 si = swap_info_get(entry);
2382 if (!si) {
2383
2384
2385
2386
2387
2388 goto outer;
2389 }
2390
2391 offset = swp_offset(entry);
2392 count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
2393
2394 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
2395
2396
2397
2398
2399
2400 goto out;
2401 }
2402
2403 if (!page) {
2404 spin_unlock(&swap_lock);
2405 return -ENOMEM;
2406 }
2407
2408
2409
2410
2411
2412
2413 head = vmalloc_to_page(si->swap_map + offset);
2414 offset &= ~PAGE_MASK;
2415
2416
2417
2418
2419
2420 if (!page_private(head)) {
2421 BUG_ON(count & COUNT_CONTINUED);
2422 INIT_LIST_HEAD(&head->lru);
2423 set_page_private(head, SWP_CONTINUED);
2424 si->flags |= SWP_CONTINUED;
2425 }
2426
2427 list_for_each_entry(list_page, &head->lru, lru) {
2428 unsigned char *map;
2429
2430
2431
2432
2433
2434 if (!(count & COUNT_CONTINUED))
2435 goto out;
2436
2437 map = kmap_atomic(list_page, KM_USER0) + offset;
2438 count = *map;
2439 kunmap_atomic(map, KM_USER0);
2440
2441
2442
2443
2444
2445 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
2446 goto out;
2447 }
2448
2449 list_add_tail(&page->lru, &head->lru);
2450 page = NULL;
2451out:
2452 spin_unlock(&swap_lock);
2453outer:
2454 if (page)
2455 __free_page(page);
2456 return 0;
2457}
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467static bool swap_count_continued(struct swap_info_struct *si,
2468 pgoff_t offset, unsigned char count)
2469{
2470 struct page *head;
2471 struct page *page;
2472 unsigned char *map;
2473
2474 head = vmalloc_to_page(si->swap_map + offset);
2475 if (page_private(head) != SWP_CONTINUED) {
2476 BUG_ON(count & COUNT_CONTINUED);
2477 return false;
2478 }
2479
2480 offset &= ~PAGE_MASK;
2481 page = list_entry(head->lru.next, struct page, lru);
2482 map = kmap_atomic(page, KM_USER0) + offset;
2483
2484 if (count == SWAP_MAP_MAX)
2485 goto init_map;
2486
2487 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) {
2488
2489
2490
2491 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2492 kunmap_atomic(map, KM_USER0);
2493 page = list_entry(page->lru.next, struct page, lru);
2494 BUG_ON(page == head);
2495 map = kmap_atomic(page, KM_USER0) + offset;
2496 }
2497 if (*map == SWAP_CONT_MAX) {
2498 kunmap_atomic(map, KM_USER0);
2499 page = list_entry(page->lru.next, struct page, lru);
2500 if (page == head)
2501 return false;
2502 map = kmap_atomic(page, KM_USER0) + offset;
2503init_map: *map = 0;
2504 }
2505 *map += 1;
2506 kunmap_atomic(map, KM_USER0);
2507 page = list_entry(page->lru.prev, struct page, lru);
2508 while (page != head) {
2509 map = kmap_atomic(page, KM_USER0) + offset;
2510 *map = COUNT_CONTINUED;
2511 kunmap_atomic(map, KM_USER0);
2512 page = list_entry(page->lru.prev, struct page, lru);
2513 }
2514 return true;
2515
2516 } else {
2517
2518
2519
2520 BUG_ON(count != COUNT_CONTINUED);
2521 while (*map == COUNT_CONTINUED) {
2522 kunmap_atomic(map, KM_USER0);
2523 page = list_entry(page->lru.next, struct page, lru);
2524 BUG_ON(page == head);
2525 map = kmap_atomic(page, KM_USER0) + offset;
2526 }
2527 BUG_ON(*map == 0);
2528 *map -= 1;
2529 if (*map == 0)
2530 count = 0;
2531 kunmap_atomic(map, KM_USER0);
2532 page = list_entry(page->lru.prev, struct page, lru);
2533 while (page != head) {
2534 map = kmap_atomic(page, KM_USER0) + offset;
2535 *map = SWAP_CONT_MAX | count;
2536 count = COUNT_CONTINUED;
2537 kunmap_atomic(map, KM_USER0);
2538 page = list_entry(page->lru.prev, struct page, lru);
2539 }
2540 return count == COUNT_CONTINUED;
2541 }
2542}
2543
2544
2545
2546
2547
2548static void free_swap_count_continuations(struct swap_info_struct *si)
2549{
2550 pgoff_t offset;
2551
2552 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2553 struct page *head;
2554 head = vmalloc_to_page(si->swap_map + offset);
2555 if (page_private(head)) {
2556 struct list_head *this, *next;
2557 list_for_each_safe(this, next, &head->lru) {
2558 struct page *page;
2559 page = list_entry(this, struct page, lru);
2560 list_del(this);
2561 __free_page(page);
2562 }
2563 }
2564 }
2565}
2566