/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C)  1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shm.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/page_cgroup.h>

static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);

static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
long nr_swap_pages;
long total_swap_pages;
static int least_priority;

static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

static struct swap_list_t swap_list = {-1, -1};

static struct swap_info_struct *swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);

static inline unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
}

/* returns 1 if swap entry is freed */
static int
__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
{
	swp_entry_t entry = swp_entry(si->type, offset);
	struct page *page;
	int ret = 0;

	page = find_get_page(&swapper_space, entry.val);
	if (!page)
		return 0;
	/*
	 * This function is called from scan_swap_map() and it's called
	 * by vmscan.c at reclaiming pages. So, we hold a lock on a page,
	 * here. We have to use trylock for avoiding deadlock. This is a
	 * special case and you should use try_to_free_swap() with explicit
	 * lock_page() in usual operations.
	 */
	if (trylock_page(page)) {
		ret = try_to_free_swap(page);
		unlock_page(page);
	}
	page_cache_release(page);
	return ret;
}

/*
 * swap_unplug_io_fn() peeks at swap_info[] without taking swap_lock,
 * so swapoff takes swap_unplug_sem for writing to wait out any unplug
 * still in progress before it tears the swap area down.
 */
static DECLARE_RWSEM(swap_unplug_sem);

void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
{
	swp_entry_t entry;

	down_read(&swap_unplug_sem);
	entry.val = page_private(page);
	if (PageSwapCache(page)) {
		struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
		struct backing_dev_info *bdi;

		/*
		 * If the page is removed from swapcache from under us (with a
		 * racy try_to_unuse/swapoff) we need an additional reference
		 * count to avoid reading garbage from page_private(page) above.
		 * If the WARN_ON triggers during a swapoff it maybe the race
		 * condition and it's harmless. However if it triggers without
		 * swapoff it signals a problem.
		 */
		WARN_ON(page_count(page) <= 1);

		bdi = bdev->bd_inode->i_mapping->backing_dev_info;
		blk_run_backing_dev(bdi, page);
	}
	up_read(&swap_unplug_sem);
}

/*
 * swapon tells the device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static int discard_swap(struct swap_info_struct *si)
{
	struct swap_extent *se;
	sector_t start_block;
	sector_t nr_blocks;
	int err = 0;

	/* Do not discard the swap header page! */
	se = &si->first_swap_extent;
	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
	if (nr_blocks) {
		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL, 0);
		if (err)
			return err;
		cond_resched();
	}

	list_for_each_entry(se, &si->first_swap_extent.list, list) {
		start_block = se->start_block << (PAGE_SHIFT - 9);
		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL, 0);
		if (err)
			break;

		cond_resched();
	}
	return err;		/* That will often be -EOPNOTSUPP */
}

/*
 * swap allocation tells the device that a cluster of swap can now be
 * discarded, to allow the swap device to optimize its wear-levelling.
 */
static void discard_swap_cluster(struct swap_info_struct *si,
				 pgoff_t start_page, pgoff_t nr_pages)
{
	struct swap_extent *se = si->curr_swap_extent;
	int found_extent = 0;

	while (nr_pages) {
		struct list_head *lh;

		if (se->start_page <= start_page &&
		    start_page < se->start_page + se->nr_pages) {
			pgoff_t offset = start_page - se->start_page;
			sector_t start_block = se->start_block + offset;
			sector_t nr_blocks = se->nr_pages - offset;

			if (nr_blocks > nr_pages)
				nr_blocks = nr_pages;
			start_page += nr_blocks;
			nr_pages -= nr_blocks;

			if (!found_extent++)
				si->curr_swap_extent = se;

			start_block <<= PAGE_SHIFT - 9;
			nr_blocks <<= PAGE_SHIFT - 9;
			if (blkdev_issue_discard(si->bdev, start_block,
				    nr_blocks, GFP_NOIO, 0))
				break;
		}

		lh = se->list.next;
		se = list_entry(lh, struct swap_extent, list);
	}
}

static int wait_for_discard(void *word)
{
	schedule();
	return 0;
}

#define SWAPFILE_CLUSTER	256
#define LATENCY_LIMIT		256

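/*
 * scan_swap_map() finds and claims one free slot in si's swap_map,
 * marking it with @usage (a count of 1, or SWAP_HAS_CACHE).  It is
 * entered with swap_lock held, but drops and retakes the lock around
 * the unlocked cluster scans and discards; the SWP_SCANNING flag,
 * kept as a count in the high bits of si->flags, tells swapoff that
 * a scan is still in flight.  Returns the offset, or 0 on failure.
 */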
static inline unsigned long scan_swap_map(struct swap_info_struct *si,
					  unsigned char usage)
{
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;
	int found_free_cluster = 0;

	/*
	 * We try to cluster swap pages by allocating them sequentially
	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
	 * way, however, we resort to first-free allocation, starting
	 * a new cluster.  This prevents us from scattering swap pages
	 * all over the entire swap partition, so that we reduce
	 * overall disk seek times between swap pages.  -- sct
	 * But we do now try to find an empty cluster.  -Andrea
	 * And we let swap pages go all over an SSD partition.  Hugh
	 */
	si->flags += SWP_SCANNING;
	scan_base = offset = si->cluster_next;

	if (unlikely(!si->cluster_nr--)) {
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			goto checks;
		}
		if (si->flags & SWP_DISCARDABLE) {
			/*
			 * Start range check on racing allocations, in case
			 * they overlap the cluster we eventually decide on
			 * (we scan without swap_lock to allow preemption).
			 * If lowest_alloc is still set, an earlier scan left
			 * its range check behind: take that as a hint and
			 * go allocate from there instead of rescanning.
			 */
			if (si->lowest_alloc)
				goto checks;
			si->lowest_alloc = si->max;
			si->highest_alloc = 0;
		}
		spin_unlock(&swap_lock);

		/*
		 * If seek is expensive, start searching for a new cluster
		 * from the start of the partition, to minimize the span of
		 * allocated swap.  But if seek is cheap (SSD), search from
		 * our current position, so allocation wears more evenly.
		 */
		if (!(si->flags & SWP_SOLIDSTATE))
			scan_base = offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Locate the first empty (unaligned) cluster */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&swap_lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				found_free_cluster = 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Locate the first empty (unaligned) cluster before scan_base */
		for (; last_in_cluster < scan_base; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&swap_lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				found_free_cluster = 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		offset = scan_base;
		spin_lock(&swap_lock);
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
		si->lowest_alloc = 0;
	}

checks:
	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	if (!si->highest_bit)
		goto no_page;
	if (offset > si->highest_bit)
		scan_base = offset = si->lowest_bit;

	/* reuse swap entry of cache-only swap if not busy. */
	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
		int swap_was_freed;
		spin_unlock(&swap_lock);
		swap_was_freed = __try_to_reclaim_swap(si, offset);
		spin_lock(&swap_lock);
		/* entry was freed successfully, try to use this again */
		if (swap_was_freed)
			goto checks;
		goto scan; /* check next one */
	}

	if (si->swap_map[offset])
		goto scan;

	if (offset == si->lowest_bit)
		si->lowest_bit++;
	if (offset == si->highest_bit)
		si->highest_bit--;
	si->inuse_pages++;
	if (si->inuse_pages == si->pages) {
		si->lowest_bit = si->max;
		si->highest_bit = 0;
	}
	si->swap_map[offset] = usage;
	si->cluster_next = offset + 1;
	si->flags -= SWP_SCANNING;

	if (si->lowest_alloc) {
		/*
		 * Only set when SWP_DISCARDABLE, and there's a scan
		 * for a free cluster in progress or just completed.
		 */
		if (found_free_cluster) {
			/*
			 * To optimize wear-levelling, discard the
			 * old data of the cluster, taking care not to
			 * discard any of its pages that have already
			 * been allocated by racing tasks (offset has
			 * already stepped over any at the beginning).
			 */
			if (offset < si->highest_alloc &&
			    si->lowest_alloc <= last_in_cluster)
				last_in_cluster = si->lowest_alloc - 1;
			si->flags |= SWP_DISCARDING;
			spin_unlock(&swap_lock);

			if (offset < last_in_cluster)
				discard_swap_cluster(si, offset,
					last_in_cluster - offset + 1);

			spin_lock(&swap_lock);
			si->lowest_alloc = 0;
			si->flags &= ~SWP_DISCARDING;

			smp_mb();	/* wake_up_bit advises this */
			wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));

		} else if (si->flags & SWP_DISCARDING) {
			/*
			 * Delay using pages allocated by racing tasks
			 * until the whole discard has been issued. We
			 * could defer that delay until swap_writepage,
			 * but it's easier to keep this self-contained.
			 */
			spin_unlock(&swap_lock);
			wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
				wait_for_discard, TASK_UNINTERRUPTIBLE);
			spin_lock(&swap_lock);
		} else {
			/*
			 * Note pages allocated by racing tasks while
			 * scan for a free cluster is in progress, so
			 * that its final discard can exclude them.
			 */
			if (offset < si->lowest_alloc)
				si->lowest_alloc = offset;
			if (offset > si->highest_alloc)
				si->highest_alloc = offset;
		}
	}
	return offset;
scan:
	spin_unlock(&swap_lock);
	while (++offset <= si->highest_bit) {
		if (!si->swap_map[offset]) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	offset = si->lowest_bit;
	while (++offset < scan_base) {
		if (!si->swap_map[offset]) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	spin_lock(&swap_lock);

no_page:
	si->flags -= SWP_SCANNING;
	return 0;
}

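/*
 * get_swap_page() allocates one swap slot for the swap cache, trying
 * each active swap area in priority order, and rotating among areas
 * of equal priority so their use is interleaved.  Returns the swap
 * entry, or the null entry {0} when no swap slot is available.
 */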
swp_entry_t get_swap_page(void)
{
	struct swap_info_struct *si;
	pgoff_t offset;
	int type, next;
	int wrapped = 0;

	spin_lock(&swap_lock);
	if (nr_swap_pages <= 0)
		goto noswap;
	nr_swap_pages--;

	for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
		si = swap_info[type];
		next = si->next;
		if (next < 0 ||
		    (!wrapped && si->prio != swap_info[next]->prio)) {
			next = swap_list.head;
			wrapped++;
		}

		if (!si->highest_bit)
			continue;
		if (!(si->flags & SWP_WRITEOK))
			continue;

		swap_list.next = next;

		/* This is called for allocating swap entry for cache */
		offset = scan_swap_map(si, SWAP_HAS_CACHE);
		if (offset) {
			spin_unlock(&swap_lock);
			return swp_entry(type, offset);
		}
		next = swap_list.next;
	}

	nr_swap_pages++;
noswap:
	spin_unlock(&swap_lock);
	return (swp_entry_t) {0};
}

/* The only caller of this function is now the suspend routine */
swp_entry_t get_swap_page_of_type(int type)
{
	struct swap_info_struct *si;
	pgoff_t offset;

	spin_lock(&swap_lock);
	si = swap_info[type];
	if (si && (si->flags & SWP_WRITEOK)) {
		nr_swap_pages--;
		/* This is called for allocating swap entry, not cache */
		offset = scan_swap_map(si, 1);
		if (offset) {
			spin_unlock(&swap_lock);
			return swp_entry(type, offset);
		}
		nr_swap_pages++;
	}
	spin_unlock(&swap_lock);
	return (swp_entry_t) {0};
}

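/*
 * swap_info_get() validates a swap entry and, on success, returns its
 * swap_info_struct with swap_lock held; on any failure it logs which
 * check failed and returns NULL without taking swap_lock.
 */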
static struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;
	unsigned long offset, type;

	if (!entry.val)
		goto out;
	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_nofile;
	p = swap_info[type];
	if (!(p->flags & SWP_USED))
		goto bad_device;
	offset = swp_offset(entry);
	if (offset >= p->max)
		goto bad_offset;
	if (!p->swap_map[offset])
		goto bad_free;
	spin_lock(&swap_lock);
	return p;

bad_free:
	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
	goto out;
bad_offset:
	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
	goto out;
bad_device:
	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
	goto out;
bad_nofile:
	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
out:
	return NULL;
}

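/*
 * swap_entry_free() drops one reference of kind @usage from the entry's
 * swap_map count, handling the SWAP_HAS_CACHE bit, the SWAP_MAP_SHMEM
 * special case, and count continuations.  Called with swap_lock held;
 * returns the resulting swap_map value.
 */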
static unsigned char swap_entry_free(struct swap_info_struct *p,
				     swp_entry_t entry, unsigned char usage)
{
	unsigned long offset = swp_offset(entry);
	unsigned char count;
	unsigned char has_cache;

	count = p->swap_map[offset];
	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;

	if (usage == SWAP_HAS_CACHE) {
		VM_BUG_ON(!has_cache);
		has_cache = 0;
	} else if (count == SWAP_MAP_SHMEM) {
		/*
		 * Or we could insist on shmem.c using a special
		 * swap_shmem_free() and free_shmem_swap_and_cache()...
		 */
		count = 0;
	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
		if (count == COUNT_CONTINUED) {
			if (swap_count_continued(p, offset, count))
				count = SWAP_MAP_MAX | COUNT_CONTINUED;
			else
				count = SWAP_MAP_MAX;
		} else
			count--;
	}

	if (!count)
		mem_cgroup_uncharge_swap(entry);

	usage = count | has_cache;
	p->swap_map[offset] = usage;

	/* free if no reference */
	if (!usage) {
		struct gendisk *disk = p->bdev->bd_disk;
		if (offset < p->lowest_bit)
			p->lowest_bit = offset;
		if (offset > p->highest_bit)
			p->highest_bit = offset;
		if (swap_list.next >= 0 &&
		    p->prio > swap_info[swap_list.next]->prio)
			swap_list.next = p->type;
		nr_swap_pages++;
		p->inuse_pages--;
		if ((p->flags & SWP_BLKDEV) &&
				disk->fops->swap_slot_free_notify)
			disk->fops->swap_slot_free_notify(p->bdev, offset);
	}

	return usage;
}

/*
 * Caller has made sure that the swapdevice corresponding to entry
 * is still around or has not been recycled.
 */
void swap_free(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = swap_info_get(entry);
	if (p) {
		swap_entry_free(p, entry, 1);
		spin_unlock(&swap_lock);
	}
}

/*
 * Called after dropping swapcache to decrease refcnt to swap entries.
 */
void swapcache_free(swp_entry_t entry, struct page *page)
{
	struct swap_info_struct *p;
	unsigned char count;

	p = swap_info_get(entry);
	if (p) {
		count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
		if (page)
			mem_cgroup_uncharge_swapcache(page, entry, count != 0);
		spin_unlock(&swap_lock);
	}
}

/*
 * How many references to page are currently swapped out?
 * This does not give an exact answer when swap count is continued,
 * but does include the high COUNT_CONTINUED flag to allow for that.
 */
static inline int page_swapcount(struct page *page)
{
	int count = 0;
	struct swap_info_struct *p;
	swp_entry_t entry;

	entry.val = page_private(page);
	p = swap_info_get(entry);
	if (p) {
		count = swap_count(p->swap_map[swp_offset(entry)]);
		spin_unlock(&swap_lock);
	}
	return count;
}

/*
 * We can write to an anon page without COW if there are no other references
 * to it.  And as a side-effect, free up its swap: because the old content
 * on disk will never be read, and seeking back there to write new content
 * later would only waste time away from clustering.
 */
int reuse_swap_page(struct page *page)
{
	int count;

	VM_BUG_ON(!PageLocked(page));
	if (unlikely(PageKsm(page)))
		return 0;
	count = page_mapcount(page);
	if (count <= 1 && PageSwapCache(page)) {
		count += page_swapcount(page);
		if (count == 1 && !PageWriteback(page)) {
			delete_from_swap_cache(page);
			SetPageDirty(page);
		}
	}
	return count <= 1;
}

/*
 * If swap is getting full, or if there are no more mappings of this page,
 * then try_to_free_swap is called to free its swap space.
 */
int try_to_free_swap(struct page *page)
{
	VM_BUG_ON(!PageLocked(page));

	if (!PageSwapCache(page))
		return 0;
	if (PageWriteback(page))
		return 0;
	if (page_swapcount(page))
		return 0;

	/*
	 * Once hibernation has begun to create its image of memory,
	 * there's a danger that one of the calls to try_to_free_swap()
	 * - most probably a call from __try_to_reclaim_swap() while
	 * hibernation is allocating its own swap pages for the image,
	 * but conceivably even a call from memory reclaim - will free
	 * the swap from a page which has already been recorded in the
	 * image as a clean swapcache page, and then reuse its swap for
	 * another page of the image.  On waking from hibernation, the
	 * original page might be freed under memory pressure, then
	 * later read back in from swap, now with the wrong data.
	 *
	 * Hibernation clears bits from gfp_allowed_mask to prevent
	 * memory reclaim from writing to disk, so check that here.
	 */
	if (!(gfp_allowed_mask & __GFP_IO))
		return 0;

	delete_from_swap_cache(page);
	SetPageDirty(page);
	return 1;
}

/*
 * Free the swap entry like above, but also try to
 * free the page cache entry if it is the last user.
 */
int free_swap_and_cache(swp_entry_t entry)
{
	struct swap_info_struct *p;
	struct page *page = NULL;

	if (non_swap_entry(entry))
		return 1;

	p = swap_info_get(entry);
	if (p) {
		if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
			page = find_get_page(&swapper_space, entry.val);
			if (page && !trylock_page(page)) {
				page_cache_release(page);
				page = NULL;
			}
		}
		spin_unlock(&swap_lock);
	}
	if (page) {
		/*
		 * Not mapped elsewhere, or swap space full? Free it!
		 * Also recheck PageSwapCache now page is locked (above).
		 */
		if (PageSwapCache(page) && !PageWriteback(page) &&
				(!page_mapped(page) || vm_swap_full())) {
			delete_from_swap_cache(page);
			SetPageDirty(page);
		}
		unlock_page(page);
		page_cache_release(page);
	}
	return p != NULL;
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR
/**
 * mem_cgroup_count_swap_user - count the user of a swap entry
 * @ent: the swap entry to be checked
 * @pagep: the pointer for the swap cache page of the entry to be stored
 *
 * Returns the number of the user of the swap entry. The number is valid only
 * for swaps of anonymous pages.
 * If the entry is found on swap cache, the page is stored to pagep with
 * refcount of it being incremented.
 */
int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
{
	struct page *page;
	struct swap_info_struct *p;
	int count = 0;

	page = find_get_page(&swapper_space, ent.val);
	if (page)
		count += page_mapcount(page);
	p = swap_info_get(ent);
	if (p) {
		count += swap_count(p->swap_map[swp_offset(ent)]);
		spin_unlock(&swap_lock);
	}

	*pagep = page;
	return count;
}
#endif

#ifdef CONFIG_HIBERNATION
/*
 * Find the swap type that corresponds to given device (if any).
 *
 * @offset - number of the PAGE_SIZE-sized block of the device, starting
 * from 0, in which the swap header is expected to be located.
 *
 * This is needed for the suspend to disk (aka swsusp).
 */
int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
{
	struct block_device *bdev = NULL;
	int type;

	if (device)
		bdev = bdget(device);

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		struct swap_info_struct *sis = swap_info[type];

		if (!(sis->flags & SWP_WRITEOK))
			continue;

		if (!bdev) {
			if (bdev_p)
				*bdev_p = bdgrab(sis->bdev);

			spin_unlock(&swap_lock);
			return type;
		}
		if (bdev == sis->bdev) {
			struct swap_extent *se = &sis->first_swap_extent;

			if (se->start_block == offset) {
				if (bdev_p)
					*bdev_p = bdgrab(sis->bdev);

				spin_unlock(&swap_lock);
				bdput(bdev);
				return type;
			}
		}
	}
	spin_unlock(&swap_lock);
	if (bdev)
		bdput(bdev);

	return -ENODEV;
}

/*
 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
 * corresponding to given index in swap_info (swap type).
 */
sector_t swapdev_block(int type, pgoff_t offset)
{
	struct block_device *bdev;

	if ((unsigned int)type >= nr_swapfiles)
		return 0;
	if (!(swap_info[type]->flags & SWP_WRITEOK))
		return 0;
	return map_swap_entry(swp_entry(type, offset), &bdev);
}

/*
 * Return either the total number of swap pages of given type, or the number
 * of free pages of that type (depending on @free)
 *
 * This is needed for software suspend
 */
unsigned int count_swap_pages(int type, int free)
{
	unsigned int n = 0;

	spin_lock(&swap_lock);
	if ((unsigned int)type < nr_swapfiles) {
		struct swap_info_struct *sis = swap_info[type];

		if (sis->flags & SWP_WRITEOK) {
			n = sis->pages;
			if (free)
				n -= sis->inuse_pages;
		}
	}
	spin_unlock(&swap_lock);
	return n;
}
#endif /* CONFIG_HIBERNATION */

/*
 * No need to decide whether this PTE shares the swap entry with others,
 * just let do_wp_page work it out if a write is requested later - to
 * force COW, vm_page_prot omits write permission to any private vma.
 */
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, swp_entry_t entry, struct page *page)
{
	struct mem_cgroup *ptr = NULL;
	spinlock_t *ptl;
	pte_t *pte;
	int ret = 1;

	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
		ret = -ENOMEM;
		goto out_nolock;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
		if (ret > 0)
			mem_cgroup_cancel_charge_swapin(ptr);
		ret = 0;
		goto out;
	}

	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
	get_page(page);
	set_pte_at(vma->vm_mm, addr, pte,
		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
	page_add_anon_rmap(page, vma, addr);
	mem_cgroup_commit_charge_swapin(page, ptr);
	swap_free(entry);
	/*
	 * Move the page to the active list so it is not
	 * immediately swapped out again after swapon.
	 */
	activate_page(page);
out:
	pte_unmap_unlock(pte, ptl);
out_nolock:
	return ret;
}

static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	pte_t swp_pte = swp_entry_to_pte(entry);
	pte_t *pte;
	int ret = 0;

	/*
	 * We don't actually need the page table lock to scan these ptes:
	 * read them racily, and only when one matches the target swap
	 * entry drop the mapping and call unuse_pte(), which takes the
	 * lock and rechecks with pte_same() before changing anything.
	 */
	pte = pte_offset_map(pmd, addr);
	do {
		/*
		 * swapoff spends a _lot_ of time in this loop!
		 * Test inline before going to call unuse_pte.
		 */
		if (unlikely(pte_same(*pte, swp_pte))) {
			pte_unmap(pte);
			ret = unuse_pte(vma, pmd, addr, entry, page);
			if (ret)
				goto out;
			pte = pte_offset_map(pmd, addr);
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap(pte - 1);
out:
	return ret;
}

static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	pmd_t *pmd;
	unsigned long next;
	int ret;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (unlikely(pmd_trans_huge(*pmd)))
			continue;
		if (pmd_none_or_clear_bad(pmd))
			continue;
		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
		if (ret)
			return ret;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	pud_t *pud;
	unsigned long next;
	int ret;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
		if (ret)
			return ret;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static int unuse_vma(struct vm_area_struct *vma,
				swp_entry_t entry, struct page *page)
{
	pgd_t *pgd;
	unsigned long addr, end, next;
	int ret;

	if (page_anon_vma(page)) {
		addr = page_address_in_vma(page, vma);
		if (addr == -EFAULT)
			return 0;
		else
			end = addr + PAGE_SIZE;
	} else {
		addr = vma->vm_start;
		end = vma->vm_end;
	}

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
		if (ret)
			return ret;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

static int unuse_mm(struct mm_struct *mm,
				swp_entry_t entry, struct page *page)
{
	struct vm_area_struct *vma;
	int ret = 0;

	if (!down_read_trylock(&mm->mmap_sem)) {
		/*
		 * Activate page so shrink_inactive_list is unlikely to unmap
		 * its ptes while lock is dropped, so swapoff can make progress.
		 */
		activate_page(page);
		unlock_page(page);
		down_read(&mm->mmap_sem);
		lock_page(page);
	}
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
			break;
	}
	up_read(&mm->mmap_sem);
	return (ret < 0)? ret: 0;
}

/*
 * Scan swap_map from current position to next entry still in use.
 * Recycle to start on reaching the end, returning 0 when empty.
 */
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
					unsigned int prev)
{
	unsigned int max = si->max;
	unsigned int i = prev;
	unsigned char count;

	/*
	 * No need for swap_lock here: we're just looking
	 * for whether an entry is in use, not modifying it; false
	 * hits are okay, and sys_swapoff() has already prevented new
	 * allocations from this area (while holding swap_lock).
	 */
	for (;;) {
		if (++i >= max) {
			if (!prev) {
				i = 0;
				break;
			}
			/*
			 * No entries in use at top of swap_map,
			 * loop back to start and recheck there.
			 */
			max = prev + 1;
			prev = 0;
			i = 1;
		}
		count = si->swap_map[i];
		if (count && swap_count(count) != SWAP_MAP_BAD)
			break;
	}
	return i;
}

/*
 * We completely avoid races by reading each swap page in advance,
 * and then search for the process using it.  All the necessary
 * page table adjustments can then be made atomically.
 */
static int try_to_unuse(unsigned int type)
{
	struct swap_info_struct *si = swap_info[type];
	struct mm_struct *start_mm;
	unsigned char *swap_map;
	unsigned char swcount;
	struct page *page;
	swp_entry_t entry;
	unsigned int i = 0;
	int retval = 0;

	/*
	 * When searching mms for an entry, a good strategy is to
	 * start at the first mm we freed the previous entry from
	 * (though actually we don't notice whether we or coincidence
	 * freed the entry).  Initialize this start_mm with a hold.
	 *
	 * A simpler strategy would be to start at the last mm we
	 * freed the previous entry from; but that would take less
	 * advantage of mmlist ordering, which clusters forked mms
	 * together, child after parent.  If we race with dup_mmap(),
	 * we prefer to resolve parent before child, lest we miss
	 * entries duplicated after we scanned child: using last mm
	 * would invert that.
	 */
	start_mm = &init_mm;
	atomic_inc(&init_mm.mm_users);

	/*
	 * Keep on scanning until all entries have gone.  Usually,
	 * one pass through swap_map is enough, but not necessarily:
	 * there are races when an instance of an entry might be missed.
	 */
	while ((i = find_next_to_unuse(si, i)) != 0) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}

		/*
		 * Get a page for the entry, using the existing swap
		 * cache page if there is one.  Otherwise, get a clean
		 * page and read the swap into it.
		 */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry,
					GFP_HIGHUSER_MOVABLE, NULL, 0);
		if (!page) {
			/*
			 * Either swap_duplicate() failed because entry
			 * has been freed independently, and will not be
			 * reused since sys_swapoff() already disabled
			 * allocation from here, or alloc_page() failed.
			 */
			if (!*swap_map)
				continue;
			retval = -ENOMEM;
			break;
		}

		/*
		 * Don't hold on to start_mm if it looks like exiting.
		 */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			atomic_inc(&init_mm.mm_users);
		}

		/*
		 * Wait for and lock page.  When do_swap_page races with
		 * try_to_unuse, do_swap_page can handle the fault much
		 * faster than try_to_unuse can locate the entry.  This
		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
		 * defer to do_swap_page in such a case - in some tests,
		 * do_swap_page and try_to_unuse repeatedly compete.
		 */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/*
		 * Remove all references to entry.
		 */
		swcount = *swap_map;
		if (swap_count(swcount) == SWAP_MAP_SHMEM) {
			retval = shmem_unuse(entry, page);
			/* page has already been unlocked and released */
			if (retval < 0)
				break;
			continue;
		}
		if (swap_count(swcount) && start_mm != &init_mm)
			retval = unuse_mm(start_mm, entry, page);

		if (swap_count(*swap_map)) {
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *prev_mm = start_mm;
			struct mm_struct *mm;

			atomic_inc(&new_start_mm->mm_users);
			atomic_inc(&prev_mm->mm_users);
			spin_lock(&mmlist_lock);
			while (swap_count(*swap_map) && !retval &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				if (!atomic_inc_not_zero(&mm->mm_users))
					continue;
				spin_unlock(&mmlist_lock);
				mmput(prev_mm);
				prev_mm = mm;

				cond_resched();

				swcount = *swap_map;
				if (!swap_count(swcount)) /* any usage ? */
					;
				else if (mm == &init_mm)
					set_start_mm = 1;
				else
					retval = unuse_mm(mm, entry, page);

				if (set_start_mm && *swap_map < swcount) {
					mmput(new_start_mm);
					atomic_inc(&mm->mm_users);
					new_start_mm = mm;
					set_start_mm = 0;
				}
				spin_lock(&mmlist_lock);
			}
			spin_unlock(&mmlist_lock);
			mmput(prev_mm);
			mmput(start_mm);
			start_mm = new_start_mm;
		}
		if (retval) {
			unlock_page(page);
			page_cache_release(page);
			break;
		}

		/*
		 * If a reference remains (rare), we would like to leave
		 * the page in the swap cache; but try_to_unmap could
		 * then re-duplicate the entry once we drop page lock,
		 * so we might loop indefinitely; also, that page could
		 * not be swapped out to other storage meanwhile.  So:
		 * delete from cache even if there's another reference,
		 * after ensuring that the data has been saved to disk -
		 * since if the reference remains (rarer), it will be
		 * read from disk into another page.  Splitting into two
		 * pages would be incorrect if swap supported "shared
		 * private" pages, but they are handled by tmpfs files.
		 *
		 * Given how unuse_vma() targets one particular offset
		 * in an anon_vma, once the anon_vma has been determined,
		 * this splitting happens to be just what is needed to
		 * handle where KSM pages have been swapped out: re-reading
		 * is unexpectedly slow, but KSM should be rare enough for
		 * that not to matter much in practice.
		 */
		if (swap_count(*swap_map) &&
		     PageDirty(page) && PageSwapCache(page)) {
			struct writeback_control wbc = {
				.sync_mode = WB_SYNC_NONE,
			};

			swap_writepage(page, &wbc);
			lock_page(page);
			wait_on_page_writeback(page);
		}

		/*
		 * It is conceivable that a racing task removed this page from
		 * swap cache just before we acquired the page lock at the top,
		 * or while we dropped it in unuse_mm().  The page might even
		 * be back in swap cache on another swap area: that we must not
		 * delete, since it may not have been written out to swap yet.
		 */
		if (PageSwapCache(page) &&
		    likely(page_private(page) == entry.val))
			delete_from_swap_cache(page);

		/*
		 * So we could skip searching mms once swap count went
		 * to 1, we did not mark any present ptes as dirty: must
		 * mark page dirty so shrink_page_list will preserve it.
		 */
		SetPageDirty(page);
		unlock_page(page);
		page_cache_release(page);

		/*
		 * Make sure that we aren't completely killing
		 * interactive performance.
		 */
		cond_resched();
	}

	mmput(start_mm);
	return retval;
}

/*
 * After a successful try_to_unuse, if no swap is now in use, we know
 * we can empty the mmlist.  swap_lock must be held on entry and exit.
 * Note that mmlist_lock nests inside swap_lock, and an mm must be
 * added to the mmlist just after page_duplicate - before would be racy.
 */
static void drain_mmlist(void)
{
	struct list_head *p, *next;
	unsigned int type;

	for (type = 0; type < nr_swapfiles; type++)
		if (swap_info[type]->inuse_pages)
			return;
	spin_lock(&mmlist_lock);
	list_for_each_safe(p, next, &init_mm.mmlist)
		list_del_init(p);
	spin_unlock(&mmlist_lock);
}

/*
 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
 * corresponds to page offset for the specified swap entry.
 * Note that the type of this function is sector_t, but it returns page offset
 * into the bdev, not sector offset.
 */
static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
{
	struct swap_info_struct *sis;
	struct swap_extent *start_se;
	struct swap_extent *se;
	pgoff_t offset;

	sis = swap_info[swp_type(entry)];
	*bdev = sis->bdev;

	offset = swp_offset(entry);
	start_se = sis->curr_swap_extent;
	se = start_se;

	for ( ; ; ) {
		struct list_head *lh;

		if (se->start_page <= offset &&
				offset < (se->start_page + se->nr_pages)) {
			return se->start_block + (offset - se->start_page);
		}
		lh = se->list.next;
		se = list_entry(lh, struct swap_extent, list);
		sis->curr_swap_extent = se;
		BUG_ON(se == start_se);		/* It *must* be present */
	}
}

/*
 * Returns the page offset into bdev for the specified page's swap entry.
 */
sector_t map_swap_page(struct page *page, struct block_device **bdev)
{
	swp_entry_t entry;
	entry.val = page_private(page);
	return map_swap_entry(entry, bdev);
}

/*
 * Free all of a swapdev's extent information
 */
static void destroy_swap_extents(struct swap_info_struct *sis)
{
	while (!list_empty(&sis->first_swap_extent.list)) {
		struct swap_extent *se;

		se = list_entry(sis->first_swap_extent.list.next,
				struct swap_extent, list);
		list_del(&se->list);
		kfree(se);
	}
}

/*
 * Add a block range (and the corresponding page range) into this swapdev's
 * extent list.  The extent list is kept sorted in page order.
 *
 * This function rather assumes that it is called in ascending page order.
 */
static int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
		unsigned long nr_pages, sector_t start_block)
{
	struct swap_extent *se;
	struct swap_extent *new_se;
	struct list_head *lh;

	if (start_page == 0) {
		se = &sis->first_swap_extent;
		sis->curr_swap_extent = se;
		se->start_page = 0;
		se->nr_pages = nr_pages;
		se->start_block = start_block;
		return 1;
	} else {
		lh = sis->first_swap_extent.list.prev;	/* Highest extent */
		se = list_entry(lh, struct swap_extent, list);
		BUG_ON(se->start_page + se->nr_pages != start_page);
		if (se->start_block + se->nr_pages == start_block) {
			/* Merge it */
			se->nr_pages += nr_pages;
			return 0;
		}
	}

	/*
	 * No merge.  Insert a new extent, preserving ordering.
	 */
	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
	if (new_se == NULL)
		return -ENOMEM;
	new_se->start_page = start_page;
	new_se->nr_pages = nr_pages;
	new_se->start_block = start_block;

	list_add_tail(&new_se->list, &sis->first_swap_extent.list);
	return 1;
}

/*
 * A `swap extent' is a simple thing which maps a contiguous range of pages
 * onto a contiguous range of disk blocks.  An ordered list of swap extents
 * is built at swapon time and is then used at swap_writepage/swap_readpage
 * time for locating where on disk a page belongs.
 *
 * If the swapfile is an S_ISBLK block device, a single extent is installed.
 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
 * swap files identically.
 *
 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
 * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
 * swapfiles are handled *identically* after swapon time.
 *
 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
 * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
 * requirements, they are simply tossed out - we will never use those blocks
 * for swapping.
 *
 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon.  This
 * prevents root from shooting her foot off by ftruncating an in-use swapfile,
 * which will scribble on the filesystem.
 *
 * The amount of disk space which a single swap extent represents varies.
 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
 * extents in the list.  To avoid much list walking, we cache the previous
 * search location in `curr_swap_extent', and start new searches from there.
 * This is extremely effective.  The average number of iterations in
 * map_swap_page() has been measured at about 0.3 per page.  - akpm.
 */
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
	struct inode *inode;
	unsigned blocks_per_page;
	unsigned long page_no;
	unsigned blkbits;
	sector_t probe_block;
	sector_t last_block;
	sector_t lowest_block = -1;
	sector_t highest_block = 0;
	int nr_extents = 0;
	int ret;

	inode = sis->swap_file->f_mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		ret = add_swap_extent(sis, 0, sis->max, 0);
		*span = sis->pages;
		goto out;
	}

	blkbits = inode->i_blkbits;
	blocks_per_page = PAGE_SIZE >> blkbits;

	/*
	 * Map all the blocks into the extent list.  This code doesn't try
	 * to be very smart.
	 */
	probe_block = 0;
	page_no = 0;
	last_block = i_size_read(inode) >> blkbits;
	while ((probe_block + blocks_per_page) <= last_block &&
			page_no < sis->max) {
		unsigned block_in_page;
		sector_t first_block;

		first_block = bmap(inode, probe_block);
		if (first_block == 0)
			goto bad_bmap;

		/*
		 * It must be PAGE_SIZE aligned on-disk
		 */
		if (first_block & (blocks_per_page - 1)) {
			probe_block++;
			goto reprobe;
		}

		for (block_in_page = 1; block_in_page < blocks_per_page;
					block_in_page++) {
			sector_t block;

			block = bmap(inode, probe_block + block_in_page);
			if (block == 0)
				goto bad_bmap;
			if (block != first_block + block_in_page) {
				/* Discontiguity */
				probe_block++;
				goto reprobe;
			}
		}

		first_block >>= (PAGE_SHIFT - blkbits);
		if (page_no) {	/* exclude the header page */
			if (first_block < lowest_block)
				lowest_block = first_block;
			if (first_block > highest_block)
				highest_block = first_block;
		}

		/*
		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
		 */
		ret = add_swap_extent(sis, page_no, 1, first_block);
		if (ret < 0)
			goto out;
		nr_extents += ret;
		page_no++;
		probe_block += blocks_per_page;
reprobe:
		continue;
	}
	ret = nr_extents;
	*span = 1 + highest_block - lowest_block;
	if (page_no == 0)
		page_no = 1;	/* force Empty message */
	sis->max = page_no;
	sis->pages = page_no - 1;
	sis->highest_bit = page_no - 1;
out:
	return ret;
bad_bmap:
	printk(KERN_ERR "swapon: swapfile has holes\n");
	ret = -EINVAL;
	goto out;
}

SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
	struct swap_info_struct *p = NULL;
	unsigned char *swap_map;
	struct file *swap_file, *victim;
	struct address_space *mapping;
	struct inode *inode;
	char *pathname;
	int i, type, prev;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	pathname = getname(specialfile);
	err = PTR_ERR(pathname);
	if (IS_ERR(pathname))
		goto out;

	victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
	putname(pathname);
	err = PTR_ERR(victim);
	if (IS_ERR(victim))
		goto out;

	mapping = victim->f_mapping;
	prev = -1;
	spin_lock(&swap_lock);
	for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
		p = swap_info[type];
		if (p->flags & SWP_WRITEOK) {
			if (p->swap_file->f_mapping == mapping)
				break;
		}
		prev = type;
	}
	if (type < 0) {
		err = -EINVAL;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	if (!security_vm_enough_memory(p->pages))
		vm_unacct_memory(p->pages);
	else {
		err = -ENOMEM;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	if (prev < 0)
		swap_list.head = p->next;
	else
		swap_info[prev]->next = p->next;
	if (type == swap_list.next) {
		/* just pick something that's safe... */
		swap_list.next = swap_list.head;
	}
	if (p->prio < 0) {
		for (i = p->next; i >= 0; i = swap_info[i]->next)
			swap_info[i]->prio = p->prio--;
		least_priority++;
	}
	nr_swap_pages -= p->pages;
	total_swap_pages -= p->pages;
	p->flags &= ~SWP_WRITEOK;
	spin_unlock(&swap_lock);

	current->flags |= PF_OOM_ORIGIN;
	err = try_to_unuse(type);
	current->flags &= ~PF_OOM_ORIGIN;

	if (err) {
		/* re-insert swap space back into swap_list */
		spin_lock(&swap_lock);
		if (p->prio < 0)
			p->prio = --least_priority;
		prev = -1;
		for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
			if (p->prio >= swap_info[i]->prio)
				break;
			prev = i;
		}
		p->next = i;
		if (prev < 0)
			swap_list.head = swap_list.next = type;
		else
			swap_info[prev]->next = type;
		nr_swap_pages += p->pages;
		total_swap_pages += p->pages;
		p->flags |= SWP_WRITEOK;
		spin_unlock(&swap_lock);
		goto out_dput;
	}

	/* wait for any unplug function to finish */
	down_write(&swap_unplug_sem);
	up_write(&swap_unplug_sem);

	destroy_swap_extents(p);
	if (p->flags & SWP_CONTINUED)
		free_swap_count_continuations(p);

	mutex_lock(&swapon_mutex);
	spin_lock(&swap_lock);
	drain_mmlist();

	/* wait for anyone still in scan_swap_map */
	p->highest_bit = 0;		/* cuts scans short */
	while (p->flags >= SWP_SCANNING) {
		spin_unlock(&swap_lock);
		schedule_timeout_uninterruptible(1);
		spin_lock(&swap_lock);
	}

	swap_file = p->swap_file;
	p->swap_file = NULL;
	p->max = 0;
	swap_map = p->swap_map;
	p->swap_map = NULL;
	p->flags = 0;
	spin_unlock(&swap_lock);
	mutex_unlock(&swapon_mutex);
	vfree(swap_map);
	/* Destroy swap account information */
	swap_cgroup_swapoff(type);

	inode = mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		struct block_device *bdev = I_BDEV(inode);
		set_blocksize(bdev, p->old_block_size);
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	} else {
		mutex_lock(&inode->i_mutex);
		inode->i_flags &= ~S_SWAPFILE;
		mutex_unlock(&inode->i_mutex);
	}
	filp_close(swap_file, NULL);
	err = 0;
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

out_dput:
	filp_close(victim, NULL);
out:
	return err;
}

#ifdef CONFIG_PROC_FS
struct proc_swaps {
	struct seq_file seq;
	int event;
};

static unsigned swaps_poll(struct file *file, poll_table *wait)
{
	struct proc_swaps *s = file->private_data;

	poll_wait(file, &proc_poll_wait, wait);

	if (s->event != atomic_read(&proc_poll_event)) {
		s->event = atomic_read(&proc_poll_event);
		return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
	}

	return POLLIN | POLLRDNORM;
}

/* iterator */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
	struct swap_info_struct *si;
	int type;
	loff_t l = *pos;

	mutex_lock(&swapon_mutex);

	if (!l)
		return SEQ_START_TOKEN;

	for (type = 0; type < nr_swapfiles; type++) {
		smp_rmb();	/* read nr_swapfiles before swap_info[type] */
		si = swap_info[type];
		if (!(si->flags & SWP_USED) || !si->swap_map)
			continue;
		if (!--l)
			return si;
	}

	return NULL;
}

static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
	struct swap_info_struct *si = v;
	int type;

	if (v == SEQ_START_TOKEN)
		type = 0;
	else
		type = si->type + 1;

	for (; type < nr_swapfiles; type++) {
		smp_rmb();	/* read nr_swapfiles before swap_info[type] */
		si = swap_info[type];
		if (!(si->flags & SWP_USED) || !si->swap_map)
			continue;
		++*pos;
		return si;
	}

	return NULL;
}

static void swap_stop(struct seq_file *swap, void *v)
{
	mutex_unlock(&swapon_mutex);
}

static int swap_show(struct seq_file *swap, void *v)
{
	struct swap_info_struct *si = v;
	struct file *file;
	int len;

	if (si == SEQ_START_TOKEN) {
		seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
		return 0;
	}

	file = si->swap_file;
	len = seq_path(swap, &file->f_path, " \t\n\\");
	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
			len < 40 ? 40 - len : 1, " ",
			S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
				"partition" : "file\t",
			si->pages << (PAGE_SHIFT - 10),
			si->inuse_pages << (PAGE_SHIFT - 10),
			si->prio);
	return 0;
}

static const struct seq_operations swaps_op = {
	.start =	swap_start,
	.next =		swap_next,
	.stop =		swap_stop,
	.show =		swap_show
};

static int swaps_open(struct inode *inode, struct file *file)
{
	struct proc_swaps *s;
	int ret;

	s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
	if (!s)
		return -ENOMEM;

	file->private_data = s;

	ret = seq_open(file, &swaps_op);
	if (ret) {
		kfree(s);
		return ret;
	}

	s->seq.private = s;
	s->event = atomic_read(&proc_poll_event);
	return ret;
}

static const struct file_operations proc_swaps_operations = {
	.open		= swaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
	.poll		= swaps_poll,
};

static int __init procswaps_init(void)
{
	proc_create("swaps", 0, NULL, &proc_swaps_operations);
	return 0;
}
__initcall(procswaps_init);
#endif /* CONFIG_PROC_FS */

#ifdef MAX_SWAPFILES_CHECK
static int __init max_swapfiles_check(void)
{
	MAX_SWAPFILES_CHECK();
	return 0;
}
late_initcall(max_swapfiles_check);
#endif

/*
 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
 *
 * The swapon system call
 */
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
	struct swap_info_struct *p;
	char *name = NULL;
	struct block_device *bdev = NULL;
	struct file *swap_file = NULL;
	struct address_space *mapping;
	unsigned int type;
	int i, prev;
	int error;
	union swap_header *swap_header;
	unsigned int nr_good_pages;
	int nr_extents = 0;
	sector_t span;
	unsigned long maxpages;
	unsigned long swapfilepages;
	unsigned char *swap_map = NULL;
	struct page *page = NULL;
	struct inode *inode = NULL;
	int did_down = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		if (!(swap_info[type]->flags & SWP_USED))
			break;
	}
	error = -EPERM;
	if (type >= MAX_SWAPFILES) {
		spin_unlock(&swap_lock);
		kfree(p);
		goto out;
	}
	if (type >= nr_swapfiles) {
		p->type = type;
		swap_info[type] = p;
		/*
		 * Write swap_info[type] before nr_swapfiles, in case a
		 * racing procfs swap_start() or swap_next() is reading them.
		 * (We never shrink nr_swapfiles, we never free this entry.)
		 */
		smp_wmb();
		nr_swapfiles++;
	} else {
		kfree(p);
		p = swap_info[type];
		/*
		 * Do not memset this entry: a racing procfs swap_next()
		 * would be relying on p->type to remain valid.
		 */
	}
	INIT_LIST_HEAD(&p->first_swap_extent.list);
	p->flags = SWP_USED;
	p->next = -1;
	spin_unlock(&swap_lock);

	name = getname(specialfile);
	error = PTR_ERR(name);
	if (IS_ERR(name)) {
		name = NULL;
		goto bad_swap_2;
	}
	swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
	error = PTR_ERR(swap_file);
	if (IS_ERR(swap_file)) {
		swap_file = NULL;
		goto bad_swap_2;
	}

	p->swap_file = swap_file;
	mapping = swap_file->f_mapping;
	inode = mapping->host;

	error = -EBUSY;
	for (i = 0; i < nr_swapfiles; i++) {
		struct swap_info_struct *q = swap_info[i];

		if (i == type || !q->swap_file)
			continue;
		if (mapping == q->swap_file->f_mapping)
			goto bad_swap;
	}

	error = -EINVAL;
	if (S_ISBLK(inode->i_mode)) {
		bdev = bdgrab(I_BDEV(inode));
		error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				   sys_swapon);
		if (error < 0) {
			bdev = NULL;
			error = -EINVAL;
			goto bad_swap;
		}
		p->old_block_size = block_size(bdev);
		error = set_blocksize(bdev, PAGE_SIZE);
		if (error < 0)
			goto bad_swap;
		p->bdev = bdev;
		p->flags |= SWP_BLKDEV;
	} else if (S_ISREG(inode->i_mode)) {
		p->bdev = inode->i_sb->s_bdev;
		mutex_lock(&inode->i_mutex);
		did_down = 1;
		if (IS_SWAPFILE(inode)) {
			error = -EBUSY;
			goto bad_swap;
		}
	} else {
		goto bad_swap;
	}

	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;

	/*
	 * Read the swap header.
	 */
	if (!mapping->a_ops->readpage) {
		error = -EINVAL;
		goto bad_swap;
	}
	page = read_mapping_page(mapping, 0, swap_file);
	if (IS_ERR(page)) {
		error = PTR_ERR(page);
		goto bad_swap;
	}
	swap_header = kmap(page);

	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
		printk(KERN_ERR "Unable to find swap-space signature\n");
		error = -EINVAL;
		goto bad_swap;
	}

	/* swap partition endianness hack... */
	if (swab32(swap_header->info.version) == 1) {
		swab32s(&swap_header->info.version);
		swab32s(&swap_header->info.last_page);
		swab32s(&swap_header->info.nr_badpages);
		for (i = 0; i < swap_header->info.nr_badpages; i++)
			swab32s(&swap_header->info.badpages[i]);
	}
	/* Check the swap header's sub-version */
	if (swap_header->info.version != 1) {
		printk(KERN_WARNING
		       "Unable to handle swap header version %d\n",
		       swap_header->info.version);
		error = -EINVAL;
		goto bad_swap;
	}

	p->lowest_bit  = 1;
	p->cluster_next = 1;
	p->cluster_nr = 0;

	/*
	 * Find out how many pages are allowed for a single swap
	 * device. There are two limiting factors: 1) the number
	 * of bits for the swap offset in the swp_entry_t type, and
	 * 2) the number of bits in the swap pte as defined by
	 * the different architectures. In order to find the
	 * largest possible bit mask, a swap entry with swap type 0
	 * and swap offset ~0UL is created, encoded to a swap pte,
	 * decoded to a swp_entry_t again, and finally the swap
	 * offset is extracted. This will mask all the bits from
	 * the initial ~0UL mask that can't be encoded in either
	 * the swp_entry_t or the architecture definition of a
	 * swap pte.
	 */
	maxpages = swp_offset(pte_to_swp_entry(
			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
	if (maxpages > swap_header->info.last_page) {
		maxpages = swap_header->info.last_page + 1;
		/* p->max is an unsigned int: don't overflow it */
		if ((unsigned int)maxpages == 0)
			maxpages = UINT_MAX;
	}
	p->highest_bit = maxpages - 1;

	error = -EINVAL;
	if (!maxpages)
		goto bad_swap;
	if (swapfilepages && maxpages > swapfilepages) {
		printk(KERN_WARNING
		       "Swap area shorter than signature indicates\n");
		goto bad_swap;
	}
	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
		goto bad_swap;
	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
		goto bad_swap;

	/* OK, set up the swap map and apply the bad block list */
	swap_map = vmalloc(maxpages);
	if (!swap_map) {
		error = -ENOMEM;
		goto bad_swap;
	}

	memset(swap_map, 0, maxpages);
	nr_good_pages = maxpages - 1;	/* omit header page */

	for (i = 0; i < swap_header->info.nr_badpages; i++) {
		unsigned int page_nr = swap_header->info.badpages[i];
		if (page_nr == 0 || page_nr > swap_header->info.last_page) {
			error = -EINVAL;
			goto bad_swap;
		}
		if (page_nr < maxpages) {
			swap_map[page_nr] = SWAP_MAP_BAD;
			nr_good_pages--;
		}
	}

	error = swap_cgroup_swapon(type, maxpages);
	if (error)
		goto bad_swap;

	if (nr_good_pages) {
		swap_map[0] = SWAP_MAP_BAD;
		p->max = maxpages;
		p->pages = nr_good_pages;
		nr_extents = setup_swap_extents(p, &span);
		if (nr_extents < 0) {
			error = nr_extents;
			goto bad_swap;
		}
		nr_good_pages = p->pages;
	}
	if (!nr_good_pages) {
		printk(KERN_WARNING "Empty swap-file\n");
		error = -EINVAL;
		goto bad_swap;
	}

	if (p->bdev) {
		if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
			p->flags |= SWP_SOLIDSTATE;
			p->cluster_next = 1 + (random32() % p->highest_bit);
		}
		if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
			p->flags |= SWP_DISCARDABLE;
	}

	mutex_lock(&swapon_mutex);
	spin_lock(&swap_lock);
	if (swap_flags & SWAP_FLAG_PREFER)
		p->prio =
		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
	else
		p->prio = --least_priority;
	p->swap_map = swap_map;
	p->flags |= SWP_WRITEOK;
	nr_swap_pages += nr_good_pages;
	total_swap_pages += nr_good_pages;

	printk(KERN_INFO "Adding %uk swap on %s.  "
			"Priority:%d extents:%d across:%lluk %s%s\n",
		nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
		(p->flags & SWP_DISCARDABLE) ? "D" : "");

	/* insert swap space into swap_list */
	prev = -1;
	for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
		if (p->prio >= swap_info[i]->prio)
			break;
		prev = i;
	}
	p->next = i;
	if (prev < 0)
		swap_list.head = swap_list.next = type;
	else
		swap_info[prev]->next = type;
	spin_unlock(&swap_lock);
	mutex_unlock(&swapon_mutex);
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

	error = 0;
	goto out;
bad_swap:
	if (bdev) {
		set_blocksize(bdev, p->old_block_size);
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	}
	destroy_swap_extents(p);
	swap_cgroup_swapoff(type);
bad_swap_2:
	spin_lock(&swap_lock);
	p->swap_file = NULL;
	p->flags = 0;
	spin_unlock(&swap_lock);
	vfree(swap_map);
	if (swap_file)
		filp_close(swap_file, NULL);
out:
	if (page && !IS_ERR(page)) {
		kunmap(page);
		page_cache_release(page);
	}
	if (name)
		putname(name);
	if (did_down) {
		if (!error)
			inode->i_flags |= S_SWAPFILE;
		mutex_unlock(&inode->i_mutex);
	}
	return error;
}

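/*
 * si_swapinfo() fills in the freeswap/totalswap fields of a sysinfo
 * structure.  Areas that are being swapped off (SWP_USED set but
 * SWP_WRITEOK clear) still have their in-use pages added back, so
 * the reported totals stay stable while swapoff is working.
 */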
void si_swapinfo(struct sysinfo *val)
{
	unsigned int type;
	unsigned long nr_to_be_unused = 0;

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		struct swap_info_struct *si = swap_info[type];

		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
			nr_to_be_unused += si->inuse_pages;
	}
	val->freeswap = nr_swap_pages + nr_to_be_unused;
	val->totalswap = total_swap_pages + nr_to_be_unused;
	spin_unlock(&swap_lock);
}

/*
 * Verify that a swap entry is valid and increment its swap map count.
 *
 * Returns error code in following case.
 * - success -> 0
 * - swp_entry is invalid -> EINVAL
 * - swp_entry is migration entry -> EINVAL
 * - swap-cache reference is requested but there is already one. -> EEXIST
 * - swap-cache reference is requested but the entry is not used. -> ENOENT
 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
 */
static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
	struct swap_info_struct *p;
	unsigned long offset, type;
	unsigned char count;
	unsigned char has_cache;
	int err = -EINVAL;

	if (non_swap_entry(entry))
		goto out;

	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_file;
	p = swap_info[type];
	offset = swp_offset(entry);

	spin_lock(&swap_lock);
	if (unlikely(offset >= p->max))
		goto unlock_out;

	count = p->swap_map[offset];
	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;
	err = 0;

	if (usage == SWAP_HAS_CACHE) {

		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
		if (!has_cache && count)
			has_cache = SWAP_HAS_CACHE;
		else if (has_cache)		/* someone else added cache */
			err = -EEXIST;
		else				/* no users remaining */
			err = -ENOENT;

	} else if (count || has_cache) {

		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
			count += usage;
		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
			err = -EINVAL;
		else if (swap_count_continued(p, offset, count))
			count = COUNT_CONTINUED;
		else
			err = -ENOMEM;
	} else
		err = -ENOENT;			/* unused swap entry */

	p->swap_map[offset] = count | has_cache;

unlock_out:
	spin_unlock(&swap_lock);
out:
	return err;

bad_file:
	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
	goto out;
}

/*
 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
 * (in which case its reference count is never incremented).
 */
void swap_shmem_alloc(swp_entry_t entry)
{
	__swap_duplicate(entry, SWAP_MAP_SHMEM);
}

/*
 * Increase reference count of swap entry by 1.
 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
 * but could not be atomically allocated.  Returns 0, just as if it succeeded,
 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
 * might occur if a page table entry has got corrupted.
 */
int swap_duplicate(swp_entry_t entry)
{
	int err = 0;

	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
		err = add_swap_count_continuation(entry, GFP_ATOMIC);
	return err;
}

/*
 * @entry: swap entry for which we allocate swap cache.
 *
 * Called when allocating swap cache for existing swap entry,
 * This can return error codes. Returns 0 at success.
 * -EBUSY means there is a swap cache.
 * Note: return code is different from swap_duplicate().
 */
int swapcache_prepare(swp_entry_t entry)
{
	return __swap_duplicate(entry, SWAP_HAS_CACHE);
}

/*
 * swap_lock prevents swap_map being freed. Don't grab an extra
 * reference on the swaphandle, it doesn't matter if it becomes unused.
 */
int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
{
	struct swap_info_struct *si;
	int our_page_cluster = page_cluster;
	pgoff_t target, toff;
	pgoff_t base, end;
	int nr_pages = 0;

	if (!our_page_cluster)	/* no readahead */
		return 0;

	si = swap_info[swp_type(entry)];
	target = swp_offset(entry);
	base = (target >> our_page_cluster) << our_page_cluster;
	end = base + (1 << our_page_cluster);
	if (!base)		/* first page is swap header */
		base++;

	spin_lock(&swap_lock);
	if (end > si->max)	/* don't go beyond end of map */
		end = si->max;

	/* Count contiguous allocated slots above our target */
	for (toff = target; ++toff < end; nr_pages++) {
		/* Don't read in free or bad pages */
		if (!si->swap_map[toff])
			break;
		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
			break;
	}
	/* Count contiguous allocated slots below our target */
	for (toff = target; --toff >= base; nr_pages++) {
		/* Don't read in free or bad pages */
		if (!si->swap_map[toff])
			break;
		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
			break;
	}
	spin_unlock(&swap_lock);

	/*
	 * Indicate starting offset, and return number of pages in
	 * contiguous allocated slots neighbouring our target.
	 */
	*offset = ++toff;
	return nr_pages? ++nr_pages: 0;
}

/*
 * add_swap_count_continuation - called when a swap count is duplicated
 * beyond SWAP_MAP_MAX, it allocates a new page and links it to a list of
 * pages of the original swap_map, for continuation of the count.
 *
 * These continuation pages are seldom referenced: the common paths all work
 * on the original swap_map, only referring to a continuation page when the
 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
 *
 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
 * can be called after dropping locks.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
	struct swap_info_struct *si;
	struct page *head;
	struct page *page;
	struct page *list_page;
	pgoff_t offset;
	unsigned char count;

	/*
	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
	 */
	page = alloc_page(gfp_mask | __GFP_HIGHMEM);

	si = swap_info_get(entry);
	if (!si) {
		/*
		 * An acceptable race has occurred since the failing
		 * __swap_duplicate(): the swap entry has been freed,
		 * perhaps even the whole swap_map cleared for swapoff.
		 */
		goto outer;
	}

	offset = swp_offset(entry);
	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;

	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
		/*
		 * The higher the swap count, the more likely it is that tasks
		 * will race to add swap count continuation: we need to avoid
		 * over-provisioning.
		 */
		goto out;
	}

	if (!page) {
		spin_unlock(&swap_lock);
		return -ENOMEM;
	}

	/*
	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
	 * no architecture is using highmem pages for kernel pagetables: so it
	 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
	 */
	head = vmalloc_to_page(si->swap_map + offset);
	offset &= ~PAGE_MASK;

	/*
	 * Page allocation does not initialize the page's lru field,
	 * but it does always reset its private field.
	 */
	if (!page_private(head)) {
		BUG_ON(count & COUNT_CONTINUED);
		INIT_LIST_HEAD(&head->lru);
		set_page_private(head, SWP_CONTINUED);
		si->flags |= SWP_CONTINUED;
	}

	list_for_each_entry(list_page, &head->lru, lru) {
		unsigned char *map;

		/*
		 * If the previous map said no continuation, but we've found
		 * a continuation page, free our allocation and use this one.
		 */
		if (!(count & COUNT_CONTINUED))
			goto out;

		map = kmap_atomic(list_page, KM_USER0) + offset;
		count = *map;
		kunmap_atomic(map, KM_USER0);

		/*
		 * If this continuation count now has some space in it,
		 * free our allocation and use this one.
		 */
		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
			goto out;
	}

	list_add_tail(&page->lru, &head->lru);
	page = NULL;			/* now it's attached, don't free it */
out:
	spin_unlock(&swap_lock);
outer:
	if (page)
		__free_page(page);
	return 0;
}

/*
 * swap_count_continued - when the original swap_map count is incremented
 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
 * into, carry if so, or else fail until a new continuation page is allocated;
 * when the original swap_map count is decremented from 0 with continuation,
 * borrow from the continuation and report whether it still holds more.
 * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
 */
static bool swap_count_continued(struct swap_info_struct *si,
				 pgoff_t offset, unsigned char count)
{
	struct page *head;
	struct page *page;
	unsigned char *map;

	head = vmalloc_to_page(si->swap_map + offset);
	if (page_private(head) != SWP_CONTINUED) {
		BUG_ON(count & COUNT_CONTINUED);
		return false;		/* need to add count continuation */
	}

	offset &= ~PAGE_MASK;
	page = list_entry(head->lru.next, struct page, lru);
	map = kmap_atomic(page, KM_USER0) + offset;

	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
		goto init_map;		/* jump over SWAP_CONT_MAX checks */

	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
		/*
		 * Think of how you add 1 to 999
		 */
		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
			kunmap_atomic(map, KM_USER0);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page, KM_USER0) + offset;
		}
		if (*map == SWAP_CONT_MAX) {
			kunmap_atomic(map, KM_USER0);
			page = list_entry(page->lru.next, struct page, lru);
			if (page == head)
				return false;	/* add count continuation */
			map = kmap_atomic(page, KM_USER0) + offset;
init_map:		*map = 0;		/* we didn't zero the page */
		}
		*map += 1;
		kunmap_atomic(map, KM_USER0);
		page = list_entry(page->lru.prev, struct page, lru);
		while (page != head) {
			map = kmap_atomic(page, KM_USER0) + offset;
			*map = COUNT_CONTINUED;
			kunmap_atomic(map, KM_USER0);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		return true;			/* incremented */

	} else {				/* decrementing */
		/*
		 * Think of how you subtract 1 from 1000
		 */
		BUG_ON(count != COUNT_CONTINUED);
		while (*map == COUNT_CONTINUED) {
			kunmap_atomic(map, KM_USER0);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page, KM_USER0) + offset;
		}
		BUG_ON(*map == 0);
		*map -= 1;
		if (*map == 0)
			count = 0;
		kunmap_atomic(map, KM_USER0);
		page = list_entry(page->lru.prev, struct page, lru);
		while (page != head) {
			map = kmap_atomic(page, KM_USER0) + offset;
			*map = SWAP_CONT_MAX | count;
			count = COUNT_CONTINUED;
			kunmap_atomic(map, KM_USER0);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		return count == COUNT_CONTINUED;
	}
}

/*
 * free_swap_count_continuations - swapoff frees all the continuation pages
 * if they're no longer needed by any subsequent swap_duplicate or
 * swap_entry_free.
 */
static void free_swap_count_continuations(struct swap_info_struct *si)
{
	pgoff_t offset;

	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
		struct page *head;
		head = vmalloc_to_page(si->swap_map + offset);
		if (page_private(head)) {
			struct list_head *this, *next;
			list_for_each_safe(this, next, &head->lru) {
				struct page *page;
				page = list_entry(this, struct page, lru);
				list_del(this);
				__free_page(page);
			}
		}
	}
}