1
2
3
4
5
6
7
8#include <linux/mm.h>
9#include <linux/hugetlb.h>
10#include <linux/mman.h>
11#include <linux/slab.h>
12#include <linux/kernel_stat.h>
13#include <linux/swap.h>
14#include <linux/vmalloc.h>
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/shmem_fs.h>
18#include <linux/blkdev.h>
19#include <linux/random.h>
20#include <linux/writeback.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/init.h>
24#include <linux/module.h>
25#include <linux/ksm.h>
26#include <linux/rmap.h>
27#include <linux/security.h>
28#include <linux/backing-dev.h>
29#include <linux/mutex.h>
30#include <linux/capability.h>
31#include <linux/syscalls.h>
32#include <linux/memcontrol.h>
33#include <linux/poll.h>
34#include <linux/oom.h>
35
36#include <asm/pgtable.h>
37#include <asm/tlbflush.h>
38#include <linux/swapops.h>
39#include <linux/page_cgroup.h>
40
/* Forward declarations of helpers that are called before their definitions. */
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);

/* Spinlock held around updates to the swap list, counters and swap maps. */
static DEFINE_SPINLOCK(swap_lock);
/* Exclusive upper bound for valid swap type indices into swap_info[]. */
static unsigned int nr_swapfiles;
/* Decremented when a slot is handed out, incremented when one is freed. */
long nr_swap_pages;
/* Sum of ->pages over the areas that have been enabled. */
long total_swap_pages;
/* Decremented to produce a priority when an area is enabled with prio < 0. */
static int least_priority;

/* Message fragments printed by swap_info_get() when an entry is rejected. */
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/* Type indices into swap_info[]; a negative value means "none". */
static struct swap_list_t swap_list = {-1, -1};

/* Per-area descriptors, indexed by swap type. */
static struct swap_info_struct *swap_info[MAX_SWAPFILES];

/* Taken by swapoff around the final teardown of an area. */
static DEFINE_MUTEX(swapon_mutex);

/* NOTE(review): presumably used for poll() on a proc file; no user is visible in this part of the file. */
static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);

/* NOTE(review): presumably an event counter paired with proc_poll_wait; confirm against its users. */
static atomic_t proc_poll_event = ATOMIC_INIT(0);
66
67static inline unsigned char swap_count(unsigned char ent)
68{
69 return ent & ~SWAP_HAS_CACHE;
70}
71
72
73static int
74__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
75{
76 swp_entry_t entry = swp_entry(si->type, offset);
77 struct page *page;
78 int ret = 0;
79
80 page = find_get_page(&swapper_space, entry.val);
81 if (!page)
82 return 0;
83
84
85
86
87
88
89
90 if (trylock_page(page)) {
91 ret = try_to_free_swap(page);
92 unlock_page(page);
93 }
94 page_cache_release(page);
95 return ret;
96}
97
98
99
100
101
102static int discard_swap(struct swap_info_struct *si)
103{
104 struct swap_extent *se;
105 sector_t start_block;
106 sector_t nr_blocks;
107 int err = 0;
108
109
110 se = &si->first_swap_extent;
111 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
112 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
113 if (nr_blocks) {
114 err = blkdev_issue_discard(si->bdev, start_block,
115 nr_blocks, GFP_KERNEL, 0);
116 if (err)
117 return err;
118 cond_resched();
119 }
120
121 list_for_each_entry(se, &si->first_swap_extent.list, list) {
122 start_block = se->start_block << (PAGE_SHIFT - 9);
123 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
124
125 err = blkdev_issue_discard(si->bdev, start_block,
126 nr_blocks, GFP_KERNEL, 0);
127 if (err)
128 break;
129
130 cond_resched();
131 }
132 return err;
133}
134
135
136
137
138
139static void discard_swap_cluster(struct swap_info_struct *si,
140 pgoff_t start_page, pgoff_t nr_pages)
141{
142 struct swap_extent *se = si->curr_swap_extent;
143 int found_extent = 0;
144
145 while (nr_pages) {
146 struct list_head *lh;
147
148 if (se->start_page <= start_page &&
149 start_page < se->start_page + se->nr_pages) {
150 pgoff_t offset = start_page - se->start_page;
151 sector_t start_block = se->start_block + offset;
152 sector_t nr_blocks = se->nr_pages - offset;
153
154 if (nr_blocks > nr_pages)
155 nr_blocks = nr_pages;
156 start_page += nr_blocks;
157 nr_pages -= nr_blocks;
158
159 if (!found_extent++)
160 si->curr_swap_extent = se;
161
162 start_block <<= PAGE_SHIFT - 9;
163 nr_blocks <<= PAGE_SHIFT - 9;
164 if (blkdev_issue_discard(si->bdev, start_block,
165 nr_blocks, GFP_NOIO, 0))
166 break;
167 }
168
169 lh = se->list.next;
170 se = list_entry(lh, struct swap_extent, list);
171 }
172}
173
/*
 * Action callback handed to wait_on_bit() by scan_swap_map(): yield the CPU
 * and report 0.  @word is not used.
 */
static int wait_for_discard(void *word)
{
	(void)word;

	schedule();
	return 0;
}
179
/* Length, in slots, of the run of free slots scan_swap_map() searches for. */
#define SWAPFILE_CLUSTER	256
/* Number of scan iterations scan_swap_map() performs between cond_resched() calls. */
#define LATENCY_LIMIT		256
182
183static unsigned long scan_swap_map(struct swap_info_struct *si,
184 unsigned char usage)
185{
186 unsigned long offset;
187 unsigned long scan_base;
188 unsigned long last_in_cluster = 0;
189 int latency_ration = LATENCY_LIMIT;
190 int found_free_cluster = 0;
191
192
193
194
195
196
197
198
199
200
201
202
203 si->flags += SWP_SCANNING;
204 scan_base = offset = si->cluster_next;
205
206 if (unlikely(!si->cluster_nr--)) {
207 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
208 si->cluster_nr = SWAPFILE_CLUSTER - 1;
209 goto checks;
210 }
211 if (si->flags & SWP_DISCARDABLE) {
212
213
214
215
216
217
218
219 if (si->lowest_alloc)
220 goto checks;
221 si->lowest_alloc = si->max;
222 si->highest_alloc = 0;
223 }
224 spin_unlock(&swap_lock);
225
226
227
228
229
230
231
232
233
234 if (!(si->flags & SWP_SOLIDSTATE))
235 scan_base = offset = si->lowest_bit;
236 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
237
238
239 for (; last_in_cluster <= si->highest_bit; offset++) {
240 if (si->swap_map[offset])
241 last_in_cluster = offset + SWAPFILE_CLUSTER;
242 else if (offset == last_in_cluster) {
243 spin_lock(&swap_lock);
244 offset -= SWAPFILE_CLUSTER - 1;
245 si->cluster_next = offset;
246 si->cluster_nr = SWAPFILE_CLUSTER - 1;
247 found_free_cluster = 1;
248 goto checks;
249 }
250 if (unlikely(--latency_ration < 0)) {
251 cond_resched();
252 latency_ration = LATENCY_LIMIT;
253 }
254 }
255
256 offset = si->lowest_bit;
257 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
258
259
260 for (; last_in_cluster < scan_base; offset++) {
261 if (si->swap_map[offset])
262 last_in_cluster = offset + SWAPFILE_CLUSTER;
263 else if (offset == last_in_cluster) {
264 spin_lock(&swap_lock);
265 offset -= SWAPFILE_CLUSTER - 1;
266 si->cluster_next = offset;
267 si->cluster_nr = SWAPFILE_CLUSTER - 1;
268 found_free_cluster = 1;
269 goto checks;
270 }
271 if (unlikely(--latency_ration < 0)) {
272 cond_resched();
273 latency_ration = LATENCY_LIMIT;
274 }
275 }
276
277 offset = scan_base;
278 spin_lock(&swap_lock);
279 si->cluster_nr = SWAPFILE_CLUSTER - 1;
280 si->lowest_alloc = 0;
281 }
282
283checks:
284 if (!(si->flags & SWP_WRITEOK))
285 goto no_page;
286 if (!si->highest_bit)
287 goto no_page;
288 if (offset > si->highest_bit)
289 scan_base = offset = si->lowest_bit;
290
291
292 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
293 int swap_was_freed;
294 spin_unlock(&swap_lock);
295 swap_was_freed = __try_to_reclaim_swap(si, offset);
296 spin_lock(&swap_lock);
297
298 if (swap_was_freed)
299 goto checks;
300 goto scan;
301 }
302
303 if (si->swap_map[offset])
304 goto scan;
305
306 if (offset == si->lowest_bit)
307 si->lowest_bit++;
308 if (offset == si->highest_bit)
309 si->highest_bit--;
310 si->inuse_pages++;
311 if (si->inuse_pages == si->pages) {
312 si->lowest_bit = si->max;
313 si->highest_bit = 0;
314 }
315 si->swap_map[offset] = usage;
316 si->cluster_next = offset + 1;
317 si->flags -= SWP_SCANNING;
318
319 if (si->lowest_alloc) {
320
321
322
323
324 if (found_free_cluster) {
325
326
327
328
329
330
331
332 if (offset < si->highest_alloc &&
333 si->lowest_alloc <= last_in_cluster)
334 last_in_cluster = si->lowest_alloc - 1;
335 si->flags |= SWP_DISCARDING;
336 spin_unlock(&swap_lock);
337
338 if (offset < last_in_cluster)
339 discard_swap_cluster(si, offset,
340 last_in_cluster - offset + 1);
341
342 spin_lock(&swap_lock);
343 si->lowest_alloc = 0;
344 si->flags &= ~SWP_DISCARDING;
345
346 smp_mb();
347 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
348
349 } else if (si->flags & SWP_DISCARDING) {
350
351
352
353
354
355
356 spin_unlock(&swap_lock);
357 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
358 wait_for_discard, TASK_UNINTERRUPTIBLE);
359 spin_lock(&swap_lock);
360 } else {
361
362
363
364
365
366 if (offset < si->lowest_alloc)
367 si->lowest_alloc = offset;
368 if (offset > si->highest_alloc)
369 si->highest_alloc = offset;
370 }
371 }
372 return offset;
373
374scan:
375 spin_unlock(&swap_lock);
376 while (++offset <= si->highest_bit) {
377 if (!si->swap_map[offset]) {
378 spin_lock(&swap_lock);
379 goto checks;
380 }
381 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
382 spin_lock(&swap_lock);
383 goto checks;
384 }
385 if (unlikely(--latency_ration < 0)) {
386 cond_resched();
387 latency_ration = LATENCY_LIMIT;
388 }
389 }
390 offset = si->lowest_bit;
391 while (++offset < scan_base) {
392 if (!si->swap_map[offset]) {
393 spin_lock(&swap_lock);
394 goto checks;
395 }
396 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
397 spin_lock(&swap_lock);
398 goto checks;
399 }
400 if (unlikely(--latency_ration < 0)) {
401 cond_resched();
402 latency_ration = LATENCY_LIMIT;
403 }
404 }
405 spin_lock(&swap_lock);
406
407no_page:
408 si->flags -= SWP_SCANNING;
409 return 0;
410}
411
412swp_entry_t get_swap_page(void)
413{
414 struct swap_info_struct *si;
415 pgoff_t offset;
416 int type, next;
417 int wrapped = 0;
418
419 spin_lock(&swap_lock);
420 if (nr_swap_pages <= 0)
421 goto noswap;
422 nr_swap_pages--;
423
424 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
425 si = swap_info[type];
426 next = si->next;
427 if (next < 0 ||
428 (!wrapped && si->prio != swap_info[next]->prio)) {
429 next = swap_list.head;
430 wrapped++;
431 }
432
433 if (!si->highest_bit)
434 continue;
435 if (!(si->flags & SWP_WRITEOK))
436 continue;
437
438 swap_list.next = next;
439
440 offset = scan_swap_map(si, SWAP_HAS_CACHE);
441 if (offset) {
442 spin_unlock(&swap_lock);
443 return swp_entry(type, offset);
444 }
445 next = swap_list.next;
446 }
447
448 nr_swap_pages++;
449noswap:
450 spin_unlock(&swap_lock);
451 return (swp_entry_t) {0};
452}
453
454
455swp_entry_t get_swap_page_of_type(int type)
456{
457 struct swap_info_struct *si;
458 pgoff_t offset;
459
460 spin_lock(&swap_lock);
461 si = swap_info[type];
462 if (si && (si->flags & SWP_WRITEOK)) {
463 nr_swap_pages--;
464
465 offset = scan_swap_map(si, 1);
466 if (offset) {
467 spin_unlock(&swap_lock);
468 return swp_entry(type, offset);
469 }
470 nr_swap_pages++;
471 }
472 spin_unlock(&swap_lock);
473 return (swp_entry_t) {0};
474}
475
476static struct swap_info_struct *swap_info_get(swp_entry_t entry)
477{
478 struct swap_info_struct *p;
479 unsigned long offset, type;
480
481 if (!entry.val)
482 goto out;
483 type = swp_type(entry);
484 if (type >= nr_swapfiles)
485 goto bad_nofile;
486 p = swap_info[type];
487 if (!(p->flags & SWP_USED))
488 goto bad_device;
489 offset = swp_offset(entry);
490 if (offset >= p->max)
491 goto bad_offset;
492 if (!p->swap_map[offset])
493 goto bad_free;
494 spin_lock(&swap_lock);
495 return p;
496
497bad_free:
498 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
499 goto out;
500bad_offset:
501 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
502 goto out;
503bad_device:
504 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
505 goto out;
506bad_nofile:
507 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
508out:
509 return NULL;
510}
511
512static unsigned char swap_entry_free(struct swap_info_struct *p,
513 swp_entry_t entry, unsigned char usage)
514{
515 unsigned long offset = swp_offset(entry);
516 unsigned char count;
517 unsigned char has_cache;
518
519 count = p->swap_map[offset];
520 has_cache = count & SWAP_HAS_CACHE;
521 count &= ~SWAP_HAS_CACHE;
522
523 if (usage == SWAP_HAS_CACHE) {
524 VM_BUG_ON(!has_cache);
525 has_cache = 0;
526 } else if (count == SWAP_MAP_SHMEM) {
527
528
529
530
531 count = 0;
532 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
533 if (count == COUNT_CONTINUED) {
534 if (swap_count_continued(p, offset, count))
535 count = SWAP_MAP_MAX | COUNT_CONTINUED;
536 else
537 count = SWAP_MAP_MAX;
538 } else
539 count--;
540 }
541
542 if (!count)
543 mem_cgroup_uncharge_swap(entry);
544
545 usage = count | has_cache;
546 p->swap_map[offset] = usage;
547
548
549 if (!usage) {
550 struct gendisk *disk = p->bdev->bd_disk;
551 if (offset < p->lowest_bit)
552 p->lowest_bit = offset;
553 if (offset > p->highest_bit)
554 p->highest_bit = offset;
555 if (swap_list.next >= 0 &&
556 p->prio > swap_info[swap_list.next]->prio)
557 swap_list.next = p->type;
558 nr_swap_pages++;
559 p->inuse_pages--;
560 if ((p->flags & SWP_BLKDEV) &&
561 disk->fops->swap_slot_free_notify)
562 disk->fops->swap_slot_free_notify(p->bdev, offset);
563 }
564
565 return usage;
566}
567
568
569
570
571
572void swap_free(swp_entry_t entry)
573{
574 struct swap_info_struct *p;
575
576 p = swap_info_get(entry);
577 if (p) {
578 swap_entry_free(p, entry, 1);
579 spin_unlock(&swap_lock);
580 }
581}
582
583
584
585
586void swapcache_free(swp_entry_t entry, struct page *page)
587{
588 struct swap_info_struct *p;
589 unsigned char count;
590
591 p = swap_info_get(entry);
592 if (p) {
593 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
594 if (page)
595 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
596 spin_unlock(&swap_lock);
597 }
598}
599
600
601
602
603
604
605static inline int page_swapcount(struct page *page)
606{
607 int count = 0;
608 struct swap_info_struct *p;
609 swp_entry_t entry;
610
611 entry.val = page_private(page);
612 p = swap_info_get(entry);
613 if (p) {
614 count = swap_count(p->swap_map[swp_offset(entry)]);
615 spin_unlock(&swap_lock);
616 }
617 return count;
618}
619
620
621
622
623
624
625
/*
 * Decide whether the locked page @page has at most one user, counting both
 * its mappings and, for a swap cache page, its swap references.
 *
 * If there is exactly one user and the page is not under writeback, the
 * page is removed from the swap cache and marked dirty.  KSM pages are
 * never reported as reusable.  Returns non-zero when the total is <= 1.
 */
int reuse_swap_page(struct page *page)
{
	int users;

	VM_BUG_ON(!PageLocked(page));
	if (unlikely(PageKsm(page)))
		return 0;

	users = page_mapcount(page);
	if (users > 1 || !PageSwapCache(page))
		return users <= 1;

	users += page_swapcount(page);
	if (users == 1 && !PageWriteback(page)) {
		delete_from_swap_cache(page);
		SetPageDirty(page);
	}
	return users <= 1;
}
643
644
645
646
647
648int try_to_free_swap(struct page *page)
649{
650 VM_BUG_ON(!PageLocked(page));
651
652 if (!PageSwapCache(page))
653 return 0;
654 if (PageWriteback(page))
655 return 0;
656 if (page_swapcount(page))
657 return 0;
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674 if (!(gfp_allowed_mask & __GFP_IO))
675 return 0;
676
677 delete_from_swap_cache(page);
678 SetPageDirty(page);
679 return 1;
680}
681
682
683
684
685
686int free_swap_and_cache(swp_entry_t entry)
687{
688 struct swap_info_struct *p;
689 struct page *page = NULL;
690
691 if (non_swap_entry(entry))
692 return 1;
693
694 p = swap_info_get(entry);
695 if (p) {
696 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
697 page = find_get_page(&swapper_space, entry.val);
698 if (page && !trylock_page(page)) {
699 page_cache_release(page);
700 page = NULL;
701 }
702 }
703 spin_unlock(&swap_lock);
704 }
705 if (page) {
706
707
708
709
710 if (PageSwapCache(page) && !PageWriteback(page) &&
711 (!page_mapped(page) || vm_swap_full())) {
712 delete_from_swap_cache(page);
713 SetPageDirty(page);
714 }
715 unlock_page(page);
716 page_cache_release(page);
717 }
718 return p != NULL;
719}
720
721#ifdef CONFIG_CGROUP_MEM_RES_CTLR
722
723
724
725
726
727
728
729
730
731
732int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
733{
734 struct page *page;
735 struct swap_info_struct *p;
736 int count = 0;
737
738 page = find_get_page(&swapper_space, ent.val);
739 if (page)
740 count += page_mapcount(page);
741 p = swap_info_get(ent);
742 if (p) {
743 count += swap_count(p->swap_map[swp_offset(ent)]);
744 spin_unlock(&swap_lock);
745 }
746
747 *pagep = page;
748 return count;
749}
750#endif
751
752#ifdef CONFIG_HIBERNATION
753
754
755
756
757
758
759
760
761int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
762{
763 struct block_device *bdev = NULL;
764 int type;
765
766 if (device)
767 bdev = bdget(device);
768
769 spin_lock(&swap_lock);
770 for (type = 0; type < nr_swapfiles; type++) {
771 struct swap_info_struct *sis = swap_info[type];
772
773 if (!(sis->flags & SWP_WRITEOK))
774 continue;
775
776 if (!bdev) {
777 if (bdev_p)
778 *bdev_p = bdgrab(sis->bdev);
779
780 spin_unlock(&swap_lock);
781 return type;
782 }
783 if (bdev == sis->bdev) {
784 struct swap_extent *se = &sis->first_swap_extent;
785
786 if (se->start_block == offset) {
787 if (bdev_p)
788 *bdev_p = bdgrab(sis->bdev);
789
790 spin_unlock(&swap_lock);
791 bdput(bdev);
792 return type;
793 }
794 }
795 }
796 spin_unlock(&swap_lock);
797 if (bdev)
798 bdput(bdev);
799
800 return -ENODEV;
801}
802
803
804
805
806
807sector_t swapdev_block(int type, pgoff_t offset)
808{
809 struct block_device *bdev;
810
811 if ((unsigned int)type >= nr_swapfiles)
812 return 0;
813 if (!(swap_info[type]->flags & SWP_WRITEOK))
814 return 0;
815 return map_swap_entry(swp_entry(type, offset), &bdev);
816}
817
818
819
820
821
822
823
824unsigned int count_swap_pages(int type, int free)
825{
826 unsigned int n = 0;
827
828 spin_lock(&swap_lock);
829 if ((unsigned int)type < nr_swapfiles) {
830 struct swap_info_struct *sis = swap_info[type];
831
832 if (sis->flags & SWP_WRITEOK) {
833 n = sis->pages;
834 if (free)
835 n -= sis->inuse_pages;
836 }
837 }
838 spin_unlock(&swap_lock);
839 return n;
840}
841#endif
842
843
844
845
846
847
848static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
849 unsigned long addr, swp_entry_t entry, struct page *page)
850{
851 struct mem_cgroup *ptr;
852 spinlock_t *ptl;
853 pte_t *pte;
854 int ret = 1;
855
856 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
857 ret = -ENOMEM;
858 goto out_nolock;
859 }
860
861 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
862 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
863 if (ret > 0)
864 mem_cgroup_cancel_charge_swapin(ptr);
865 ret = 0;
866 goto out;
867 }
868
869 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
870 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
871 get_page(page);
872 set_pte_at(vma->vm_mm, addr, pte,
873 pte_mkold(mk_pte(page, vma->vm_page_prot)));
874 page_add_anon_rmap(page, vma, addr);
875 mem_cgroup_commit_charge_swapin(page, ptr);
876 swap_free(entry);
877
878
879
880
881 activate_page(page);
882out:
883 pte_unmap_unlock(pte, ptl);
884out_nolock:
885 return ret;
886}
887
/*
 * Scan the ptes of @pmd covering [@addr, @end) for the swap pte of @entry
 * and hand each hit to unuse_pte().
 *
 * Returns 0 if the whole range was scanned, otherwise the first non-zero
 * value returned by unuse_pte().
 */
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	pte_t swp_pte = swp_entry_to_pte(entry);
	pte_t *pte;
	int ret = 0;

	/* The scan maps the page table without taking the pte lock. */
	pte = pte_offset_map(pmd, addr);
	do {
		/*
		 * unuse_pte() maps and locks the pte itself and re-checks it,
		 * so unmap before the call and map again afterwards.
		 */
		if (unlikely(pte_same(*pte, swp_pte))) {
			pte_unmap(pte);
			ret = unuse_pte(vma, pmd, addr, entry, page);
			if (ret)
				goto out;
			pte = pte_offset_map(pmd, addr);
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	/* pte was advanced one past the last entry examined. */
	pte_unmap(pte - 1);
out:
	return ret;
}
923
924static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
925 unsigned long addr, unsigned long end,
926 swp_entry_t entry, struct page *page)
927{
928 pmd_t *pmd;
929 unsigned long next;
930 int ret;
931
932 pmd = pmd_offset(pud, addr);
933 do {
934 next = pmd_addr_end(addr, end);
935 if (unlikely(pmd_trans_huge(*pmd)))
936 continue;
937 if (pmd_none_or_clear_bad(pmd))
938 continue;
939 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
940 if (ret)
941 return ret;
942 } while (pmd++, addr = next, addr != end);
943 return 0;
944}
945
946static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
947 unsigned long addr, unsigned long end,
948 swp_entry_t entry, struct page *page)
949{
950 pud_t *pud;
951 unsigned long next;
952 int ret;
953
954 pud = pud_offset(pgd, addr);
955 do {
956 next = pud_addr_end(addr, end);
957 if (pud_none_or_clear_bad(pud))
958 continue;
959 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
960 if (ret)
961 return ret;
962 } while (pud++, addr = next, addr != end);
963 return 0;
964}
965
966static int unuse_vma(struct vm_area_struct *vma,
967 swp_entry_t entry, struct page *page)
968{
969 pgd_t *pgd;
970 unsigned long addr, end, next;
971 int ret;
972
973 if (page_anon_vma(page)) {
974 addr = page_address_in_vma(page, vma);
975 if (addr == -EFAULT)
976 return 0;
977 else
978 end = addr + PAGE_SIZE;
979 } else {
980 addr = vma->vm_start;
981 end = vma->vm_end;
982 }
983
984 pgd = pgd_offset(vma->vm_mm, addr);
985 do {
986 next = pgd_addr_end(addr, end);
987 if (pgd_none_or_clear_bad(pgd))
988 continue;
989 ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
990 if (ret)
991 return ret;
992 } while (pgd++, addr = next, addr != end);
993 return 0;
994}
995
996static int unuse_mm(struct mm_struct *mm,
997 swp_entry_t entry, struct page *page)
998{
999 struct vm_area_struct *vma;
1000 int ret = 0;
1001
1002 if (!down_read_trylock(&mm->mmap_sem)) {
1003
1004
1005
1006
1007 activate_page(page);
1008 unlock_page(page);
1009 down_read(&mm->mmap_sem);
1010 lock_page(page);
1011 }
1012 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1013 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1014 break;
1015 }
1016 up_read(&mm->mmap_sem);
1017 return (ret < 0)? ret: 0;
1018}
1019
1020
1021
1022
1023
1024static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1025 unsigned int prev)
1026{
1027 unsigned int max = si->max;
1028 unsigned int i = prev;
1029 unsigned char count;
1030
1031
1032
1033
1034
1035
1036
1037 for (;;) {
1038 if (++i >= max) {
1039 if (!prev) {
1040 i = 0;
1041 break;
1042 }
1043
1044
1045
1046
1047 max = prev + 1;
1048 prev = 0;
1049 i = 1;
1050 }
1051 count = si->swap_map[i];
1052 if (count && swap_count(count) != SWAP_MAP_BAD)
1053 break;
1054 }
1055 return i;
1056}
1057
1058
1059
1060
1061
1062
/*
 * Bring every in-use slot of swap area @type back into memory and remove
 * all references to it, so that the area can be disabled.
 *
 * For each slot reported by find_next_to_unuse() the page is read into the
 * swap cache, the swap ptes referring to it are replaced in the mms found
 * on the mmlist (or shmem_unuse() is called for SWAP_MAP_SHMEM slots), and
 * the page is then removed from the swap cache.
 *
 * Returns 0 on success, -EINTR if a signal is pending, -ENOMEM if a page
 * could not be read in, or an error from unuse_mm()/shmem_unuse().
 */
static int try_to_unuse(unsigned int type)
{
	struct swap_info_struct *si = swap_info[type];
	struct mm_struct *start_mm;
	unsigned char *swap_map;
	unsigned char swcount;
	struct page *page;
	swp_entry_t entry;
	unsigned int i = 0;
	int retval = 0;

	/*
	 * start_mm is the mm tried first for each slot.  Begin with init_mm
	 * and hold a reference on it; the reference on whatever start_mm ends
	 * up being is dropped by the mmput() at the end.
	 */
	start_mm = &init_mm;
	atomic_inc(&init_mm.mm_users);

	/* find_next_to_unuse() returns 0 once no used slot is left. */
	while ((i = find_next_to_unuse(si, i)) != 0) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}

		/* Read the slot's page into the swap cache. */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry,
					GFP_HIGHUSER_MOVABLE, NULL, 0);
		if (!page) {
			/*
			 * No page: if the slot has become free in the
			 * meantime that is fine, otherwise give up.
			 */
			if (!*swap_map)
				continue;
			retval = -ENOMEM;
			break;
		}

		/* We hold the only reference to start_mm: go back to init_mm. */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			atomic_inc(&init_mm.mm_users);
		}

		/* Wait for pending I/O, then lock the page and wait again. */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/* Remove the references to the slot, starting with start_mm. */
		swcount = *swap_map;
		if (swap_count(swcount) == SWAP_MAP_SHMEM) {
			retval = shmem_unuse(entry, page);
			/* NOTE(review): no unlock/release on this path; presumably shmem_unuse() does both - confirm. */
			if (retval < 0)
				break;
			continue;
		}
		if (swap_count(swcount) && start_mm != &init_mm)
			retval = unuse_mm(start_mm, entry, page);

		/* Still referenced: walk the other mms on the mmlist. */
		if (swap_count(*swap_map)) {
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *prev_mm = start_mm;
			struct mm_struct *mm;

			/* Extra references, dropped by the mmput() calls below. */
			atomic_inc(&new_start_mm->mm_users);
			atomic_inc(&prev_mm->mm_users);
			spin_lock(&mmlist_lock);
			while (swap_count(*swap_map) && !retval &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				/* Skip an mm whose user count already hit zero. */
				if (!atomic_inc_not_zero(&mm->mm_users))
					continue;
				spin_unlock(&mmlist_lock);
				mmput(prev_mm);
				prev_mm = mm;

				cond_resched();

				swcount = *swap_map;
				if (!swap_count(swcount)) /* any usage ? */
					;
				else if (mm == &init_mm)
					set_start_mm = 1;
				else
					retval = unuse_mm(mm, entry, page);

				/* The count dropped in this mm: start there next time. */
				if (set_start_mm && *swap_map < swcount) {
					mmput(new_start_mm);
					atomic_inc(&mm->mm_users);
					new_start_mm = mm;
					set_start_mm = 0;
				}
				spin_lock(&mmlist_lock);
			}
			spin_unlock(&mmlist_lock);
			mmput(prev_mm);
			mmput(start_mm);
			start_mm = new_start_mm;
		}
		if (retval) {
			unlock_page(page);
			page_cache_release(page);
			break;
		}

		/*
		 * The slot is still referenced and the page is dirty: write
		 * it out, then re-take the page lock and wait for writeback.
		 */
		if (swap_count(*swap_map) &&
		     PageDirty(page) && PageSwapCache(page)) {
			struct writeback_control wbc = {
				.sync_mode = WB_SYNC_NONE,
			};

			swap_writepage(page, &wbc);
			lock_page(page);
			wait_on_page_writeback(page);
		}

		/*
		 * Remove the page from the swap cache, but only if it still
		 * belongs to this entry.
		 */
		if (PageSwapCache(page) &&
		    likely(page_private(page) == entry.val))
			delete_from_swap_cache(page);

		/* Mark the page dirty before letting go of it. */
		SetPageDirty(page);
		unlock_page(page);
		page_cache_release(page);

		/* Give other tasks a chance to run between slots. */
		cond_resched();
	}

	mmput(start_mm);
	return retval;
}
1268
1269
1270
1271
1272
1273
1274
1275static void drain_mmlist(void)
1276{
1277 struct list_head *p, *next;
1278 unsigned int type;
1279
1280 for (type = 0; type < nr_swapfiles; type++)
1281 if (swap_info[type]->inuse_pages)
1282 return;
1283 spin_lock(&mmlist_lock);
1284 list_for_each_safe(p, next, &init_mm.mmlist)
1285 list_del_init(p);
1286 spin_unlock(&mmlist_lock);
1287}
1288
1289
1290
1291
1292
1293
1294
1295static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1296{
1297 struct swap_info_struct *sis;
1298 struct swap_extent *start_se;
1299 struct swap_extent *se;
1300 pgoff_t offset;
1301
1302 sis = swap_info[swp_type(entry)];
1303 *bdev = sis->bdev;
1304
1305 offset = swp_offset(entry);
1306 start_se = sis->curr_swap_extent;
1307 se = start_se;
1308
1309 for ( ; ; ) {
1310 struct list_head *lh;
1311
1312 if (se->start_page <= offset &&
1313 offset < (se->start_page + se->nr_pages)) {
1314 return se->start_block + (offset - se->start_page);
1315 }
1316 lh = se->list.next;
1317 se = list_entry(lh, struct swap_extent, list);
1318 sis->curr_swap_extent = se;
1319 BUG_ON(se == start_se);
1320 }
1321}
1322
1323
1324
1325
1326sector_t map_swap_page(struct page *page, struct block_device **bdev)
1327{
1328 swp_entry_t entry;
1329 entry.val = page_private(page);
1330 return map_swap_entry(entry, bdev);
1331}
1332
1333
1334
1335
1336static void destroy_swap_extents(struct swap_info_struct *sis)
1337{
1338 while (!list_empty(&sis->first_swap_extent.list)) {
1339 struct swap_extent *se;
1340
1341 se = list_entry(sis->first_swap_extent.list.next,
1342 struct swap_extent, list);
1343 list_del(&se->list);
1344 kfree(se);
1345 }
1346}
1347
1348
1349
1350
1351
1352
1353
1354static int
1355add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1356 unsigned long nr_pages, sector_t start_block)
1357{
1358 struct swap_extent *se;
1359 struct swap_extent *new_se;
1360 struct list_head *lh;
1361
1362 if (start_page == 0) {
1363 se = &sis->first_swap_extent;
1364 sis->curr_swap_extent = se;
1365 se->start_page = 0;
1366 se->nr_pages = nr_pages;
1367 se->start_block = start_block;
1368 return 1;
1369 } else {
1370 lh = sis->first_swap_extent.list.prev;
1371 se = list_entry(lh, struct swap_extent, list);
1372 BUG_ON(se->start_page + se->nr_pages != start_page);
1373 if (se->start_block + se->nr_pages == start_block) {
1374
1375 se->nr_pages += nr_pages;
1376 return 0;
1377 }
1378 }
1379
1380
1381
1382
1383 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
1384 if (new_se == NULL)
1385 return -ENOMEM;
1386 new_se->start_page = start_page;
1387 new_se->nr_pages = nr_pages;
1388 new_se->start_block = start_block;
1389
1390 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1391 return 1;
1392}
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
/*
 * Build the extent list that maps the swap pages of @sis to disk blocks.
 *
 * For a block device a single extent covering sis->max pages is added and
 * *@span is set to sis->pages.  For a regular file the file is probed with
 * bmap() one page at a time; only pages whose blocks are page-aligned and
 * contiguous on disk are used.  sis->max, sis->pages and sis->highest_bit
 * are then set from the number of usable pages, and *@span is set from the
 * lowest and highest block seen.
 *
 * Returns the number of extents, -EINVAL if bmap() reports an unmapped
 * block, or a negative error from add_swap_extent().
 */
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
	struct inode *inode;
	unsigned blocks_per_page;
	unsigned long page_no;
	unsigned blkbits;
	sector_t probe_block;
	sector_t last_block;
	sector_t lowest_block = -1;
	sector_t highest_block = 0;
	int nr_extents = 0;
	int ret;

	inode = sis->swap_file->f_mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		/* Block device: one extent starting at block 0. */
		ret = add_swap_extent(sis, 0, sis->max, 0);
		*span = sis->pages;
		goto out;
	}

	blkbits = inode->i_blkbits;
	blocks_per_page = PAGE_SIZE >> blkbits;

	/*
	 * Probe the file page by page, stopping at the end of the file or
	 * once sis->max pages have been mapped.
	 */
	probe_block = 0;
	page_no = 0;
	last_block = i_size_read(inode) >> blkbits;
	while ((probe_block + blocks_per_page) <= last_block &&
			page_no < sis->max) {
		unsigned block_in_page;
		sector_t first_block;

		first_block = bmap(inode, probe_block);
		if (first_block == 0)
			goto bad_bmap;

		/* The run must start on a page-aligned disk block. */
		if (first_block & (blocks_per_page - 1)) {
			probe_block++;
			goto reprobe;
		}

		for (block_in_page = 1; block_in_page < blocks_per_page;
					block_in_page++) {
			sector_t block;

			block = bmap(inode, probe_block + block_in_page);
			if (block == 0)
				goto bad_bmap;
			if (block != first_block + block_in_page) {
				/* Not contiguous on disk: retry one block later. */
				probe_block++;
				goto reprobe;
			}
		}

		/* Convert from filesystem blocks to page-sized blocks. */
		first_block >>= (PAGE_SHIFT - blkbits);
		if (page_no) {	/* page 0 is left out of the span calculation */
			if (first_block < lowest_block)
				lowest_block = first_block;
			if (first_block > highest_block)
				highest_block = first_block;
		}

		/* One usable page found: add it (or merge it) as an extent. */
		ret = add_swap_extent(sis, page_no, 1, first_block);
		if (ret < 0)
			goto out;
		nr_extents += ret;
		page_no++;
		probe_block += blocks_per_page;
reprobe:
		continue;
	}
	ret = nr_extents;
	*span = 1 + highest_block - lowest_block;
	if (page_no == 0)
		page_no = 1;	/* keeps the page_no - 1 below from wrapping */
	sis->max = page_no;
	sis->pages = page_no - 1;
	sis->highest_bit = page_no - 1;
out:
	return ret;
bad_bmap:
	printk(KERN_ERR "swapon: swapfile has holes\n");
	ret = -EINVAL;
	goto out;
}
1520
1521static void enable_swap_info(struct swap_info_struct *p, int prio,
1522 unsigned char *swap_map)
1523{
1524 int i, prev;
1525
1526 spin_lock(&swap_lock);
1527 if (prio >= 0)
1528 p->prio = prio;
1529 else
1530 p->prio = --least_priority;
1531 p->swap_map = swap_map;
1532 p->flags |= SWP_WRITEOK;
1533 nr_swap_pages += p->pages;
1534 total_swap_pages += p->pages;
1535
1536
1537 prev = -1;
1538 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1539 if (p->prio >= swap_info[i]->prio)
1540 break;
1541 prev = i;
1542 }
1543 p->next = i;
1544 if (prev < 0)
1545 swap_list.head = swap_list.next = p->type;
1546 else
1547 swap_info[prev]->next = p->type;
1548 spin_unlock(&swap_lock);
1549}
1550
1551SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1552{
1553 struct swap_info_struct *p = NULL;
1554 unsigned char *swap_map;
1555 struct file *swap_file, *victim;
1556 struct address_space *mapping;
1557 struct inode *inode;
1558 char *pathname;
1559 int oom_score_adj;
1560 int i, type, prev;
1561 int err;
1562
1563 if (!capable(CAP_SYS_ADMIN))
1564 return -EPERM;
1565
1566 pathname = getname(specialfile);
1567 err = PTR_ERR(pathname);
1568 if (IS_ERR(pathname))
1569 goto out;
1570
1571 victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
1572 putname(pathname);
1573 err = PTR_ERR(victim);
1574 if (IS_ERR(victim))
1575 goto out;
1576
1577 mapping = victim->f_mapping;
1578 prev = -1;
1579 spin_lock(&swap_lock);
1580 for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
1581 p = swap_info[type];
1582 if (p->flags & SWP_WRITEOK) {
1583 if (p->swap_file->f_mapping == mapping)
1584 break;
1585 }
1586 prev = type;
1587 }
1588 if (type < 0) {
1589 err = -EINVAL;
1590 spin_unlock(&swap_lock);
1591 goto out_dput;
1592 }
1593 if (!security_vm_enough_memory(p->pages))
1594 vm_unacct_memory(p->pages);
1595 else {
1596 err = -ENOMEM;
1597 spin_unlock(&swap_lock);
1598 goto out_dput;
1599 }
1600 if (prev < 0)
1601 swap_list.head = p->next;
1602 else
1603 swap_info[prev]->next = p->next;
1604 if (type == swap_list.next) {
1605
1606 swap_list.next = swap_list.head;
1607 }
1608 if (p->prio < 0) {
1609 for (i = p->next; i >= 0; i = swap_info[i]->next)
1610 swap_info[i]->prio = p->prio--;
1611 least_priority++;
1612 }
1613 nr_swap_pages -= p->pages;
1614 total_swap_pages -= p->pages;
1615 p->flags &= ~SWP_WRITEOK;
1616 spin_unlock(&swap_lock);
1617
1618 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1619 err = try_to_unuse(type);
1620 test_set_oom_score_adj(oom_score_adj);
1621
1622 if (err) {
1623
1624
1625
1626
1627
1628
1629
1630 enable_swap_info(p, p->prio, p->swap_map);
1631 goto out_dput;
1632 }
1633
1634 destroy_swap_extents(p);
1635 if (p->flags & SWP_CONTINUED)
1636 free_swap_count_continuations(p);
1637
1638 mutex_lock(&swapon_mutex);
1639 spin_lock(&swap_lock);
1640 drain_mmlist();
1641
1642
1643 p->highest_bit = 0;
1644 while (p->flags >= SWP_SCANNING) {
1645 spin_unlock(&swap_lock);
1646 schedule_timeout_uninterruptible(1);
1647 spin_lock(&swap_lock);
1648 }
1649
1650 swap_file = p->swap_file;
1651 p->swap_file = NULL;
1652 p->max = 0;
1653 swap_map = p->swap_map;
1654 p->swap_map = NULL;
1655 p->flags = 0;
1656 spin_unlock(&swap_lock);
1657 mutex_unlock(&swapon_mutex);
1658 vfree(swap_map);
1659
1660 swap_cgroup_swapoff(type);
1661
1662 inode = mapping->host;
1663 if (S_ISBLK(inode->i_mode)) {
1664 struct block_device *bdev = I_BDEV(inode);
1665 set_blocksize(bdev, p->old_block_size);
1666 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1667 } else {
1668 mutex_lock(&inode->i_mutex);
1669 inode->i_flags &= ~S_SWAPFILE;
1670 mutex_unlock(&inode->i_mutex);
1671 }
1672 filp_close(swap_file, NULL);
1673 err = 0;
1674 atomic_inc(&proc_poll_event);
1675 wake_up_interruptible(&proc_poll_wait);
1676
1677out_dput:
1678 filp_close(victim, NULL);
1679out:
1680 return err;
1681}
1682
1683#ifdef CONFIG_PROC_FS
1684struct proc_swaps {
1685 struct seq_file seq;
1686 int event;
1687};
1688
1689static unsigned swaps_poll(struct file *file, poll_table *wait)
1690{
1691 struct proc_swaps *s = file->private_data;
1692
1693 poll_wait(file, &proc_poll_wait, wait);
1694
1695 if (s->event != atomic_read(&proc_poll_event)) {
1696 s->event = atomic_read(&proc_poll_event);
1697 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1698 }
1699
1700 return POLLIN | POLLRDNORM;
1701}
1702
1703
1704static void *swap_start(struct seq_file *swap, loff_t *pos)
1705{
1706 struct swap_info_struct *si;
1707 int type;
1708 loff_t l = *pos;
1709
1710 mutex_lock(&swapon_mutex);
1711
1712 if (!l)
1713 return SEQ_START_TOKEN;
1714
1715 for (type = 0; type < nr_swapfiles; type++) {
1716 smp_rmb();
1717 si = swap_info[type];
1718 if (!(si->flags & SWP_USED) || !si->swap_map)
1719 continue;
1720 if (!--l)
1721 return si;
1722 }
1723
1724 return NULL;
1725}
1726
1727static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1728{
1729 struct swap_info_struct *si = v;
1730 int type;
1731
1732 if (v == SEQ_START_TOKEN)
1733 type = 0;
1734 else
1735 type = si->type + 1;
1736
1737 for (; type < nr_swapfiles; type++) {
1738 smp_rmb();
1739 si = swap_info[type];
1740 if (!(si->flags & SWP_USED) || !si->swap_map)
1741 continue;
1742 ++*pos;
1743 return si;
1744 }
1745
1746 return NULL;
1747}
1748
1749static void swap_stop(struct seq_file *swap, void *v)
1750{
1751 mutex_unlock(&swapon_mutex);
1752}
1753
1754static int swap_show(struct seq_file *swap, void *v)
1755{
1756 struct swap_info_struct *si = v;
1757 struct file *file;
1758 int len;
1759
1760 if (si == SEQ_START_TOKEN) {
1761 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1762 return 0;
1763 }
1764
1765 file = si->swap_file;
1766 len = seq_path(swap, &file->f_path, " \t\n\\");
1767 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1768 len < 40 ? 40 - len : 1, " ",
1769 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1770 "partition" : "file\t",
1771 si->pages << (PAGE_SHIFT - 10),
1772 si->inuse_pages << (PAGE_SHIFT - 10),
1773 si->prio);
1774 return 0;
1775}
1776
1777static const struct seq_operations swaps_op = {
1778 .start = swap_start,
1779 .next = swap_next,
1780 .stop = swap_stop,
1781 .show = swap_show
1782};
1783
1784static int swaps_open(struct inode *inode, struct file *file)
1785{
1786 struct proc_swaps *s;
1787 int ret;
1788
1789 s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
1790 if (!s)
1791 return -ENOMEM;
1792
1793 file->private_data = s;
1794
1795 ret = seq_open(file, &swaps_op);
1796 if (ret) {
1797 kfree(s);
1798 return ret;
1799 }
1800
1801 s->seq.private = s;
1802 s->event = atomic_read(&proc_poll_event);
1803 return ret;
1804}
1805
1806static const struct file_operations proc_swaps_operations = {
1807 .open = swaps_open,
1808 .read = seq_read,
1809 .llseek = seq_lseek,
1810 .release = seq_release,
1811 .poll = swaps_poll,
1812};
1813
1814static int __init procswaps_init(void)
1815{
1816 proc_create("swaps", 0, NULL, &proc_swaps_operations);
1817 return 0;
1818}
1819__initcall(procswaps_init);
1820#endif
1821
1822#ifdef MAX_SWAPFILES_CHECK
1823static int __init max_swapfiles_check(void)
1824{
1825 MAX_SWAPFILES_CHECK();
1826 return 0;
1827}
1828late_initcall(max_swapfiles_check);
1829#endif
1830
1831static struct swap_info_struct *alloc_swap_info(void)
1832{
1833 struct swap_info_struct *p;
1834 unsigned int type;
1835
1836 p = kzalloc(sizeof(*p), GFP_KERNEL);
1837 if (!p)
1838 return ERR_PTR(-ENOMEM);
1839
1840 spin_lock(&swap_lock);
1841 for (type = 0; type < nr_swapfiles; type++) {
1842 if (!(swap_info[type]->flags & SWP_USED))
1843 break;
1844 }
1845 if (type >= MAX_SWAPFILES) {
1846 spin_unlock(&swap_lock);
1847 kfree(p);
1848 return ERR_PTR(-EPERM);
1849 }
1850 if (type >= nr_swapfiles) {
1851 p->type = type;
1852 swap_info[type] = p;
1853
1854
1855
1856
1857
1858 smp_wmb();
1859 nr_swapfiles++;
1860 } else {
1861 kfree(p);
1862 p = swap_info[type];
1863
1864
1865
1866
1867 }
1868 INIT_LIST_HEAD(&p->first_swap_extent.list);
1869 p->flags = SWP_USED;
1870 p->next = -1;
1871 spin_unlock(&swap_lock);
1872
1873 return p;
1874}
1875
1876static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
1877{
1878 int error;
1879
1880 if (S_ISBLK(inode->i_mode)) {
1881 p->bdev = bdgrab(I_BDEV(inode));
1882 error = blkdev_get(p->bdev,
1883 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1884 sys_swapon);
1885 if (error < 0) {
1886 p->bdev = NULL;
1887 return -EINVAL;
1888 }
1889 p->old_block_size = block_size(p->bdev);
1890 error = set_blocksize(p->bdev, PAGE_SIZE);
1891 if (error < 0)
1892 return error;
1893 p->flags |= SWP_BLKDEV;
1894 } else if (S_ISREG(inode->i_mode)) {
1895 p->bdev = inode->i_sb->s_bdev;
1896 mutex_lock(&inode->i_mutex);
1897 if (IS_SWAPFILE(inode))
1898 return -EBUSY;
1899 } else
1900 return -EINVAL;
1901
1902 return 0;
1903}
1904
1905static unsigned long read_swap_header(struct swap_info_struct *p,
1906 union swap_header *swap_header,
1907 struct inode *inode)
1908{
1909 int i;
1910 unsigned long maxpages;
1911 unsigned long swapfilepages;
1912
1913 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1914 printk(KERN_ERR "Unable to find swap-space signature\n");
1915 return 0;
1916 }
1917
1918
1919 if (swab32(swap_header->info.version) == 1) {
1920 swab32s(&swap_header->info.version);
1921 swab32s(&swap_header->info.last_page);
1922 swab32s(&swap_header->info.nr_badpages);
1923 for (i = 0; i < swap_header->info.nr_badpages; i++)
1924 swab32s(&swap_header->info.badpages[i]);
1925 }
1926
1927 if (swap_header->info.version != 1) {
1928 printk(KERN_WARNING
1929 "Unable to handle swap header version %d\n",
1930 swap_header->info.version);
1931 return 0;
1932 }
1933
1934 p->lowest_bit = 1;
1935 p->cluster_next = 1;
1936 p->cluster_nr = 0;
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952 maxpages = swp_offset(pte_to_swp_entry(
1953 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1954 if (maxpages > swap_header->info.last_page) {
1955 maxpages = swap_header->info.last_page + 1;
1956
1957 if ((unsigned int)maxpages == 0)
1958 maxpages = UINT_MAX;
1959 }
1960 p->highest_bit = maxpages - 1;
1961
1962 if (!maxpages)
1963 return 0;
1964 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1965 if (swapfilepages && maxpages > swapfilepages) {
1966 printk(KERN_WARNING
1967 "Swap area shorter than signature indicates\n");
1968 return 0;
1969 }
1970 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1971 return 0;
1972 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1973 return 0;
1974
1975 return maxpages;
1976}
1977
1978static int setup_swap_map_and_extents(struct swap_info_struct *p,
1979 union swap_header *swap_header,
1980 unsigned char *swap_map,
1981 unsigned long maxpages,
1982 sector_t *span)
1983{
1984 int i;
1985 unsigned int nr_good_pages;
1986 int nr_extents;
1987
1988 nr_good_pages = maxpages - 1;
1989
1990 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1991 unsigned int page_nr = swap_header->info.badpages[i];
1992 if (page_nr == 0 || page_nr > swap_header->info.last_page)
1993 return -EINVAL;
1994 if (page_nr < maxpages) {
1995 swap_map[page_nr] = SWAP_MAP_BAD;
1996 nr_good_pages--;
1997 }
1998 }
1999
2000 if (nr_good_pages) {
2001 swap_map[0] = SWAP_MAP_BAD;
2002 p->max = maxpages;
2003 p->pages = nr_good_pages;
2004 nr_extents = setup_swap_extents(p, span);
2005 if (nr_extents < 0)
2006 return nr_extents;
2007 nr_good_pages = p->pages;
2008 }
2009 if (!nr_good_pages) {
2010 printk(KERN_WARNING "Empty swap-file\n");
2011 return -EINVAL;
2012 }
2013
2014 return nr_extents;
2015}
2016
2017SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2018{
2019 struct swap_info_struct *p;
2020 char *name;
2021 struct file *swap_file = NULL;
2022 struct address_space *mapping;
2023 int i;
2024 int prio;
2025 int error;
2026 union swap_header *swap_header;
2027 int nr_extents;
2028 sector_t span;
2029 unsigned long maxpages;
2030 unsigned char *swap_map = NULL;
2031 struct page *page = NULL;
2032 struct inode *inode = NULL;
2033
2034 if (!capable(CAP_SYS_ADMIN))
2035 return -EPERM;
2036
2037 p = alloc_swap_info();
2038 if (IS_ERR(p))
2039 return PTR_ERR(p);
2040
2041 name = getname(specialfile);
2042 if (IS_ERR(name)) {
2043 error = PTR_ERR(name);
2044 name = NULL;
2045 goto bad_swap;
2046 }
2047 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
2048 if (IS_ERR(swap_file)) {
2049 error = PTR_ERR(swap_file);
2050 swap_file = NULL;
2051 goto bad_swap;
2052 }
2053
2054 p->swap_file = swap_file;
2055 mapping = swap_file->f_mapping;
2056
2057 for (i = 0; i < nr_swapfiles; i++) {
2058 struct swap_info_struct *q = swap_info[i];
2059
2060 if (q == p || !q->swap_file)
2061 continue;
2062 if (mapping == q->swap_file->f_mapping) {
2063 error = -EBUSY;
2064 goto bad_swap;
2065 }
2066 }
2067
2068 inode = mapping->host;
2069
2070 error = claim_swapfile(p, inode);
2071 if (unlikely(error))
2072 goto bad_swap;
2073
2074
2075
2076
2077 if (!mapping->a_ops->readpage) {
2078 error = -EINVAL;
2079 goto bad_swap;
2080 }
2081 page = read_mapping_page(mapping, 0, swap_file);
2082 if (IS_ERR(page)) {
2083 error = PTR_ERR(page);
2084 goto bad_swap;
2085 }
2086 swap_header = kmap(page);
2087
2088 maxpages = read_swap_header(p, swap_header, inode);
2089 if (unlikely(!maxpages)) {
2090 error = -EINVAL;
2091 goto bad_swap;
2092 }
2093
2094
2095 swap_map = vzalloc(maxpages);
2096 if (!swap_map) {
2097 error = -ENOMEM;
2098 goto bad_swap;
2099 }
2100
2101 error = swap_cgroup_swapon(p->type, maxpages);
2102 if (error)
2103 goto bad_swap;
2104
2105 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2106 maxpages, &span);
2107 if (unlikely(nr_extents < 0)) {
2108 error = nr_extents;
2109 goto bad_swap;
2110 }
2111
2112 if (p->bdev) {
2113 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2114 p->flags |= SWP_SOLIDSTATE;
2115 p->cluster_next = 1 + (random32() % p->highest_bit);
2116 }
2117 if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
2118 p->flags |= SWP_DISCARDABLE;
2119 }
2120
2121 mutex_lock(&swapon_mutex);
2122 prio = -1;
2123 if (swap_flags & SWAP_FLAG_PREFER)
2124 prio =
2125 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2126 enable_swap_info(p, prio, swap_map);
2127
2128 printk(KERN_INFO "Adding %uk swap on %s. "
2129 "Priority:%d extents:%d across:%lluk %s%s\n",
2130 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2131 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2132 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2133 (p->flags & SWP_DISCARDABLE) ? "D" : "");
2134
2135 mutex_unlock(&swapon_mutex);
2136 atomic_inc(&proc_poll_event);
2137 wake_up_interruptible(&proc_poll_wait);
2138
2139 if (S_ISREG(inode->i_mode))
2140 inode->i_flags |= S_SWAPFILE;
2141 error = 0;
2142 goto out;
2143bad_swap:
2144 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2145 set_blocksize(p->bdev, p->old_block_size);
2146 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2147 }
2148 destroy_swap_extents(p);
2149 swap_cgroup_swapoff(p->type);
2150 spin_lock(&swap_lock);
2151 p->swap_file = NULL;
2152 p->flags = 0;
2153 spin_unlock(&swap_lock);
2154 vfree(swap_map);
2155 if (swap_file) {
2156 if (inode && S_ISREG(inode->i_mode)) {
2157 mutex_unlock(&inode->i_mutex);
2158 inode = NULL;
2159 }
2160 filp_close(swap_file, NULL);
2161 }
2162out:
2163 if (page && !IS_ERR(page)) {
2164 kunmap(page);
2165 page_cache_release(page);
2166 }
2167 if (name)
2168 putname(name);
2169 if (inode && S_ISREG(inode->i_mode))
2170 mutex_unlock(&inode->i_mutex);
2171 return error;
2172}
2173
2174void si_swapinfo(struct sysinfo *val)
2175{
2176 unsigned int type;
2177 unsigned long nr_to_be_unused = 0;
2178
2179 spin_lock(&swap_lock);
2180 for (type = 0; type < nr_swapfiles; type++) {
2181 struct swap_info_struct *si = swap_info[type];
2182
2183 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2184 nr_to_be_unused += si->inuse_pages;
2185 }
2186 val->freeswap = nr_swap_pages + nr_to_be_unused;
2187 val->totalswap = total_swap_pages + nr_to_be_unused;
2188 spin_unlock(&swap_lock);
2189}
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2203{
2204 struct swap_info_struct *p;
2205 unsigned long offset, type;
2206 unsigned char count;
2207 unsigned char has_cache;
2208 int err = -EINVAL;
2209
2210 if (non_swap_entry(entry))
2211 goto out;
2212
2213 type = swp_type(entry);
2214 if (type >= nr_swapfiles)
2215 goto bad_file;
2216 p = swap_info[type];
2217 offset = swp_offset(entry);
2218
2219 spin_lock(&swap_lock);
2220 if (unlikely(offset >= p->max))
2221 goto unlock_out;
2222
2223 count = p->swap_map[offset];
2224 has_cache = count & SWAP_HAS_CACHE;
2225 count &= ~SWAP_HAS_CACHE;
2226 err = 0;
2227
2228 if (usage == SWAP_HAS_CACHE) {
2229
2230
2231 if (!has_cache && count)
2232 has_cache = SWAP_HAS_CACHE;
2233 else if (has_cache)
2234 err = -EEXIST;
2235 else
2236 err = -ENOENT;
2237
2238 } else if (count || has_cache) {
2239
2240 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2241 count += usage;
2242 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2243 err = -EINVAL;
2244 else if (swap_count_continued(p, offset, count))
2245 count = COUNT_CONTINUED;
2246 else
2247 err = -ENOMEM;
2248 } else
2249 err = -ENOENT;
2250
2251 p->swap_map[offset] = count | has_cache;
2252
2253unlock_out:
2254 spin_unlock(&swap_lock);
2255out:
2256 return err;
2257
2258bad_file:
2259 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
2260 goto out;
2261}
2262
2263
2264
2265
2266
2267void swap_shmem_alloc(swp_entry_t entry)
2268{
2269 __swap_duplicate(entry, SWAP_MAP_SHMEM);
2270}
2271
2272
2273
2274
2275
2276
2277
2278
2279int swap_duplicate(swp_entry_t entry)
2280{
2281 int err = 0;
2282
2283 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2284 err = add_swap_count_continuation(entry, GFP_ATOMIC);
2285 return err;
2286}
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296int swapcache_prepare(swp_entry_t entry)
2297{
2298 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2299}
2300
2301
2302
2303
2304
2305int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2306{
2307 struct swap_info_struct *si;
2308 int our_page_cluster = page_cluster;
2309 pgoff_t target, toff;
2310 pgoff_t base, end;
2311 int nr_pages = 0;
2312
2313 if (!our_page_cluster)
2314 return 0;
2315
2316 si = swap_info[swp_type(entry)];
2317 target = swp_offset(entry);
2318 base = (target >> our_page_cluster) << our_page_cluster;
2319 end = base + (1 << our_page_cluster);
2320 if (!base)
2321 base++;
2322
2323 spin_lock(&swap_lock);
2324 if (end > si->max)
2325 end = si->max;
2326
2327
2328 for (toff = target; ++toff < end; nr_pages++) {
2329
2330 if (!si->swap_map[toff])
2331 break;
2332 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2333 break;
2334 }
2335
2336 for (toff = target; --toff >= base; nr_pages++) {
2337
2338 if (!si->swap_map[toff])
2339 break;
2340 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2341 break;
2342 }
2343 spin_unlock(&swap_lock);
2344
2345
2346
2347
2348
2349 *offset = ++toff;
2350 return nr_pages? ++nr_pages: 0;
2351}
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2369{
2370 struct swap_info_struct *si;
2371 struct page *head;
2372 struct page *page;
2373 struct page *list_page;
2374 pgoff_t offset;
2375 unsigned char count;
2376
2377
2378
2379
2380
2381 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
2382
2383 si = swap_info_get(entry);
2384 if (!si) {
2385
2386
2387
2388
2389
2390 goto outer;
2391 }
2392
2393 offset = swp_offset(entry);
2394 count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
2395
2396 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
2397
2398
2399
2400
2401
2402 goto out;
2403 }
2404
2405 if (!page) {
2406 spin_unlock(&swap_lock);
2407 return -ENOMEM;
2408 }
2409
2410
2411
2412
2413
2414
2415 head = vmalloc_to_page(si->swap_map + offset);
2416 offset &= ~PAGE_MASK;
2417
2418
2419
2420
2421
2422 if (!page_private(head)) {
2423 BUG_ON(count & COUNT_CONTINUED);
2424 INIT_LIST_HEAD(&head->lru);
2425 set_page_private(head, SWP_CONTINUED);
2426 si->flags |= SWP_CONTINUED;
2427 }
2428
2429 list_for_each_entry(list_page, &head->lru, lru) {
2430 unsigned char *map;
2431
2432
2433
2434
2435
2436 if (!(count & COUNT_CONTINUED))
2437 goto out;
2438
2439 map = kmap_atomic(list_page, KM_USER0) + offset;
2440 count = *map;
2441 kunmap_atomic(map, KM_USER0);
2442
2443
2444
2445
2446
2447 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
2448 goto out;
2449 }
2450
2451 list_add_tail(&page->lru, &head->lru);
2452 page = NULL;
2453out:
2454 spin_unlock(&swap_lock);
2455outer:
2456 if (page)
2457 __free_page(page);
2458 return 0;
2459}
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469static bool swap_count_continued(struct swap_info_struct *si,
2470 pgoff_t offset, unsigned char count)
2471{
2472 struct page *head;
2473 struct page *page;
2474 unsigned char *map;
2475
2476 head = vmalloc_to_page(si->swap_map + offset);
2477 if (page_private(head) != SWP_CONTINUED) {
2478 BUG_ON(count & COUNT_CONTINUED);
2479 return false;
2480 }
2481
2482 offset &= ~PAGE_MASK;
2483 page = list_entry(head->lru.next, struct page, lru);
2484 map = kmap_atomic(page, KM_USER0) + offset;
2485
2486 if (count == SWAP_MAP_MAX)
2487 goto init_map;
2488
2489 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) {
2490
2491
2492
2493 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2494 kunmap_atomic(map, KM_USER0);
2495 page = list_entry(page->lru.next, struct page, lru);
2496 BUG_ON(page == head);
2497 map = kmap_atomic(page, KM_USER0) + offset;
2498 }
2499 if (*map == SWAP_CONT_MAX) {
2500 kunmap_atomic(map, KM_USER0);
2501 page = list_entry(page->lru.next, struct page, lru);
2502 if (page == head)
2503 return false;
2504 map = kmap_atomic(page, KM_USER0) + offset;
2505init_map: *map = 0;
2506 }
2507 *map += 1;
2508 kunmap_atomic(map, KM_USER0);
2509 page = list_entry(page->lru.prev, struct page, lru);
2510 while (page != head) {
2511 map = kmap_atomic(page, KM_USER0) + offset;
2512 *map = COUNT_CONTINUED;
2513 kunmap_atomic(map, KM_USER0);
2514 page = list_entry(page->lru.prev, struct page, lru);
2515 }
2516 return true;
2517
2518 } else {
2519
2520
2521
2522 BUG_ON(count != COUNT_CONTINUED);
2523 while (*map == COUNT_CONTINUED) {
2524 kunmap_atomic(map, KM_USER0);
2525 page = list_entry(page->lru.next, struct page, lru);
2526 BUG_ON(page == head);
2527 map = kmap_atomic(page, KM_USER0) + offset;
2528 }
2529 BUG_ON(*map == 0);
2530 *map -= 1;
2531 if (*map == 0)
2532 count = 0;
2533 kunmap_atomic(map, KM_USER0);
2534 page = list_entry(page->lru.prev, struct page, lru);
2535 while (page != head) {
2536 map = kmap_atomic(page, KM_USER0) + offset;
2537 *map = SWAP_CONT_MAX | count;
2538 count = COUNT_CONTINUED;
2539 kunmap_atomic(map, KM_USER0);
2540 page = list_entry(page->lru.prev, struct page, lru);
2541 }
2542 return count == COUNT_CONTINUED;
2543 }
2544}
2545
2546
2547
2548
2549
2550static void free_swap_count_continuations(struct swap_info_struct *si)
2551{
2552 pgoff_t offset;
2553
2554 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2555 struct page *head;
2556 head = vmalloc_to_page(si->swap_map + offset);
2557 if (page_private(head)) {
2558 struct list_head *this, *next;
2559 list_for_each_safe(this, next, &head->lru) {
2560 struct page *page;
2561 page = list_entry(this, struct page, lru);
2562 list_del(this);
2563 __free_page(page);
2564 }
2565 }
2566 }
2567}
2568