1
2
3
4
5
6
7
8#include <linux/mm.h>
9#include <linux/sched/mm.h>
10#include <linux/sched/task.h>
11#include <linux/hugetlb.h>
12#include <linux/mman.h>
13#include <linux/slab.h>
14#include <linux/kernel_stat.h>
15#include <linux/swap.h>
16#include <linux/vmalloc.h>
17#include <linux/pagemap.h>
18#include <linux/namei.h>
19#include <linux/shmem_fs.h>
20#include <linux/blkdev.h>
21#include <linux/random.h>
22#include <linux/writeback.h>
23#include <linux/proc_fs.h>
24#include <linux/seq_file.h>
25#include <linux/init.h>
26#include <linux/ksm.h>
27#include <linux/rmap.h>
28#include <linux/security.h>
29#include <linux/backing-dev.h>
30#include <linux/mutex.h>
31#include <linux/capability.h>
32#include <linux/syscalls.h>
33#include <linux/memcontrol.h>
34#include <linux/poll.h>
35#include <linux/oom.h>
36#include <linux/frontswap.h>
37#include <linux/swapfile.h>
38#include <linux/export.h>
39#include <linux/swap_slots.h>
40#include <linux/sort.h>
41
42#include <asm/pgtable.h>
43#include <asm/tlbflush.h>
44#include <linux/swapops.h>
45#include <linux/swap_cgroup.h>
46
/*
 * Forward declarations for static helpers that are used in this part of
 * the file before their definitions appear.
 */
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);
51
DEFINE_SPINLOCK(swap_lock);
/* Swap types >= nr_swapfiles are rejected as "Bad swap file entry". */
static unsigned int nr_swapfiles;
/*
 * Count of free swap pages.  It is decremented up front when slots are
 * requested and credited back for slots that could not be allocated or
 * that are later freed.
 */
atomic_long_t nr_swap_pages;
EXPORT_SYMBOL_GPL(nr_swap_pages);

long total_swap_pages;
static int least_priority = -1;

/* Message fragments for the pr_err() diagnostics on invalid swap entries. */
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

PLIST_HEAD(swap_active_head);

/*
 * Array of priority lists, indexed by NUMA node id, holding the swap
 * devices that slots can currently be allocated from.  The lists are
 * modified under swap_avail_lock.
 */
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);

/* Swap device descriptors, indexed by swap type. */
struct swap_info_struct *swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);

static atomic_t proc_poll_event = ATOMIC_INIT(0);

atomic_t nr_rotate_swap = ATOMIC_INIT(0);
100
101static inline unsigned char swap_count(unsigned char ent)
102{
103 return ent & ~SWAP_HAS_CACHE;
104}
105
106
107static int
108__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
109{
110 swp_entry_t entry = swp_entry(si->type, offset);
111 struct page *page;
112 int ret = 0;
113
114 page = find_get_page(swap_address_space(entry), swp_offset(entry));
115 if (!page)
116 return 0;
117
118
119
120
121
122
123
124 if (trylock_page(page)) {
125 ret = try_to_free_swap(page);
126 unlock_page(page);
127 }
128 put_page(page);
129 return ret;
130}
131
132
133
134
135
136static int discard_swap(struct swap_info_struct *si)
137{
138 struct swap_extent *se;
139 sector_t start_block;
140 sector_t nr_blocks;
141 int err = 0;
142
143
144 se = &si->first_swap_extent;
145 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
146 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
147 if (nr_blocks) {
148 err = blkdev_issue_discard(si->bdev, start_block,
149 nr_blocks, GFP_KERNEL, 0);
150 if (err)
151 return err;
152 cond_resched();
153 }
154
155 list_for_each_entry(se, &si->first_swap_extent.list, list) {
156 start_block = se->start_block << (PAGE_SHIFT - 9);
157 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
158
159 err = blkdev_issue_discard(si->bdev, start_block,
160 nr_blocks, GFP_KERNEL, 0);
161 if (err)
162 break;
163
164 cond_resched();
165 }
166 return err;
167}
168
169
170
171
172
/*
 * Discard the swap page range [start_page, start_page + nr_pages) of @si.
 *
 * The page range is translated to disk blocks through the extent list.
 * The walk starts at si->curr_swap_extent and moves forward with
 * list_next_entry() until every page has been handed to
 * blkdev_issue_discard() or a discard call fails.  Errors are not
 * reported to the caller.
 */
static void discard_swap_cluster(struct swap_info_struct *si,
				 pgoff_t start_page, pgoff_t nr_pages)
{
	struct swap_extent *se = si->curr_swap_extent;
	int found_extent = 0;

	while (nr_pages) {
		if (se->start_page <= start_page &&
		    start_page < se->start_page + se->nr_pages) {
			/* start_page lies inside this extent. */
			pgoff_t offset = start_page - se->start_page;
			sector_t start_block = se->start_block + offset;
			sector_t nr_blocks = se->nr_pages - offset;

			/* Clamp to what is still left to discard. */
			if (nr_blocks > nr_pages)
				nr_blocks = nr_pages;
			start_page += nr_blocks;
			nr_pages -= nr_blocks;

			/* Remember the first matching extent for next time. */
			if (!found_extent++)
				si->curr_swap_extent = se;

			/* Convert from pages to 512-byte units. */
			start_block <<= PAGE_SHIFT - 9;
			nr_blocks <<= PAGE_SHIFT - 9;
			if (blkdev_issue_discard(si->bdev, start_block,
						 nr_blocks, GFP_NOIO, 0))
				break;
		}

		se = list_next_entry(se, list);
	}
}
204
/*
 * SWAPFILE_CLUSTER: number of swap slots grouped into one cluster.  With
 * CONFIG_THP_SWAP it equals HPAGE_PMD_NR (presumably the number of base
 * pages in a PMD-sized huge page); otherwise it is 256.
 */
#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER HPAGE_PMD_NR
#else
#define SWAPFILE_CLUSTER 256
#endif
/* Iteration budget between cond_resched() calls in the swap_map scanner. */
#define LATENCY_LIMIT 256
211
212static inline void cluster_set_flag(struct swap_cluster_info *info,
213 unsigned int flag)
214{
215 info->flags = flag;
216}
217
218static inline unsigned int cluster_count(struct swap_cluster_info *info)
219{
220 return info->data;
221}
222
223static inline void cluster_set_count(struct swap_cluster_info *info,
224 unsigned int c)
225{
226 info->data = c;
227}
228
229static inline void cluster_set_count_flag(struct swap_cluster_info *info,
230 unsigned int c, unsigned int f)
231{
232 info->flags = f;
233 info->data = c;
234}
235
236static inline unsigned int cluster_next(struct swap_cluster_info *info)
237{
238 return info->data;
239}
240
241static inline void cluster_set_next(struct swap_cluster_info *info,
242 unsigned int n)
243{
244 info->data = n;
245}
246
247static inline void cluster_set_next_flag(struct swap_cluster_info *info,
248 unsigned int n, unsigned int f)
249{
250 info->flags = f;
251 info->data = n;
252}
253
254static inline bool cluster_is_free(struct swap_cluster_info *info)
255{
256 return info->flags & CLUSTER_FLAG_FREE;
257}
258
259static inline bool cluster_is_null(struct swap_cluster_info *info)
260{
261 return info->flags & CLUSTER_FLAG_NEXT_NULL;
262}
263
264static inline void cluster_set_null(struct swap_cluster_info *info)
265{
266 info->flags = CLUSTER_FLAG_NEXT_NULL;
267 info->data = 0;
268}
269
270static inline bool cluster_is_huge(struct swap_cluster_info *info)
271{
272 return info->flags & CLUSTER_FLAG_HUGE;
273}
274
275static inline void cluster_clear_huge(struct swap_cluster_info *info)
276{
277 info->flags &= ~CLUSTER_FLAG_HUGE;
278}
279
280static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
281 unsigned long offset)
282{
283 struct swap_cluster_info *ci;
284
285 ci = si->cluster_info;
286 if (ci) {
287 ci += offset / SWAPFILE_CLUSTER;
288 spin_lock(&ci->lock);
289 }
290 return ci;
291}
292
293static inline void unlock_cluster(struct swap_cluster_info *ci)
294{
295 if (ci)
296 spin_unlock(&ci->lock);
297}
298
299static inline struct swap_cluster_info *lock_cluster_or_swap_info(
300 struct swap_info_struct *si,
301 unsigned long offset)
302{
303 struct swap_cluster_info *ci;
304
305 ci = lock_cluster(si, offset);
306 if (!ci)
307 spin_lock(&si->lock);
308
309 return ci;
310}
311
312static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
313 struct swap_cluster_info *ci)
314{
315 if (ci)
316 unlock_cluster(ci);
317 else
318 spin_unlock(&si->lock);
319}
320
321static inline bool cluster_list_empty(struct swap_cluster_list *list)
322{
323 return cluster_is_null(&list->head);
324}
325
326static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
327{
328 return cluster_next(&list->head);
329}
330
331static void cluster_list_init(struct swap_cluster_list *list)
332{
333 cluster_set_null(&list->head);
334 cluster_set_null(&list->tail);
335}
336
337static void cluster_list_add_tail(struct swap_cluster_list *list,
338 struct swap_cluster_info *ci,
339 unsigned int idx)
340{
341 if (cluster_list_empty(list)) {
342 cluster_set_next_flag(&list->head, idx, 0);
343 cluster_set_next_flag(&list->tail, idx, 0);
344 } else {
345 struct swap_cluster_info *ci_tail;
346 unsigned int tail = cluster_next(&list->tail);
347
348
349
350
351
352 ci_tail = ci + tail;
353 spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
354 cluster_set_next(ci_tail, idx);
355 spin_unlock(&ci_tail->lock);
356 cluster_set_next_flag(&list->tail, idx, 0);
357 }
358}
359
360static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
361 struct swap_cluster_info *ci)
362{
363 unsigned int idx;
364
365 idx = cluster_next(&list->head);
366 if (cluster_next(&list->tail) == idx) {
367 cluster_set_null(&list->head);
368 cluster_set_null(&list->tail);
369 } else
370 cluster_set_next_flag(&list->head,
371 cluster_next(&ci[idx]), 0);
372
373 return idx;
374}
375
376
377static void swap_cluster_schedule_discard(struct swap_info_struct *si,
378 unsigned int idx)
379{
380
381
382
383
384
385
386 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
387 SWAP_MAP_BAD, SWAPFILE_CLUSTER);
388
389 cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
390
391 schedule_work(&si->discard_work);
392}
393
394static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
395{
396 struct swap_cluster_info *ci = si->cluster_info;
397
398 cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
399 cluster_list_add_tail(&si->free_clusters, ci, idx);
400}
401
402
403
404
405
/*
 * Process every cluster queued on si->discard_clusters: issue the discard,
 * then put the cluster on the free list and reset its swap_map bytes to 0.
 *
 * Called with si->lock held.  The lock is dropped around
 * discard_swap_cluster() and re-taken afterwards, so the discard list is
 * re-checked on every iteration.
 */
static void swap_do_scheduled_discard(struct swap_info_struct *si)
{
	struct swap_cluster_info *info, *ci;
	unsigned int idx;

	info = si->cluster_info;

	while (!cluster_list_empty(&si->discard_clusters)) {
		idx = cluster_list_del_first(&si->discard_clusters, info);
		/* Do not hold si->lock across the block-layer call. */
		spin_unlock(&si->lock);

		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
				     SWAPFILE_CLUSTER);

		spin_lock(&si->lock);
		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
		__free_cluster(si, idx);
		/* Replace the SWAP_MAP_BAD markers set when the discard was queued. */
		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
		       0, SWAPFILE_CLUSTER);
		unlock_cluster(ci);
	}
}
428
429static void swap_discard_work(struct work_struct *work)
430{
431 struct swap_info_struct *si;
432
433 si = container_of(work, struct swap_info_struct, discard_work);
434
435 spin_lock(&si->lock);
436 swap_do_scheduled_discard(si);
437 spin_unlock(&si->lock);
438}
439
440static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
441{
442 struct swap_cluster_info *ci = si->cluster_info;
443
444 VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
445 cluster_list_del_first(&si->free_clusters, ci);
446 cluster_set_count_flag(ci + idx, 0, 0);
447}
448
449static void free_cluster(struct swap_info_struct *si, unsigned long idx)
450{
451 struct swap_cluster_info *ci = si->cluster_info + idx;
452
453 VM_BUG_ON(cluster_count(ci) != 0);
454
455
456
457
458
459 if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
460 (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
461 swap_cluster_schedule_discard(si, idx);
462 return;
463 }
464
465 __free_cluster(si, idx);
466}
467
468
469
470
471
472static void inc_cluster_info_page(struct swap_info_struct *p,
473 struct swap_cluster_info *cluster_info, unsigned long page_nr)
474{
475 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
476
477 if (!cluster_info)
478 return;
479 if (cluster_is_free(&cluster_info[idx]))
480 alloc_cluster(p, idx);
481
482 VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
483 cluster_set_count(&cluster_info[idx],
484 cluster_count(&cluster_info[idx]) + 1);
485}
486
487
488
489
490
491
492static void dec_cluster_info_page(struct swap_info_struct *p,
493 struct swap_cluster_info *cluster_info, unsigned long page_nr)
494{
495 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
496
497 if (!cluster_info)
498 return;
499
500 VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
501 cluster_set_count(&cluster_info[idx],
502 cluster_count(&cluster_info[idx]) - 1);
503
504 if (cluster_count(&cluster_info[idx]) == 0)
505 free_cluster(p, idx);
506}
507
508
509
510
511
512static bool
513scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
514 unsigned long offset)
515{
516 struct percpu_cluster *percpu_cluster;
517 bool conflict;
518
519 offset /= SWAPFILE_CLUSTER;
520 conflict = !cluster_list_empty(&si->free_clusters) &&
521 offset != cluster_list_first(&si->free_clusters) &&
522 cluster_is_free(&si->cluster_info[offset]);
523
524 if (!conflict)
525 return false;
526
527 percpu_cluster = this_cpu_ptr(si->percpu_cluster);
528 cluster_set_null(&percpu_cluster->index);
529 return true;
530}
531
532
533
534
535
/*
 * Try to find a free slot inside the current CPU's percpu cluster.
 *
 * If the CPU has no cluster assigned, one is taken from the head of the
 * free list; if the free list is empty but discards are pending they are
 * run first (which may drop si->lock) and the search restarts.
 *
 * On success *offset and *scan_base are set to the free slot and true is
 * returned.  False is returned only when there is neither a free cluster
 * nor a pending discard.
 */
static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
	unsigned long *offset, unsigned long *scan_base)
{
	struct percpu_cluster *cluster;
	struct swap_cluster_info *ci;
	bool found_free;
	unsigned long tmp, max;

new_cluster:
	cluster = this_cpu_ptr(si->percpu_cluster);
	if (cluster_is_null(&cluster->index)) {
		if (!cluster_list_empty(&si->free_clusters)) {
			/* Adopt the first free cluster, starting at its first slot. */
			cluster->index = si->free_clusters.head;
			cluster->next = cluster_next(&cluster->index) *
					SWAPFILE_CLUSTER;
		} else if (!cluster_list_empty(&si->discard_clusters)) {
			/*
			 * Running the discards releases si->lock temporarily,
			 * so reload the scan position before retrying.
			 */
			swap_do_scheduled_discard(si);
			*scan_base = *offset = si->cluster_next;
			goto new_cluster;
		} else
			return false;
	}

	found_free = false;

	/* Scan from cluster->next up to the cluster end (or si->max). */
	tmp = cluster->next;
	max = min_t(unsigned long, si->max,
		    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
	if (tmp >= max) {
		/* This cluster is used up; pick another one. */
		cluster_set_null(&cluster->index);
		goto new_cluster;
	}
	ci = lock_cluster(si, tmp);
	while (tmp < max) {
		if (!si->swap_map[tmp]) {
			found_free = true;
			break;
		}
		tmp++;
	}
	unlock_cluster(ci);
	if (!found_free) {
		cluster_set_null(&cluster->index);
		goto new_cluster;
	}
	cluster->next = tmp + 1;
	*offset = tmp;
	*scan_base = tmp;
	return found_free;
}
594
595static void __del_from_avail_list(struct swap_info_struct *p)
596{
597 int nid;
598
599 for_each_node(nid)
600 plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
601}
602
603static void del_from_avail_list(struct swap_info_struct *p)
604{
605 spin_lock(&swap_avail_lock);
606 __del_from_avail_list(p);
607 spin_unlock(&swap_avail_lock);
608}
609
610static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
611 unsigned int nr_entries)
612{
613 unsigned int end = offset + nr_entries - 1;
614
615 if (offset == si->lowest_bit)
616 si->lowest_bit += nr_entries;
617 if (end == si->highest_bit)
618 si->highest_bit -= nr_entries;
619 si->inuse_pages += nr_entries;
620 if (si->inuse_pages == si->pages) {
621 si->lowest_bit = si->max;
622 si->highest_bit = 0;
623 del_from_avail_list(si);
624 }
625}
626
627static void add_to_avail_list(struct swap_info_struct *p)
628{
629 int nid;
630
631 spin_lock(&swap_avail_lock);
632 for_each_node(nid) {
633 WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
634 plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
635 }
636 spin_unlock(&swap_avail_lock);
637}
638
639static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
640 unsigned int nr_entries)
641{
642 unsigned long end = offset + nr_entries - 1;
643 void (*swap_slot_free_notify)(struct block_device *, unsigned long);
644
645 if (offset < si->lowest_bit)
646 si->lowest_bit = offset;
647 if (end > si->highest_bit) {
648 bool was_full = !si->highest_bit;
649
650 si->highest_bit = end;
651 if (was_full && (si->flags & SWP_WRITEOK))
652 add_to_avail_list(si);
653 }
654 atomic_long_add(nr_entries, &nr_swap_pages);
655 si->inuse_pages -= nr_entries;
656 if (si->flags & SWP_BLKDEV)
657 swap_slot_free_notify =
658 si->bdev->bd_disk->fops->swap_slot_free_notify;
659 else
660 swap_slot_free_notify = NULL;
661 while (offset <= end) {
662 frontswap_invalidate_page(si->type, offset);
663 if (swap_slot_free_notify)
664 swap_slot_free_notify(si->bdev, offset);
665 offset++;
666 }
667}
668
/*
 * Allocate up to @nr free slots on swap device @si.
 *
 * @usage is the value written into swap_map[] for each allocated slot,
 * and the resulting entries are stored in @slots.  @nr is capped at
 * SWAP_BATCH.  Returns the number of slots allocated (possibly 0).
 *
 * Must be called with si->lock held.  The lock is released and re-taken
 * around the long unlocked scans, around cond_resched(), and around the
 * attempt to reclaim a cache-only slot.  SWP_SCANNING is added to
 * si->flags for the duration of the call.
 */
static int scan_swap_map_slots(struct swap_info_struct *si,
			       unsigned char usage, int nr,
			       swp_entry_t slots[])
{
	struct swap_cluster_info *ci;
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;
	int n_ret = 0;

	if (nr > SWAP_BATCH)
		nr = SWAP_BATCH;

	/* Flag that a scan is in progress; removed again on every exit path. */
	si->flags += SWP_SCANNING;
	scan_base = offset = si->cluster_next;

	/* Devices with cluster_info allocate from the percpu cluster. */
	if (si->cluster_info) {
		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
			goto checks;
		else
			goto scan;
	}

	/* Without cluster_info: the current run of cluster_nr slots is used up. */
	if (unlikely(!si->cluster_nr--)) {
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
			/* Too few free pages for a whole empty run; do not search. */
			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			goto checks;
		}

		spin_unlock(&si->lock);

		/*
		 * Unlocked search, starting at lowest_bit, for a run of
		 * SWAPFILE_CLUSTER consecutive free slots.
		 */
		scan_base = offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Any used slot pushes the candidate run past it. */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&si->lock);
				/* Rewind to the first slot of the free run. */
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		/* No free run found: fall back to the original position. */
		offset = scan_base;
		spin_lock(&si->lock);
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
	}

checks:
	/* Reached with si->lock held; validates the candidate @offset. */
	if (si->cluster_info) {
		while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
			/* Stop here if some slots were already obtained. */
			if (n_ret)
				goto done;
			if (!scan_swap_map_try_ssd_cluster(si, &offset,
							   &scan_base))
				goto scan;
		}
	}
	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	if (!si->highest_bit)
		goto no_page;
	if (offset > si->highest_bit)
		scan_base = offset = si->lowest_bit;

	ci = lock_cluster(si, offset);
	/* Slot held only by the swap cache: try to reclaim it when swap is full. */
	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
		int swap_was_freed;
		unlock_cluster(ci);
		spin_unlock(&si->lock);
		swap_was_freed = __try_to_reclaim_swap(si, offset);
		spin_lock(&si->lock);
		/* Re-validate the same slot if it was freed, else move on. */
		if (swap_was_freed)
			goto checks;
		goto scan;
	}

	if (si->swap_map[offset]) {
		/* Slot is in use after all. */
		unlock_cluster(ci);
		if (!n_ret)
			goto scan;
		else
			goto done;
	}
	/* Claim the slot. */
	si->swap_map[offset] = usage;
	inc_cluster_info_page(si, si->cluster_info, offset);
	unlock_cluster(ci);

	swap_range_alloc(si, offset, 1);
	si->cluster_next = offset + 1;
	slots[n_ret++] = swp_entry(si->type, offset);

	/* Finished when enough slots were obtained or the end was reached. */
	if ((n_ret == nr) || (offset >= si->highest_bit))
		goto done;

	/* Latency budget exhausted: return what we have, or yield and go on. */
	if (unlikely(--latency_ration < 0)) {
		if (n_ret)
			goto done;
		spin_unlock(&si->lock);
		cond_resched();
		spin_lock(&si->lock);
		latency_ration = LATENCY_LIMIT;
	}

	/* With cluster_info, further slots come from the percpu cluster. */
	if (si->cluster_info) {
		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
			goto checks;
		else
			goto done;
	}

	/* Without cluster_info, try the next slot of the current run. */
	++offset;

	if (si->cluster_nr && !si->swap_map[offset]) {
		--si->cluster_nr;
		goto checks;
	}

done:
	si->flags -= SWP_SCANNING;
	return n_ret;

scan:
	/*
	 * Unlocked linear scan: first from offset + 1 up to highest_bit, then
	 * from lowest_bit up to scan_base.  Any candidate is re-validated
	 * under si->lock at "checks".
	 */
	spin_unlock(&si->lock);
	while (++offset <= si->highest_bit) {
		if (!si->swap_map[offset]) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	offset = si->lowest_bit;
	while (offset < scan_base) {
		if (!si->swap_map[offset]) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
		offset++;
	}
	spin_lock(&si->lock);

no_page:
	si->flags -= SWP_SCANNING;
	return n_ret;
}
865
866#ifdef CONFIG_THP_SWAP
867static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
868{
869 unsigned long idx;
870 struct swap_cluster_info *ci;
871 unsigned long offset, i;
872 unsigned char *map;
873
874 if (cluster_list_empty(&si->free_clusters))
875 return 0;
876
877 idx = cluster_list_first(&si->free_clusters);
878 offset = idx * SWAPFILE_CLUSTER;
879 ci = lock_cluster(si, offset);
880 alloc_cluster(si, idx);
881 cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
882
883 map = si->swap_map + offset;
884 for (i = 0; i < SWAPFILE_CLUSTER; i++)
885 map[i] = SWAP_HAS_CACHE;
886 unlock_cluster(ci);
887 swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
888 *slot = swp_entry(si->type, offset);
889
890 return 1;
891}
892
893static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
894{
895 unsigned long offset = idx * SWAPFILE_CLUSTER;
896 struct swap_cluster_info *ci;
897
898 ci = lock_cluster(si, offset);
899 cluster_set_count_flag(ci, 0, 0);
900 free_cluster(si, idx);
901 unlock_cluster(ci);
902 swap_range_free(si, offset, SWAPFILE_CLUSTER);
903}
904#else
/*
 * Stub used when CONFIG_THP_SWAP is not set: whole-cluster allocation is
 * not available, so warn once and report that nothing was allocated.
 */
static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
{
	VM_WARN_ON_ONCE(1);
	return 0;
}
910#endif
911
912static unsigned long scan_swap_map(struct swap_info_struct *si,
913 unsigned char usage)
914{
915 swp_entry_t entry;
916 int n_ret;
917
918 n_ret = scan_swap_map_slots(si, usage, 1, &entry);
919
920 if (n_ret)
921 return swp_offset(entry);
922 else
923 return 0;
924
925}
926
/*
 * Allocate up to @n_goal swap entries, or a single whole cluster when
 * @cluster is true, from the swap devices on the current node's
 * available list.  Entries are written to @swp_entries.
 *
 * nr_swap_pages is charged for the full goal up front; whatever was not
 * allocated is credited back before returning.  Returns the number of
 * entries placed in @swp_entries (0 if none).
 */
int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
{
	unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1;
	struct swap_info_struct *si, *next;
	long avail_pgs;
	int n_ret = 0;
	int node;

	/* Only a single cluster can be requested per call. */
	WARN_ON_ONCE(n_goal > 1 && cluster);

	avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages;
	if (avail_pgs <= 0)
		goto noswap;

	if (n_goal > SWAP_BATCH)
		n_goal = SWAP_BATCH;

	if (n_goal > avail_pgs)
		n_goal = avail_pgs;

	atomic_long_sub(n_goal * nr_pages, &nr_swap_pages);

	spin_lock(&swap_avail_lock);

start_over:
	node = numa_node_id();
	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
		/* Requeue si on the list so later callers rotate between devices. */
		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
		spin_unlock(&swap_avail_lock);
		spin_lock(&si->lock);
		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
			/* Device is full or not writable: it should not be listed. */
			spin_lock(&swap_avail_lock);
			if (plist_node_empty(&si->avail_lists[node])) {
				/* Already removed by someone else. */
				spin_unlock(&si->lock);
				goto nextsi;
			}
			WARN(!si->highest_bit,
			     "swap_info %d in list but !highest_bit\n",
			     si->type);
			WARN(!(si->flags & SWP_WRITEOK),
			     "swap_info %d in list but !SWP_WRITEOK\n",
			     si->type);
			__del_from_avail_list(si);
			spin_unlock(&si->lock);
			goto nextsi;
		}
		if (cluster) {
			/* Whole clusters are not taken from SWP_FILE devices. */
			if (!(si->flags & SWP_FILE))
				n_ret = swap_alloc_cluster(si, swp_entries);
		} else
			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
						    n_goal, swp_entries);
		spin_unlock(&si->lock);
		/* A cluster request only ever tries the first usable device. */
		if (n_ret || cluster)
			goto check_out;
		pr_debug("scan_swap_map of si %d failed to find offset\n",
			 si->type);

		spin_lock(&swap_avail_lock);
nextsi:
		/*
		 * swap_avail_lock was dropped while si was being worked on, so
		 * the list may have changed.  If the saved successor is no
		 * longer on the list, restart the walk from the beginning.
		 */
		if (plist_node_empty(&next->avail_lists[node]))
			goto start_over;
	}

	spin_unlock(&swap_avail_lock);

check_out:
	/* Credit back whatever part of the up-front charge was not used. */
	if (n_ret < n_goal)
		atomic_long_add((long)(n_goal - n_ret) * nr_pages,
				&nr_swap_pages);
noswap:
	return n_ret;
}
1013
1014
1015swp_entry_t get_swap_page_of_type(int type)
1016{
1017 struct swap_info_struct *si;
1018 pgoff_t offset;
1019
1020 si = swap_info[type];
1021 spin_lock(&si->lock);
1022 if (si && (si->flags & SWP_WRITEOK)) {
1023 atomic_long_dec(&nr_swap_pages);
1024
1025 offset = scan_swap_map(si, 1);
1026 if (offset) {
1027 spin_unlock(&si->lock);
1028 return swp_entry(type, offset);
1029 }
1030 atomic_long_inc(&nr_swap_pages);
1031 }
1032 spin_unlock(&si->lock);
1033 return (swp_entry_t) {0};
1034}
1035
1036static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
1037{
1038 struct swap_info_struct *p;
1039 unsigned long offset, type;
1040
1041 if (!entry.val)
1042 goto out;
1043 type = swp_type(entry);
1044 if (type >= nr_swapfiles)
1045 goto bad_nofile;
1046 p = swap_info[type];
1047 if (!(p->flags & SWP_USED))
1048 goto bad_device;
1049 offset = swp_offset(entry);
1050 if (offset >= p->max)
1051 goto bad_offset;
1052 return p;
1053
1054bad_offset:
1055 pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
1056 goto out;
1057bad_device:
1058 pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
1059 goto out;
1060bad_nofile:
1061 pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
1062out:
1063 return NULL;
1064}
1065
1066static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
1067{
1068 struct swap_info_struct *p;
1069
1070 p = __swap_info_get(entry);
1071 if (!p)
1072 goto out;
1073 if (!p->swap_map[swp_offset(entry)])
1074 goto bad_free;
1075 return p;
1076
1077bad_free:
1078 pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
1079 goto out;
1080out:
1081 return NULL;
1082}
1083
1084static struct swap_info_struct *swap_info_get(swp_entry_t entry)
1085{
1086 struct swap_info_struct *p;
1087
1088 p = _swap_info_get(entry);
1089 if (p)
1090 spin_lock(&p->lock);
1091 return p;
1092}
1093
1094static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
1095 struct swap_info_struct *q)
1096{
1097 struct swap_info_struct *p;
1098
1099 p = _swap_info_get(entry);
1100
1101 if (p != q) {
1102 if (q != NULL)
1103 spin_unlock(&q->lock);
1104 if (p != NULL)
1105 spin_lock(&p->lock);
1106 }
1107 return p;
1108}
1109
/*
 * Drop one reference of kind @usage from the slot of @entry.
 *
 * @usage == SWAP_HAS_CACHE clears the cache bit; any other value
 * decrements the map count (handling SWAP_MAP_SHMEM and continued
 * counts).  The slot is updated under the cluster lock, or under p->lock
 * when the device has no cluster_info.
 *
 * Returns the new count | cache-bit value.  When that value is 0 the
 * swap_map byte is left at SWAP_HAS_CACHE rather than 0, so the slot is
 * not yet reusable; callers in this file then call free_swap_slot().
 */
static unsigned char __swap_entry_free(struct swap_info_struct *p,
				       swp_entry_t entry, unsigned char usage)
{
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);
	unsigned char count;
	unsigned char has_cache;

	ci = lock_cluster_or_swap_info(p, offset);

	count = p->swap_map[offset];

	/* Split the byte into its cache bit and its count part. */
	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;

	if (usage == SWAP_HAS_CACHE) {
		VM_BUG_ON(!has_cache);
		has_cache = 0;
	} else if (count == SWAP_MAP_SHMEM) {
		/* The SWAP_MAP_SHMEM marker is dropped in one step. */
		count = 0;
	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
		if (count == COUNT_CONTINUED) {
			/* In-map part is zero: update the continuation first. */
			if (swap_count_continued(p, offset, count))
				count = SWAP_MAP_MAX | COUNT_CONTINUED;
			else
				count = SWAP_MAP_MAX;
		} else
			count--;
	}

	usage = count | has_cache;
	/* Keep a fully released slot marked as SWAP_HAS_CACHE for now. */
	p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;

	unlock_cluster_or_swap_info(p, ci);

	return usage;
}
1151
/*
 * Make the slot of @entry available for reuse.
 *
 * The slot must currently hold exactly SWAP_HAS_CACHE (checked by
 * VM_BUG_ON).  Its swap_map byte is zeroed, the cluster usage count is
 * decremented, the memcg swap charge is dropped and the range-free
 * bookkeeping is done.  Only the cluster lock is taken here; the caller
 * in this file (swapcache_free_entries) holds p->lock.
 */
static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
{
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);
	unsigned char count;

	ci = lock_cluster(p, offset);
	count = p->swap_map[offset];
	VM_BUG_ON(count != SWAP_HAS_CACHE);
	p->swap_map[offset] = 0;
	dec_cluster_info_page(p, p->cluster_info, offset);
	unlock_cluster(ci);

	mem_cgroup_uncharge_swap(entry, 1);
	swap_range_free(p, offset, 1);
}
1168
1169
1170
1171
1172
1173void swap_free(swp_entry_t entry)
1174{
1175 struct swap_info_struct *p;
1176
1177 p = _swap_info_get(entry);
1178 if (p) {
1179 if (!__swap_entry_free(p, entry, 1))
1180 free_swap_slot(entry);
1181 }
1182}
1183
1184
1185
1186
1187static void swapcache_free(swp_entry_t entry)
1188{
1189 struct swap_info_struct *p;
1190
1191 p = _swap_info_get(entry);
1192 if (p) {
1193 if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
1194 free_swap_slot(entry);
1195 }
1196}
1197
1198#ifdef CONFIG_THP_SWAP
/*
 * Drop the swap cache reference on every slot of the huge cluster that
 * starts at @entry.
 *
 * Three cases, based on how many slots hold only SWAP_HAS_CACHE:
 *  - none: simply clear the cache bit on every slot;
 *  - all:  free the whole cluster in one go under si->lock;
 *  - some: release the slots one by one through __swap_entry_free().
 * In every case the cluster loses its CLUSTER_FLAG_HUGE flag.
 */
static void swapcache_free_cluster(swp_entry_t entry)
{
	unsigned long offset = swp_offset(entry);
	unsigned long idx = offset / SWAPFILE_CLUSTER;
	struct swap_cluster_info *ci;
	struct swap_info_struct *si;
	unsigned char *map;
	unsigned int i, free_entries = 0;
	unsigned char val;

	si = _swap_info_get(entry);
	if (!si)
		return;

	ci = lock_cluster(si, offset);
	VM_BUG_ON(!cluster_is_huge(ci));
	map = si->swap_map + offset;
	/* Count the slots that are referenced by the swap cache only. */
	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
		val = map[i];
		VM_BUG_ON(!(val & SWAP_HAS_CACHE));
		if (val == SWAP_HAS_CACHE)
			free_entries++;
	}
	if (!free_entries) {
		/* Every slot is still mapped: only the cache bit goes away. */
		for (i = 0; i < SWAPFILE_CLUSTER; i++)
			map[i] &= ~SWAP_HAS_CACHE;
	}
	cluster_clear_huge(ci);
	unlock_cluster(ci);
	if (free_entries == SWAPFILE_CLUSTER) {
		/* Nothing else references the cluster: free it as a whole. */
		spin_lock(&si->lock);
		ci = lock_cluster(si, offset);
		memset(map, 0, SWAPFILE_CLUSTER);
		unlock_cluster(ci);
		mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
		swap_free_cluster(si, idx);
		spin_unlock(&si->lock);
	} else if (free_entries) {
		/* Mixed state: handle each slot individually. */
		for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
			if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
				free_swap_slot(entry);
		}
	}
}
1243
1244int split_swap_cluster(swp_entry_t entry)
1245{
1246 struct swap_info_struct *si;
1247 struct swap_cluster_info *ci;
1248 unsigned long offset = swp_offset(entry);
1249
1250 si = _swap_info_get(entry);
1251 if (!si)
1252 return -EBUSY;
1253 ci = lock_cluster(si, offset);
1254 cluster_clear_huge(ci);
1255 unlock_cluster(ci);
1256 return 0;
1257}
1258#else
/* No-op stub used when CONFIG_THP_SWAP is not set. */
static inline void swapcache_free_cluster(swp_entry_t entry)
{
}
1262#endif
1263
1264void put_swap_page(struct page *page, swp_entry_t entry)
1265{
1266 if (!PageTransHuge(page))
1267 swapcache_free(entry);
1268 else
1269 swapcache_free_cluster(entry);
1270}
1271
1272static int swp_entry_cmp(const void *ent1, const void *ent2)
1273{
1274 const swp_entry_t *e1 = ent1, *e2 = ent2;
1275
1276 return (int)swp_type(*e1) - (int)swp_type(*e2);
1277}
1278
1279void swapcache_free_entries(swp_entry_t *entries, int n)
1280{
1281 struct swap_info_struct *p, *prev;
1282 int i;
1283
1284 if (n <= 0)
1285 return;
1286
1287 prev = NULL;
1288 p = NULL;
1289
1290
1291
1292
1293
1294
1295 if (nr_swapfiles > 1)
1296 sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
1297 for (i = 0; i < n; ++i) {
1298 p = swap_info_get_cont(entries[i], prev);
1299 if (p)
1300 swap_entry_free(p, entries[i]);
1301 prev = p;
1302 }
1303 if (p)
1304 spin_unlock(&p->lock);
1305}
1306
1307
1308
1309
1310
1311
1312int page_swapcount(struct page *page)
1313{
1314 int count = 0;
1315 struct swap_info_struct *p;
1316 struct swap_cluster_info *ci;
1317 swp_entry_t entry;
1318 unsigned long offset;
1319
1320 entry.val = page_private(page);
1321 p = _swap_info_get(entry);
1322 if (p) {
1323 offset = swp_offset(entry);
1324 ci = lock_cluster_or_swap_info(p, offset);
1325 count = swap_count(p->swap_map[offset]);
1326 unlock_cluster_or_swap_info(p, ci);
1327 }
1328 return count;
1329}
1330
1331int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
1332{
1333 pgoff_t offset = swp_offset(entry);
1334
1335 return swap_count(si->swap_map[offset]);
1336}
1337
1338static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
1339{
1340 int count = 0;
1341 pgoff_t offset = swp_offset(entry);
1342 struct swap_cluster_info *ci;
1343
1344 ci = lock_cluster_or_swap_info(si, offset);
1345 count = swap_count(si->swap_map[offset]);
1346 unlock_cluster_or_swap_info(si, ci);
1347 return count;
1348}
1349
1350
1351
1352
1353
1354
1355int __swp_swapcount(swp_entry_t entry)
1356{
1357 int count = 0;
1358 struct swap_info_struct *si;
1359
1360 si = __swap_info_get(entry);
1361 if (si)
1362 count = swap_swapcount(si, entry);
1363 return count;
1364}
1365
1366
1367
1368
1369
/*
 * Return the full reference count of @entry, including any part stored
 * in continuation pages.  Returns 0 when the entry is not valid.
 *
 * The in-map byte holds the low part of the count.  If COUNT_CONTINUED is
 * set, further parts are read from the pages chained via page->lru behind
 * the page backing this part of swap_map; each part is weighted by a
 * growing multiplier.
 */
int swp_swapcount(swp_entry_t entry)
{
	int count, tmp_count, n;
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	struct page *page;
	pgoff_t offset;
	unsigned char *map;

	p = _swap_info_get(entry);
	if (!p)
		return 0;

	offset = swp_offset(entry);

	ci = lock_cluster_or_swap_info(p, offset);

	count = swap_count(p->swap_map[offset]);
	if (!(count & COUNT_CONTINUED))
		goto out;

	count &= ~COUNT_CONTINUED;
	/* Weight of the first continuation byte. */
	n = SWAP_MAP_MAX + 1;

	page = vmalloc_to_page(p->swap_map + offset);
	/* From here on, offset is the byte position within a page. */
	offset &= ~PAGE_MASK;
	VM_BUG_ON(page_private(page) != SWP_CONTINUED);

	do {
		page = list_next_entry(page, lru);
		map = kmap_atomic(page);
		tmp_count = map[offset];
		kunmap_atomic(map);

		count += (tmp_count & ~COUNT_CONTINUED) * n;
		n *= (SWAP_CONT_MAX + 1);
	} while (tmp_count & COUNT_CONTINUED);
out:
	unlock_cluster_or_swap_info(p, ci);
	return count;
}
1411
1412#ifdef CONFIG_THP_SWAP
1413static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
1414 swp_entry_t entry)
1415{
1416 struct swap_cluster_info *ci;
1417 unsigned char *map = si->swap_map;
1418 unsigned long roffset = swp_offset(entry);
1419 unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
1420 int i;
1421 bool ret = false;
1422
1423 ci = lock_cluster_or_swap_info(si, offset);
1424 if (!ci || !cluster_is_huge(ci)) {
1425 if (map[roffset] != SWAP_HAS_CACHE)
1426 ret = true;
1427 goto unlock_out;
1428 }
1429 for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1430 if (map[offset + i] != SWAP_HAS_CACHE) {
1431 ret = true;
1432 break;
1433 }
1434 }
1435unlock_out:
1436 unlock_cluster_or_swap_info(si, ci);
1437 return ret;
1438}
1439
1440static bool page_swapped(struct page *page)
1441{
1442 swp_entry_t entry;
1443 struct swap_info_struct *si;
1444
1445 if (likely(!PageTransCompound(page)))
1446 return page_swapcount(page) != 0;
1447
1448 page = compound_head(page);
1449 entry.val = page_private(page);
1450 si = _swap_info_get(entry);
1451 if (si)
1452 return swap_page_trans_huge_swapped(si, entry);
1453 return false;
1454}
1455
1456static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
1457 int *total_swapcount)
1458{
1459 int i, map_swapcount, _total_mapcount, _total_swapcount;
1460 unsigned long offset = 0;
1461 struct swap_info_struct *si;
1462 struct swap_cluster_info *ci = NULL;
1463 unsigned char *map = NULL;
1464 int mapcount, swapcount = 0;
1465
1466
1467 VM_BUG_ON_PAGE(PageHuge(page), page);
1468
1469 if (likely(!PageTransCompound(page))) {
1470 mapcount = atomic_read(&page->_mapcount) + 1;
1471 if (total_mapcount)
1472 *total_mapcount = mapcount;
1473 if (PageSwapCache(page))
1474 swapcount = page_swapcount(page);
1475 if (total_swapcount)
1476 *total_swapcount = swapcount;
1477 return mapcount + swapcount;
1478 }
1479
1480 page = compound_head(page);
1481
1482 _total_mapcount = _total_swapcount = map_swapcount = 0;
1483 if (PageSwapCache(page)) {
1484 swp_entry_t entry;
1485
1486 entry.val = page_private(page);
1487 si = _swap_info_get(entry);
1488 if (si) {
1489 map = si->swap_map;
1490 offset = swp_offset(entry);
1491 }
1492 }
1493 if (map)
1494 ci = lock_cluster(si, offset);
1495 for (i = 0; i < HPAGE_PMD_NR; i++) {
1496 mapcount = atomic_read(&page[i]._mapcount) + 1;
1497 _total_mapcount += mapcount;
1498 if (map) {
1499 swapcount = swap_count(map[offset + i]);
1500 _total_swapcount += swapcount;
1501 }
1502 map_swapcount = max(map_swapcount, mapcount + swapcount);
1503 }
1504 unlock_cluster(ci);
1505 if (PageDoubleMap(page)) {
1506 map_swapcount -= 1;
1507 _total_mapcount -= HPAGE_PMD_NR;
1508 }
1509 mapcount = compound_mapcount(page);
1510 map_swapcount += mapcount;
1511 _total_mapcount += mapcount;
1512 if (total_mapcount)
1513 *total_mapcount = _total_mapcount;
1514 if (total_swapcount)
1515 *total_swapcount = _total_swapcount;
1516
1517 return map_swapcount;
1518}
1519#else
/*
 * Fallback definitions for the #else branch of CONFIG_THP_SWAP: both
 * helpers reduce to the per-entry count.  Note that the first macro yields
 * the int returned by swap_swapcount(), which callers use as a truth value.
 */
#define swap_page_trans_huge_swapped(si, entry)	swap_swapcount(si, entry)
#define page_swapped(page)			(page_swapcount(page) != 0)
1522
/*
 * !CONFIG_THP_SWAP variant: the map count comes from
 * page_trans_huge_mapcount() (which also fills *@total_mapcount), and the
 * swap count is page_swapcount() when the page is in the swap cache.
 * Returns their sum; *@total_swapcount is written when non-NULL.
 */
static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
					 int *total_swapcount)
{
	int nr_swapped = 0;
	int nr_mapped;

	/* hugetlbfs pages must not reach this function */
	VM_BUG_ON_PAGE(PageHuge(page), page);

	nr_mapped = page_trans_huge_mapcount(page, total_mapcount);
	if (PageSwapCache(page))
		nr_swapped = page_swapcount(page);
	if (total_swapcount)
		*total_swapcount = nr_swapped;

	return nr_mapped + nr_swapped;
}
1538#endif
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550bool reuse_swap_page(struct page *page, int *total_map_swapcount)
1551{
1552 int count, total_mapcount, total_swapcount;
1553
1554 VM_BUG_ON_PAGE(!PageLocked(page), page);
1555 if (unlikely(PageKsm(page)))
1556 return false;
1557 count = page_trans_huge_map_swapcount(page, &total_mapcount,
1558 &total_swapcount);
1559 if (total_map_swapcount)
1560 *total_map_swapcount = total_mapcount + total_swapcount;
1561 if (count == 1 && PageSwapCache(page) &&
1562 (likely(!PageTransCompound(page)) ||
1563
1564 total_swapcount == page_swapcount(page))) {
1565 if (!PageWriteback(page)) {
1566 page = compound_head(page);
1567 delete_from_swap_cache(page);
1568 SetPageDirty(page);
1569 } else {
1570 swp_entry_t entry;
1571 struct swap_info_struct *p;
1572
1573 entry.val = page_private(page);
1574 p = swap_info_get(entry);
1575 if (p->flags & SWP_STABLE_WRITES) {
1576 spin_unlock(&p->lock);
1577 return false;
1578 }
1579 spin_unlock(&p->lock);
1580 }
1581 }
1582
1583 return count <= 1;
1584}
1585
1586
1587
1588
1589
/*
 * If the locked @page is in the swap cache, not under writeback and no
 * longer referenced from swap, drop it from the swap cache and mark it
 * dirty.  Returns 1 when the page was removed, 0 otherwise.
 */
int try_to_free_swap(struct page *page)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (!PageSwapCache(page) || PageWriteback(page) || page_swapped(page))
		return 0;

	/* Leave the swap cache alone while pm_suspended_storage() is true. */
	if (pm_suspended_storage())
		return 0;

	page = compound_head(page);
	delete_from_swap_cache(page);
	SetPageDirty(page);
	return 1;
}
1624
1625
1626
1627
1628
1629int free_swap_and_cache(swp_entry_t entry)
1630{
1631 struct swap_info_struct *p;
1632 struct page *page = NULL;
1633 unsigned char count;
1634
1635 if (non_swap_entry(entry))
1636 return 1;
1637
1638 p = _swap_info_get(entry);
1639 if (p) {
1640 count = __swap_entry_free(p, entry, 1);
1641 if (count == SWAP_HAS_CACHE &&
1642 !swap_page_trans_huge_swapped(p, entry)) {
1643 page = find_get_page(swap_address_space(entry),
1644 swp_offset(entry));
1645 if (page && !trylock_page(page)) {
1646 put_page(page);
1647 page = NULL;
1648 }
1649 } else if (!count)
1650 free_swap_slot(entry);
1651 }
1652 if (page) {
1653
1654
1655
1656
1657 if (PageSwapCache(page) && !PageWriteback(page) &&
1658 (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
1659 !swap_page_trans_huge_swapped(p, entry)) {
1660 page = compound_head(page);
1661 delete_from_swap_cache(page);
1662 SetPageDirty(page);
1663 }
1664 unlock_page(page);
1665 put_page(page);
1666 }
1667 return p != NULL;
1668}
1669
1670#ifdef CONFIG_HIBERNATION
1671
1672
1673
1674
1675
1676
1677
1678
1679int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
1680{
1681 struct block_device *bdev = NULL;
1682 int type;
1683
1684 if (device)
1685 bdev = bdget(device);
1686
1687 spin_lock(&swap_lock);
1688 for (type = 0; type < nr_swapfiles; type++) {
1689 struct swap_info_struct *sis = swap_info[type];
1690
1691 if (!(sis->flags & SWP_WRITEOK))
1692 continue;
1693
1694 if (!bdev) {
1695 if (bdev_p)
1696 *bdev_p = bdgrab(sis->bdev);
1697
1698 spin_unlock(&swap_lock);
1699 return type;
1700 }
1701 if (bdev == sis->bdev) {
1702 struct swap_extent *se = &sis->first_swap_extent;
1703
1704 if (se->start_block == offset) {
1705 if (bdev_p)
1706 *bdev_p = bdgrab(sis->bdev);
1707
1708 spin_unlock(&swap_lock);
1709 bdput(bdev);
1710 return type;
1711 }
1712 }
1713 }
1714 spin_unlock(&swap_lock);
1715 if (bdev)
1716 bdput(bdev);
1717
1718 return -ENODEV;
1719}
1720
1721
1722
1723
1724
1725sector_t swapdev_block(int type, pgoff_t offset)
1726{
1727 struct block_device *bdev;
1728
1729 if ((unsigned int)type >= nr_swapfiles)
1730 return 0;
1731 if (!(swap_info[type]->flags & SWP_WRITEOK))
1732 return 0;
1733 return map_swap_entry(swp_entry(type, offset), &bdev);
1734}
1735
1736
1737
1738
1739
1740
1741
1742unsigned int count_swap_pages(int type, int free)
1743{
1744 unsigned int n = 0;
1745
1746 spin_lock(&swap_lock);
1747 if ((unsigned int)type < nr_swapfiles) {
1748 struct swap_info_struct *sis = swap_info[type];
1749
1750 spin_lock(&sis->lock);
1751 if (sis->flags & SWP_WRITEOK) {
1752 n = sis->pages;
1753 if (free)
1754 n -= sis->inuse_pages;
1755 }
1756 spin_unlock(&sis->lock);
1757 }
1758 spin_unlock(&swap_lock);
1759 return n;
1760}
1761#endif
1762
1763static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
1764{
1765 return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
1766}
1767
1768
1769
1770
1771
1772
/*
 * Replace the swap pte for @entry at @addr in @vma with a mapping of @page.
 *
 * Returns 1 when the pte was replaced, 0 when the pte at @addr no longer
 * matches @entry once the page table lock is held, and -ENOMEM when
 * ksm_might_need_to_copy() returns NULL or the memcg charge fails.
 */
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, swp_entry_t entry, struct page *page)
{
	struct page *swapcache;
	struct mem_cgroup *memcg;
	spinlock_t *ptl;
	pte_t *pte;
	int ret = 1;

	/* Remember the caller's page; "page" may become a different page. */
	swapcache = page;
	page = ksm_might_need_to_copy(page, vma, addr);
	if (unlikely(!page))
		return -ENOMEM;

	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
				&memcg, false)) {
		ret = -ENOMEM;
		goto out_nolock;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	/* The pte is re-checked now that the page table lock is held. */
	if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
		mem_cgroup_cancel_charge(page, memcg, false);
		ret = 0;
		goto out;
	}

	/* One swap entry becomes one anonymous page in the mm counters. */
	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
	get_page(page);
	set_pte_at(vma->vm_mm, addr, pte,
		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
	if (page == swapcache) {
		page_add_anon_rmap(page, vma, addr, false);
		mem_cgroup_commit_charge(page, memcg, true, false);
	} else { /* a different page was returned by ksm_might_need_to_copy() */
		page_add_new_anon_rmap(page, vma, addr, false);
		mem_cgroup_commit_charge(page, memcg, false, false);
		lru_cache_add_active_or_unevictable(page, vma);
	}
	swap_free(entry);
	/* The page was just given a user mapping: move it to the active list. */
	activate_page(page);
out:
	pte_unmap_unlock(pte, ptl);
out_nolock:
	/*
	 * When a different page was returned above, it is unlocked and one
	 * reference to it is dropped here; the caller's page is untouched.
	 */
	if (page != swapcache) {
		unlock_page(page);
		put_page(page);
	}
	return ret;
}
1828
1829static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1830 unsigned long addr, unsigned long end,
1831 swp_entry_t entry, struct page *page)
1832{
1833 pte_t swp_pte = swp_entry_to_pte(entry);
1834 pte_t *pte;
1835 int ret = 0;
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846 pte = pte_offset_map(pmd, addr);
1847 do {
1848
1849
1850
1851
1852 if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
1853 pte_unmap(pte);
1854 ret = unuse_pte(vma, pmd, addr, entry, page);
1855 if (ret)
1856 goto out;
1857 pte = pte_offset_map(pmd, addr);
1858 }
1859 } while (pte++, addr += PAGE_SIZE, addr != end);
1860 pte_unmap(pte - 1);
1861out:
1862 return ret;
1863}
1864
1865static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
1866 unsigned long addr, unsigned long end,
1867 swp_entry_t entry, struct page *page)
1868{
1869 pmd_t *pmd;
1870 unsigned long next;
1871 int ret;
1872
1873 pmd = pmd_offset(pud, addr);
1874 do {
1875 cond_resched();
1876 next = pmd_addr_end(addr, end);
1877 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1878 continue;
1879 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
1880 if (ret)
1881 return ret;
1882 } while (pmd++, addr = next, addr != end);
1883 return 0;
1884}
1885
1886static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
1887 unsigned long addr, unsigned long end,
1888 swp_entry_t entry, struct page *page)
1889{
1890 pud_t *pud;
1891 unsigned long next;
1892 int ret;
1893
1894 pud = pud_offset(p4d, addr);
1895 do {
1896 next = pud_addr_end(addr, end);
1897 if (pud_none_or_clear_bad(pud))
1898 continue;
1899 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
1900 if (ret)
1901 return ret;
1902 } while (pud++, addr = next, addr != end);
1903 return 0;
1904}
1905
1906static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
1907 unsigned long addr, unsigned long end,
1908 swp_entry_t entry, struct page *page)
1909{
1910 p4d_t *p4d;
1911 unsigned long next;
1912 int ret;
1913
1914 p4d = p4d_offset(pgd, addr);
1915 do {
1916 next = p4d_addr_end(addr, end);
1917 if (p4d_none_or_clear_bad(p4d))
1918 continue;
1919 ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
1920 if (ret)
1921 return ret;
1922 } while (p4d++, addr = next, addr != end);
1923 return 0;
1924}
1925
1926static int unuse_vma(struct vm_area_struct *vma,
1927 swp_entry_t entry, struct page *page)
1928{
1929 pgd_t *pgd;
1930 unsigned long addr, end, next;
1931 int ret;
1932
1933 if (page_anon_vma(page)) {
1934 addr = page_address_in_vma(page, vma);
1935 if (addr == -EFAULT)
1936 return 0;
1937 else
1938 end = addr + PAGE_SIZE;
1939 } else {
1940 addr = vma->vm_start;
1941 end = vma->vm_end;
1942 }
1943
1944 pgd = pgd_offset(vma->vm_mm, addr);
1945 do {
1946 next = pgd_addr_end(addr, end);
1947 if (pgd_none_or_clear_bad(pgd))
1948 continue;
1949 ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
1950 if (ret)
1951 return ret;
1952 } while (pgd++, addr = next, addr != end);
1953 return 0;
1954}
1955
1956static int unuse_mm(struct mm_struct *mm,
1957 swp_entry_t entry, struct page *page)
1958{
1959 struct vm_area_struct *vma;
1960 int ret = 0;
1961
1962 if (!down_read_trylock(&mm->mmap_sem)) {
1963
1964
1965
1966
1967 activate_page(page);
1968 unlock_page(page);
1969 down_read(&mm->mmap_sem);
1970 lock_page(page);
1971 }
1972 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1973 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1974 break;
1975 cond_resched();
1976 }
1977 up_read(&mm->mmap_sem);
1978 return (ret < 0)? ret: 0;
1979}
1980
1981
1982
1983
1984
1985
1986static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1987 unsigned int prev, bool frontswap)
1988{
1989 unsigned int max = si->max;
1990 unsigned int i = prev;
1991 unsigned char count;
1992
1993
1994
1995
1996
1997
1998
1999 for (;;) {
2000 if (++i >= max) {
2001 if (!prev) {
2002 i = 0;
2003 break;
2004 }
2005
2006
2007
2008
2009 max = prev + 1;
2010 prev = 0;
2011 i = 1;
2012 }
2013 count = READ_ONCE(si->swap_map[i]);
2014 if (count && swap_count(count) != SWAP_MAP_BAD)
2015 if (!frontswap || frontswap_test(si, i))
2016 break;
2017 if ((i % LATENCY_LIMIT) == 0)
2018 cond_resched();
2019 }
2020 return i;
2021}
2022
2023
2024
2025
2026
2027
2028
2029
2030
/*
 * Try to bring every in-use slot of swap area @type back into memory.
 *
 * Each slot reported by find_next_to_unuse() is read into the swap cache;
 * the mm list is then walked with unuse_mm() to replace swap ptes that
 * reference it, and finally the page is dropped from the swap cache when
 * page_swapped() says it is no longer referenced.
 *
 * @frontswap:      passed through to find_next_to_unuse().
 * @pages_to_unuse: when @frontswap is set and this is non-zero, stop after
 *                  that many pages have been processed.
 *
 * Returns 0, -EINTR when a signal is pending, -ENOMEM when a still-used
 * slot could not be read, or a non-zero result from shmem_unuse() or
 * unuse_mm().
 */
int try_to_unuse(unsigned int type, bool frontswap,
		 unsigned long pages_to_unuse)
{
	struct swap_info_struct *si = swap_info[type];
	struct mm_struct *start_mm;
	/*
	 * The map slot is re-read many times below without a lock being
	 * held across the reads; hence the volatile qualifier.
	 */
	volatile unsigned char *swap_map;
	unsigned char swcount;
	struct page *page;
	swp_entry_t entry;
	unsigned int i = 0;
	int retval = 0;

	/*
	 * start_mm is where each mm-list walk begins.  It starts out as
	 * init_mm and is moved by the inner loop below; a reference is
	 * held on it throughout.
	 */
	start_mm = &init_mm;
	mmget(&init_mm);

	/* Keep going until find_next_to_unuse() reports no slot (0). */
	while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}

		/* Get the page for this slot into the swap cache. */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry,
					GFP_HIGHUSER_MOVABLE, NULL, 0, false);
		if (!page) {
			/*
			 * No page: if the slot is now free or bad there is
			 * nothing to do for it; otherwise treat the failure
			 * as out of memory.
			 */
			swcount = *swap_map;
			if (!swcount || swcount == SWAP_MAP_BAD)
				continue;
			retval = -ENOMEM;
			break;
		}

		/*
		 * If we hold the only remaining user reference to start_mm,
		 * drop it and fall back to init_mm.
		 */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			mmget(&init_mm);
		}

		/* Wait for read and writeback to finish, then lock the page. */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/* Try the cheap candidates first: shmem, then start_mm. */
		swcount = *swap_map;
		if (swap_count(swcount) == SWAP_MAP_SHMEM) {
			retval = shmem_unuse(entry, page);
			/*
			 * NOTE(review): neither path below unlocks or puts
			 * the page, so shmem_unuse() presumably consumes the
			 * page lock and reference -- confirm.
			 */
			if (retval < 0)
				break;
			continue;
		}
		if (swap_count(swcount) && start_mm != &init_mm)
			retval = unuse_mm(start_mm, entry, page);

		/* Still referenced: walk the mm list starting after start_mm. */
		if (swap_count(*swap_map)) {
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *prev_mm = start_mm;
			struct mm_struct *mm;

			mmget(new_start_mm);
			mmget(prev_mm);
			spin_lock(&mmlist_lock);
			while (swap_count(*swap_map) && !retval &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				/* Skip an mm whose user count already hit 0. */
				if (!mmget_not_zero(mm))
					continue;
				/*
				 * The reference on mm keeps our list position
				 * while mmlist_lock is dropped; the previous
				 * position's reference can now be released.
				 */
				spin_unlock(&mmlist_lock);
				mmput(prev_mm);
				prev_mm = mm;

				cond_resched();

				swcount = *swap_map;
				if (!swap_count(swcount)) /* any usage ? */
					;
				else if (mm == &init_mm)
					set_start_mm = 1;
				else
					retval = unuse_mm(mm, entry, page);

				/*
				 * The count dropped while visiting this mm:
				 * make it the starting point for later slots.
				 */
				if (set_start_mm && *swap_map < swcount) {
					mmput(new_start_mm);
					mmget(mm);
					new_start_mm = mm;
					set_start_mm = 0;
				}
				spin_lock(&mmlist_lock);
			}
			spin_unlock(&mmlist_lock);
			mmput(prev_mm);
			mmput(start_mm);
			start_mm = new_start_mm;
		}
		if (retval) {
			unlock_page(page);
			put_page(page);
			break;
		}

		/*
		 * The slot is still referenced and the cached page is dirty:
		 * write it out with a non-blocking (WB_SYNC_NONE) request.
		 * The page is locked again afterwards -- presumably because
		 * swap_writepage() returns with it unlocked; confirm.
		 */
		if (swap_count(*swap_map) &&
		     PageDirty(page) && PageSwapCache(page)) {
			struct writeback_control wbc = {
				.sync_mode = WB_SYNC_NONE,
			};

			swap_writepage(compound_head(page), &wbc);
			lock_page(page);
			wait_on_page_writeback(page);
		}

		/*
		 * Remove the page from the swap cache only if it is still
		 * cached under this very entry and no swap reference remains.
		 */
		if (PageSwapCache(page) &&
		    likely(page_private(page) == entry.val) &&
		    !page_swapped(page))
			delete_from_swap_cache(compound_head(page));

		/* The page is marked dirty whether or not it was removed. */
		SetPageDirty(page);
		unlock_page(page);
		put_page(page);

		cond_resched();
		/* Optional page budget, honoured only for frontswap callers. */
		if (frontswap && pages_to_unuse > 0) {
			if (!--pages_to_unuse)
				break;
		}
	}

	mmput(start_mm);
	return retval;
}
2254
2255
2256
2257
2258
2259
2260
2261static void drain_mmlist(void)
2262{
2263 struct list_head *p, *next;
2264 unsigned int type;
2265
2266 for (type = 0; type < nr_swapfiles; type++)
2267 if (swap_info[type]->inuse_pages)
2268 return;
2269 spin_lock(&mmlist_lock);
2270 list_for_each_safe(p, next, &init_mm.mmlist)
2271 list_del_init(p);
2272 spin_unlock(&mmlist_lock);
2273}
2274
2275
2276
2277
2278
2279
2280
2281static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
2282{
2283 struct swap_info_struct *sis;
2284 struct swap_extent *start_se;
2285 struct swap_extent *se;
2286 pgoff_t offset;
2287
2288 sis = swap_info[swp_type(entry)];
2289 *bdev = sis->bdev;
2290
2291 offset = swp_offset(entry);
2292 start_se = sis->curr_swap_extent;
2293 se = start_se;
2294
2295 for ( ; ; ) {
2296 if (se->start_page <= offset &&
2297 offset < (se->start_page + se->nr_pages)) {
2298 return se->start_block + (offset - se->start_page);
2299 }
2300 se = list_next_entry(se, list);
2301 sis->curr_swap_extent = se;
2302 BUG_ON(se == start_se);
2303 }
2304}
2305
2306
2307
2308
2309sector_t map_swap_page(struct page *page, struct block_device **bdev)
2310{
2311 swp_entry_t entry;
2312 entry.val = page_private(page);
2313 return map_swap_entry(entry, bdev);
2314}
2315
2316
2317
2318
2319static void destroy_swap_extents(struct swap_info_struct *sis)
2320{
2321 while (!list_empty(&sis->first_swap_extent.list)) {
2322 struct swap_extent *se;
2323
2324 se = list_first_entry(&sis->first_swap_extent.list,
2325 struct swap_extent, list);
2326 list_del(&se->list);
2327 kfree(se);
2328 }
2329
2330 if (sis->flags & SWP_FILE) {
2331 struct file *swap_file = sis->swap_file;
2332 struct address_space *mapping = swap_file->f_mapping;
2333
2334 sis->flags &= ~SWP_FILE;
2335 mapping->a_ops->swap_deactivate(swap_file);
2336 }
2337}
2338
2339
2340
2341
2342
2343
2344
2345int
2346add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
2347 unsigned long nr_pages, sector_t start_block)
2348{
2349 struct swap_extent *se;
2350 struct swap_extent *new_se;
2351 struct list_head *lh;
2352
2353 if (start_page == 0) {
2354 se = &sis->first_swap_extent;
2355 sis->curr_swap_extent = se;
2356 se->start_page = 0;
2357 se->nr_pages = nr_pages;
2358 se->start_block = start_block;
2359 return 1;
2360 } else {
2361 lh = sis->first_swap_extent.list.prev;
2362 se = list_entry(lh, struct swap_extent, list);
2363 BUG_ON(se->start_page + se->nr_pages != start_page);
2364 if (se->start_block + se->nr_pages == start_block) {
2365
2366 se->nr_pages += nr_pages;
2367 return 0;
2368 }
2369 }
2370
2371
2372
2373
2374 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
2375 if (new_se == NULL)
2376 return -ENOMEM;
2377 new_se->start_page = start_page;
2378 new_se->nr_pages = nr_pages;
2379 new_se->start_block = start_block;
2380
2381 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
2382 return 1;
2383}
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
2417{
2418 struct file *swap_file = sis->swap_file;
2419 struct address_space *mapping = swap_file->f_mapping;
2420 struct inode *inode = mapping->host;
2421 int ret;
2422
2423 if (S_ISBLK(inode->i_mode)) {
2424 ret = add_swap_extent(sis, 0, sis->max, 0);
2425 *span = sis->pages;
2426 return ret;
2427 }
2428
2429 if (mapping->a_ops->swap_activate) {
2430 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
2431 if (!ret) {
2432 sis->flags |= SWP_FILE;
2433 ret = add_swap_extent(sis, 0, sis->max, 0);
2434 *span = sis->pages;
2435 }
2436 return ret;
2437 }
2438
2439 return generic_swapfile_activate(sis, swap_file, span);
2440}
2441
2442static int swap_node(struct swap_info_struct *p)
2443{
2444 struct block_device *bdev;
2445
2446 if (p->bdev)
2447 bdev = p->bdev;
2448 else
2449 bdev = p->swap_file->f_inode->i_sb->s_bdev;
2450
2451 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
2452}
2453
2454static void _enable_swap_info(struct swap_info_struct *p, int prio,
2455 unsigned char *swap_map,
2456 struct swap_cluster_info *cluster_info)
2457{
2458 int i;
2459
2460 if (prio >= 0)
2461 p->prio = prio;
2462 else
2463 p->prio = --least_priority;
2464
2465
2466
2467
2468 p->list.prio = -p->prio;
2469 for_each_node(i) {
2470 if (p->prio >= 0)
2471 p->avail_lists[i].prio = -p->prio;
2472 else {
2473 if (swap_node(p) == i)
2474 p->avail_lists[i].prio = 1;
2475 else
2476 p->avail_lists[i].prio = -p->prio;
2477 }
2478 }
2479 p->swap_map = swap_map;
2480 p->cluster_info = cluster_info;
2481 p->flags |= SWP_WRITEOK;
2482 atomic_long_add(p->pages, &nr_swap_pages);
2483 total_swap_pages += p->pages;
2484
2485 assert_spin_locked(&swap_lock);
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496 plist_add(&p->list, &swap_active_head);
2497 add_to_avail_list(p);
2498}
2499
/*
 * Initialise frontswap for @p with @frontswap_map, then enable the swap
 * area through _enable_swap_info() while holding swap_lock and p->lock
 * (taken in that order).
 */
static void enable_swap_info(struct swap_info_struct *p, int prio,
				unsigned char *swap_map,
				struct swap_cluster_info *cluster_info,
				unsigned long *frontswap_map)
{
	/* frontswap_init() is called before either spinlock is taken */
	frontswap_init(p->type, frontswap_map);
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	_enable_swap_info(p, prio, swap_map, cluster_info);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
}
2512
/*
 * Re-enable swap area @p with the priority, swap_map and cluster_info it
 * already carries.  Takes swap_lock, then p->lock, around
 * _enable_swap_info().
 */
static void reinsert_swap_info(struct swap_info_struct *p)
{
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	_enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
}
2521
2522bool has_usable_swap(void)
2523{
2524 bool ret = true;
2525
2526 spin_lock(&swap_lock);
2527 if (plist_head_empty(&swap_active_head))
2528 ret = false;
2529 spin_unlock(&swap_lock);
2530 return ret;
2531}
2532
/*
 * swapoff(2): stop swapping to the file or device named by @specialfile.
 *
 * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, the error from
 * getname()/file_open_name(), -EINVAL when the file is not an active swap
 * area, -ENOMEM when security_vm_enough_memory_mm() refuses, or the error
 * from try_to_unuse() (in which case the area is re-enabled).
 */
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
	struct swap_info_struct *p = NULL;
	unsigned char *swap_map;
	struct swap_cluster_info *cluster_info;
	unsigned long *frontswap_map;
	struct file *swap_file, *victim;
	struct address_space *mapping;
	struct inode *inode;
	struct filename *pathname;
	int err, found = 0;
	unsigned int old_block_size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	BUG_ON(!current->mm);

	pathname = getname(specialfile);
	if (IS_ERR(pathname))
		return PTR_ERR(pathname);

	victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
	err = PTR_ERR(victim);
	if (IS_ERR(victim))
		goto out;

	/* Find the active, writable swap area sharing victim's mapping. */
	mapping = victim->f_mapping;
	spin_lock(&swap_lock);
	plist_for_each_entry(p, &swap_active_head, list) {
		if (p->flags & SWP_WRITEOK) {
			if (p->swap_file->f_mapping == mapping) {
				found = 1;
				break;
			}
		}
	}
	if (!found) {
		err = -EINVAL;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	/*
	 * Check that p->pages of memory can be accounted; the charge made by
	 * the check is undone again straight away on success.
	 */
	if (!security_vm_enough_memory_mm(current->mm, p->pages))
		vm_unacct_memory(p->pages);
	else {
		err = -ENOMEM;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	del_from_avail_list(p);
	spin_lock(&p->lock);
	if (p->prio < 0) {
		struct swap_info_struct *si = p;
		int nid;

		/*
		 * p had an automatically assigned priority: shift every area
		 * after it on the active list up by one to close the gap
		 * (avail_lists entries holding the special value 1 are left
		 * alone).
		 */
		plist_for_each_entry_continue(si, &swap_active_head, list) {
			si->prio++;
			si->list.prio--;
			for_each_node(nid) {
				if (si->avail_lists[nid].prio != 1)
					si->avail_lists[nid].prio--;
			}
		}
		least_priority++;
	}
	/* Take the area off the active list and out of the global totals. */
	plist_del(&p->list, &swap_active_head);
	atomic_long_sub(p->pages, &nr_swap_pages);
	total_swap_pages -= p->pages;
	p->flags &= ~SWP_WRITEOK;
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);

	disable_swap_slots_cache_lock();

	/* Pull everything back in; mark this task as the OOM origin meanwhile. */
	set_current_oom_origin();
	err = try_to_unuse(p->type, false, 0);
	clear_current_oom_origin();

	if (err) {
		/* re-insert swap space back into swap_list */
		reinsert_swap_info(p);
		reenable_swap_slots_cache_unlock();
		goto out_dput;
	}

	reenable_swap_slots_cache_unlock();

	flush_work(&p->discard_work);

	destroy_swap_extents(p);
	if (p->flags & SWP_CONTINUED)
		free_swap_count_continuations(p);

	/* Undo the nr_rotate_swap count for a rotational/bdev-less area. */
	if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
		atomic_dec(&nr_rotate_swap);

	mutex_lock(&swapon_mutex);
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	drain_mmlist();

	/*
	 * Poll, dropping both locks for one tick each round, until
	 * p->flags falls below SWP_SCANNING.
	 */
	p->highest_bit = 0;		/* cuts scans short */
	while (p->flags >= SWP_SCANNING) {
		spin_unlock(&p->lock);
		spin_unlock(&swap_lock);
		schedule_timeout_uninterruptible(1);
		spin_lock(&swap_lock);
		spin_lock(&p->lock);
	}

	/* Detach the area's resources under the locks; free them after. */
	swap_file = p->swap_file;
	old_block_size = p->old_block_size;
	p->swap_file = NULL;
	p->max = 0;
	swap_map = p->swap_map;
	p->swap_map = NULL;
	cluster_info = p->cluster_info;
	p->cluster_info = NULL;
	frontswap_map = frontswap_map_get(p);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
	frontswap_invalidate_area(p->type);
	frontswap_map_set(p, NULL);
	mutex_unlock(&swapon_mutex);
	free_percpu(p->percpu_cluster);
	p->percpu_cluster = NULL;
	vfree(swap_map);
	kvfree(cluster_info);
	kvfree(frontswap_map);
	/* Destroy swap account information */
	swap_cgroup_swapoff(p->type);
	exit_swap_address_space(p->type);

	inode = mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		/* Restore the saved block size and release the device. */
		struct block_device *bdev = I_BDEV(inode);
		set_blocksize(bdev, old_block_size);
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	} else {
		inode_lock(inode);
		inode->i_flags &= ~S_SWAPFILE;
		inode_unlock(inode);
	}
	filp_close(swap_file, NULL);

	/*
	 * Only now, with everything torn down, are the flags cleared; this
	 * also drops SWP_USED, which alloc_swap_info() tests when looking
	 * for a reusable slot.
	 */
	spin_lock(&swap_lock);
	p->flags = 0;
	spin_unlock(&swap_lock);

	err = 0;
	/* Notify pollers waiting on proc_poll_wait that the set changed. */
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

out_dput:
	filp_close(victim, NULL);
out:
	putname(pathname);
	return err;
}
2698
2699#ifdef CONFIG_PROC_FS
2700static __poll_t swaps_poll(struct file *file, poll_table *wait)
2701{
2702 struct seq_file *seq = file->private_data;
2703
2704 poll_wait(file, &proc_poll_wait, wait);
2705
2706 if (seq->poll_event != atomic_read(&proc_poll_event)) {
2707 seq->poll_event = atomic_read(&proc_poll_event);
2708 return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
2709 }
2710
2711 return EPOLLIN | EPOLLRDNORM;
2712}
2713
2714
2715static void *swap_start(struct seq_file *swap, loff_t *pos)
2716{
2717 struct swap_info_struct *si;
2718 int type;
2719 loff_t l = *pos;
2720
2721 mutex_lock(&swapon_mutex);
2722
2723 if (!l)
2724 return SEQ_START_TOKEN;
2725
2726 for (type = 0; type < nr_swapfiles; type++) {
2727 smp_rmb();
2728 si = swap_info[type];
2729 if (!(si->flags & SWP_USED) || !si->swap_map)
2730 continue;
2731 if (!--l)
2732 return si;
2733 }
2734
2735 return NULL;
2736}
2737
2738static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
2739{
2740 struct swap_info_struct *si = v;
2741 int type;
2742
2743 if (v == SEQ_START_TOKEN)
2744 type = 0;
2745 else
2746 type = si->type + 1;
2747
2748 for (; type < nr_swapfiles; type++) {
2749 smp_rmb();
2750 si = swap_info[type];
2751 if (!(si->flags & SWP_USED) || !si->swap_map)
2752 continue;
2753 ++*pos;
2754 return si;
2755 }
2756
2757 return NULL;
2758}
2759
/* seq_file ->stop: release the swapon_mutex taken in swap_start(). */
static void swap_stop(struct seq_file *swap, void *v)
{
	mutex_unlock(&swapon_mutex);
}
2764
2765static int swap_show(struct seq_file *swap, void *v)
2766{
2767 struct swap_info_struct *si = v;
2768 struct file *file;
2769 int len;
2770
2771 if (si == SEQ_START_TOKEN) {
2772 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
2773 return 0;
2774 }
2775
2776 file = si->swap_file;
2777 len = seq_file_path(swap, file, " \t\n\\");
2778 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
2779 len < 40 ? 40 - len : 1, " ",
2780 S_ISBLK(file_inode(file)->i_mode) ?
2781 "partition" : "file\t",
2782 si->pages << (PAGE_SHIFT - 10),
2783 si->inuse_pages << (PAGE_SHIFT - 10),
2784 si->prio);
2785 return 0;
2786}
2787
/* seq_file iterator callbacks for the swaps proc file (see swaps_open()). */
static const struct seq_operations swaps_op = {
	.start =	swap_start,
	.next =		swap_next,
	.stop =		swap_stop,
	.show =		swap_show
};
2794
2795static int swaps_open(struct inode *inode, struct file *file)
2796{
2797 struct seq_file *seq;
2798 int ret;
2799
2800 ret = seq_open(file, &swaps_op);
2801 if (ret)
2802 return ret;
2803
2804 seq = file->private_data;
2805 seq->poll_event = atomic_read(&proc_poll_event);
2806 return 0;
2807}
2808
/*
 * File operations for the "swaps" proc entry: seq_file based reading plus
 * a poll handler.
 */
static const struct file_operations proc_swaps_operations = {
	.open		= swaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
	.poll		= swaps_poll,
};
2816
/*
 * Register the "swaps" proc entry at init time.  The proc_create() result
 * is not checked; this always returns 0.
 */
static int __init procswaps_init(void)
{
	proc_create("swaps", 0, NULL, &proc_swaps_operations);
	return 0;
}
__initcall(procswaps_init);
2823#endif
2824
2825#ifdef MAX_SWAPFILES_CHECK
/*
 * Late initcall that invokes MAX_SWAPFILES_CHECK(), when that macro is
 * defined.  Always returns 0.
 */
static int __init max_swapfiles_check(void)
{
	MAX_SWAPFILES_CHECK();
	return 0;
}
late_initcall(max_swapfiles_check);
2832#endif
2833
2834static struct swap_info_struct *alloc_swap_info(void)
2835{
2836 struct swap_info_struct *p;
2837 unsigned int type;
2838 int i;
2839
2840 p = kzalloc(sizeof(*p), GFP_KERNEL);
2841 if (!p)
2842 return ERR_PTR(-ENOMEM);
2843
2844 spin_lock(&swap_lock);
2845 for (type = 0; type < nr_swapfiles; type++) {
2846 if (!(swap_info[type]->flags & SWP_USED))
2847 break;
2848 }
2849 if (type >= MAX_SWAPFILES) {
2850 spin_unlock(&swap_lock);
2851 kfree(p);
2852 return ERR_PTR(-EPERM);
2853 }
2854 if (type >= nr_swapfiles) {
2855 p->type = type;
2856 swap_info[type] = p;
2857
2858
2859
2860
2861
2862 smp_wmb();
2863 nr_swapfiles++;
2864 } else {
2865 kfree(p);
2866 p = swap_info[type];
2867
2868
2869
2870
2871 }
2872 INIT_LIST_HEAD(&p->first_swap_extent.list);
2873 plist_node_init(&p->list, 0);
2874 for_each_node(i)
2875 plist_node_init(&p->avail_lists[i], 0);
2876 p->flags = SWP_USED;
2877 spin_unlock(&swap_lock);
2878 spin_lock_init(&p->lock);
2879 spin_lock_init(&p->cont_lock);
2880
2881 return p;
2882}
2883
2884static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
2885{
2886 int error;
2887
2888 if (S_ISBLK(inode->i_mode)) {
2889 p->bdev = bdgrab(I_BDEV(inode));
2890 error = blkdev_get(p->bdev,
2891 FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
2892 if (error < 0) {
2893 p->bdev = NULL;
2894 return error;
2895 }
2896 p->old_block_size = block_size(p->bdev);
2897 error = set_blocksize(p->bdev, PAGE_SIZE);
2898 if (error < 0)
2899 return error;
2900 p->flags |= SWP_BLKDEV;
2901 } else if (S_ISREG(inode->i_mode)) {
2902 p->bdev = inode->i_sb->s_bdev;
2903 inode_lock(inode);
2904 if (IS_SWAPFILE(inode))
2905 return -EBUSY;
2906 } else
2907 return -EINVAL;
2908
2909 return 0;
2910}
2911
2912static unsigned long read_swap_header(struct swap_info_struct *p,
2913 union swap_header *swap_header,
2914 struct inode *inode)
2915{
2916 int i;
2917 unsigned long maxpages;
2918 unsigned long swapfilepages;
2919 unsigned long last_page;
2920
2921 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
2922 pr_err("Unable to find swap-space signature\n");
2923 return 0;
2924 }
2925
2926
2927 if (swab32(swap_header->info.version) == 1) {
2928 swab32s(&swap_header->info.version);
2929 swab32s(&swap_header->info.last_page);
2930 swab32s(&swap_header->info.nr_badpages);
2931 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2932 return 0;
2933 for (i = 0; i < swap_header->info.nr_badpages; i++)
2934 swab32s(&swap_header->info.badpages[i]);
2935 }
2936
2937 if (swap_header->info.version != 1) {
2938 pr_warn("Unable to handle swap header version %d\n",
2939 swap_header->info.version);
2940 return 0;
2941 }
2942
2943 p->lowest_bit = 1;
2944 p->cluster_next = 1;
2945 p->cluster_nr = 0;
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961 maxpages = swp_offset(pte_to_swp_entry(
2962 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
2963 last_page = swap_header->info.last_page;
2964 if (!last_page) {
2965 pr_warn("Empty swap-file\n");
2966 return 0;
2967 }
2968 if (last_page > maxpages) {
2969 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
2970 maxpages << (PAGE_SHIFT - 10),
2971 last_page << (PAGE_SHIFT - 10));
2972 }
2973 if (maxpages > last_page) {
2974 maxpages = last_page + 1;
2975
2976 if ((unsigned int)maxpages == 0)
2977 maxpages = UINT_MAX;
2978 }
2979 p->highest_bit = maxpages - 1;
2980
2981 if (!maxpages)
2982 return 0;
2983 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
2984 if (swapfilepages && maxpages > swapfilepages) {
2985 pr_warn("Swap area shorter than signature indicates\n");
2986 return 0;
2987 }
2988 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
2989 return 0;
2990 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2991 return 0;
2992
2993 return maxpages;
2994}
2995
/* Number of swap_cluster_info entries needed to cover L1_CACHE_BYTES. */
#define SWAP_CLUSTER_INFO_COLS						\
	DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))

/* Number of clusters needed to cover SWAP_ADDRESS_SPACE_PAGES pages. */
#define SWAP_CLUSTER_SPACE_COLS						\
	DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)

/* Column count used when interleaving free clusters: the larger of the two. */
#define SWAP_CLUSTER_COLS						\
	max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
3002
3003static int setup_swap_map_and_extents(struct swap_info_struct *p,
3004 union swap_header *swap_header,
3005 unsigned char *swap_map,
3006 struct swap_cluster_info *cluster_info,
3007 unsigned long maxpages,
3008 sector_t *span)
3009{
3010 unsigned int j, k;
3011 unsigned int nr_good_pages;
3012 int nr_extents;
3013 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
3014 unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
3015 unsigned long i, idx;
3016
3017 nr_good_pages = maxpages - 1;
3018
3019 cluster_list_init(&p->free_clusters);
3020 cluster_list_init(&p->discard_clusters);
3021
3022 for (i = 0; i < swap_header->info.nr_badpages; i++) {
3023 unsigned int page_nr = swap_header->info.badpages[i];
3024 if (page_nr == 0 || page_nr > swap_header->info.last_page)
3025 return -EINVAL;
3026 if (page_nr < maxpages) {
3027 swap_map[page_nr] = SWAP_MAP_BAD;
3028 nr_good_pages--;
3029
3030
3031
3032
3033 inc_cluster_info_page(p, cluster_info, page_nr);
3034 }
3035 }
3036
3037
3038 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
3039 inc_cluster_info_page(p, cluster_info, i);
3040
3041 if (nr_good_pages) {
3042 swap_map[0] = SWAP_MAP_BAD;
3043
3044
3045
3046
3047 inc_cluster_info_page(p, cluster_info, 0);
3048 p->max = maxpages;
3049 p->pages = nr_good_pages;
3050 nr_extents = setup_swap_extents(p, span);
3051 if (nr_extents < 0)
3052 return nr_extents;
3053 nr_good_pages = p->pages;
3054 }
3055 if (!nr_good_pages) {
3056 pr_warn("Empty swap-file\n");
3057 return -EINVAL;
3058 }
3059
3060 if (!cluster_info)
3061 return nr_extents;
3062
3063
3064
3065
3066
3067
3068 for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
3069 j = (k + col) % SWAP_CLUSTER_COLS;
3070 for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
3071 idx = i * SWAP_CLUSTER_COLS + j;
3072 if (idx >= nr_clusters)
3073 continue;
3074 if (cluster_count(&cluster_info[idx]))
3075 continue;
3076 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
3077 cluster_list_add_tail(&p->free_clusters, cluster_info,
3078 idx);
3079 }
3080 }
3081 return nr_extents;
3082}
3083
3084
3085
3086
3087
3088static bool swap_discardable(struct swap_info_struct *si)
3089{
3090 struct request_queue *q = bdev_get_queue(si->bdev);
3091
3092 if (!q || !blk_queue_discard(q))
3093 return false;
3094
3095 return true;
3096}
3097
3098SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
3099{
3100 struct swap_info_struct *p;
3101 struct filename *name;
3102 struct file *swap_file = NULL;
3103 struct address_space *mapping;
3104 int prio;
3105 int error;
3106 union swap_header *swap_header;
3107 int nr_extents;
3108 sector_t span;
3109 unsigned long maxpages;
3110 unsigned char *swap_map = NULL;
3111 struct swap_cluster_info *cluster_info = NULL;
3112 unsigned long *frontswap_map = NULL;
3113 struct page *page = NULL;
3114 struct inode *inode = NULL;
3115 bool inced_nr_rotate_swap = false;
3116
3117 if (swap_flags & ~SWAP_FLAGS_VALID)
3118 return -EINVAL;
3119
3120 if (!capable(CAP_SYS_ADMIN))
3121 return -EPERM;
3122
3123 if (!swap_avail_heads)
3124 return -ENOMEM;
3125
3126 p = alloc_swap_info();
3127 if (IS_ERR(p))
3128 return PTR_ERR(p);
3129
3130 INIT_WORK(&p->discard_work, swap_discard_work);
3131
3132 name = getname(specialfile);
3133 if (IS_ERR(name)) {
3134 error = PTR_ERR(name);
3135 name = NULL;
3136 goto bad_swap;
3137 }
3138 swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
3139 if (IS_ERR(swap_file)) {
3140 error = PTR_ERR(swap_file);
3141 swap_file = NULL;
3142 goto bad_swap;
3143 }
3144
3145 p->swap_file = swap_file;
3146 mapping = swap_file->f_mapping;
3147 inode = mapping->host;
3148
3149
3150 error = claim_swapfile(p, inode);
3151 if (unlikely(error))
3152 goto bad_swap;
3153
3154
3155
3156
3157 if (!mapping->a_ops->readpage) {
3158 error = -EINVAL;
3159 goto bad_swap;
3160 }
3161 page = read_mapping_page(mapping, 0, swap_file);
3162 if (IS_ERR(page)) {
3163 error = PTR_ERR(page);
3164 goto bad_swap;
3165 }
3166 swap_header = kmap(page);
3167
3168 maxpages = read_swap_header(p, swap_header, inode);
3169 if (unlikely(!maxpages)) {
3170 error = -EINVAL;
3171 goto bad_swap;
3172 }
3173
3174
3175 swap_map = vzalloc(maxpages);
3176 if (!swap_map) {
3177 error = -ENOMEM;
3178 goto bad_swap;
3179 }
3180
3181 if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
3182 p->flags |= SWP_STABLE_WRITES;
3183
3184 if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
3185 p->flags |= SWP_SYNCHRONOUS_IO;
3186
3187 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
3188 int cpu;
3189 unsigned long ci, nr_cluster;
3190
3191 p->flags |= SWP_SOLIDSTATE;
3192
3193
3194
3195
3196 p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
3197 nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
3198
3199 cluster_info = kvzalloc(nr_cluster * sizeof(*cluster_info),
3200 GFP_KERNEL);
3201 if (!cluster_info) {
3202 error = -ENOMEM;
3203 goto bad_swap;
3204 }
3205
3206 for (ci = 0; ci < nr_cluster; ci++)
3207 spin_lock_init(&((cluster_info + ci)->lock));
3208
3209 p->percpu_cluster = alloc_percpu(struct percpu_cluster);
3210 if (!p->percpu_cluster) {
3211 error = -ENOMEM;
3212 goto bad_swap;
3213 }
3214 for_each_possible_cpu(cpu) {
3215 struct percpu_cluster *cluster;
3216 cluster = per_cpu_ptr(p->percpu_cluster, cpu);
3217 cluster_set_null(&cluster->index);
3218 }
3219 } else {
3220 atomic_inc(&nr_rotate_swap);
3221 inced_nr_rotate_swap = true;
3222 }
3223
3224 error = swap_cgroup_swapon(p->type, maxpages);
3225 if (error)
3226 goto bad_swap;
3227
3228 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
3229 cluster_info, maxpages, &span);
3230 if (unlikely(nr_extents < 0)) {
3231 error = nr_extents;
3232 goto bad_swap;
3233 }
3234
3235 if (IS_ENABLED(CONFIG_FRONTSWAP))
3236 frontswap_map = kvzalloc(BITS_TO_LONGS(maxpages) * sizeof(long),
3237 GFP_KERNEL);
3238
3239 if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
3240
3241
3242
3243
3244
3245
3246 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
3247 SWP_PAGE_DISCARD);
3248
3249
3250
3251
3252
3253
3254
3255 if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
3256 p->flags &= ~SWP_PAGE_DISCARD;
3257 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
3258 p->flags &= ~SWP_AREA_DISCARD;
3259
3260
3261 if (p->flags & SWP_AREA_DISCARD) {
3262 int err = discard_swap(p);
3263 if (unlikely(err))
3264 pr_err("swapon: discard_swap(%p): %d\n",
3265 p, err);
3266 }
3267 }
3268
3269 error = init_swap_address_space(p->type, maxpages);
3270 if (error)
3271 goto bad_swap;
3272
3273 mutex_lock(&swapon_mutex);
3274 prio = -1;
3275 if (swap_flags & SWAP_FLAG_PREFER)
3276 prio =
3277 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
3278 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
3279
3280 pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
3281 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
3282 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
3283 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
3284 (p->flags & SWP_DISCARDABLE) ? "D" : "",
3285 (p->flags & SWP_AREA_DISCARD) ? "s" : "",
3286 (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
3287 (frontswap_map) ? "FS" : "");
3288
3289 mutex_unlock(&swapon_mutex);
3290 atomic_inc(&proc_poll_event);
3291 wake_up_interruptible(&proc_poll_wait);
3292
3293 if (S_ISREG(inode->i_mode))
3294 inode->i_flags |= S_SWAPFILE;
3295 error = 0;
3296 goto out;
3297bad_swap:
3298 free_percpu(p->percpu_cluster);
3299 p->percpu_cluster = NULL;
3300 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
3301 set_blocksize(p->bdev, p->old_block_size);
3302 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3303 }
3304 destroy_swap_extents(p);
3305 swap_cgroup_swapoff(p->type);
3306 spin_lock(&swap_lock);
3307 p->swap_file = NULL;
3308 p->flags = 0;
3309 spin_unlock(&swap_lock);
3310 vfree(swap_map);
3311 kvfree(cluster_info);
3312 kvfree(frontswap_map);
3313 if (inced_nr_rotate_swap)
3314 atomic_dec(&nr_rotate_swap);
3315 if (swap_file) {
3316 if (inode && S_ISREG(inode->i_mode)) {
3317 inode_unlock(inode);
3318 inode = NULL;
3319 }
3320 filp_close(swap_file, NULL);
3321 }
3322out:
3323 if (page && !IS_ERR(page)) {
3324 kunmap(page);
3325 put_page(page);
3326 }
3327 if (name)
3328 putname(name);
3329 if (inode && S_ISREG(inode->i_mode))
3330 inode_unlock(inode);
3331 if (!error)
3332 enable_swap_slots_cache();
3333 return error;
3334}
3335
3336void si_swapinfo(struct sysinfo *val)
3337{
3338 unsigned int type;
3339 unsigned long nr_to_be_unused = 0;
3340
3341 spin_lock(&swap_lock);
3342 for (type = 0; type < nr_swapfiles; type++) {
3343 struct swap_info_struct *si = swap_info[type];
3344
3345 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
3346 nr_to_be_unused += si->inuse_pages;
3347 }
3348 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
3349 val->totalswap = total_swap_pages + nr_to_be_unused;
3350 spin_unlock(&swap_lock);
3351}
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
3365{
3366 struct swap_info_struct *p;
3367 struct swap_cluster_info *ci;
3368 unsigned long offset, type;
3369 unsigned char count;
3370 unsigned char has_cache;
3371 int err = -EINVAL;
3372
3373 if (non_swap_entry(entry))
3374 goto out;
3375
3376 type = swp_type(entry);
3377 if (type >= nr_swapfiles)
3378 goto bad_file;
3379 p = swap_info[type];
3380 offset = swp_offset(entry);
3381 if (unlikely(offset >= p->max))
3382 goto out;
3383
3384 ci = lock_cluster_or_swap_info(p, offset);
3385
3386 count = p->swap_map[offset];
3387
3388
3389
3390
3391
3392 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
3393 err = -ENOENT;
3394 goto unlock_out;
3395 }
3396
3397 has_cache = count & SWAP_HAS_CACHE;
3398 count &= ~SWAP_HAS_CACHE;
3399 err = 0;
3400
3401 if (usage == SWAP_HAS_CACHE) {
3402
3403
3404 if (!has_cache && count)
3405 has_cache = SWAP_HAS_CACHE;
3406 else if (has_cache)
3407 err = -EEXIST;
3408 else
3409 err = -ENOENT;
3410
3411 } else if (count || has_cache) {
3412
3413 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
3414 count += usage;
3415 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
3416 err = -EINVAL;
3417 else if (swap_count_continued(p, offset, count))
3418 count = COUNT_CONTINUED;
3419 else
3420 err = -ENOMEM;
3421 } else
3422 err = -ENOENT;
3423
3424 p->swap_map[offset] = count | has_cache;
3425
3426unlock_out:
3427 unlock_cluster_or_swap_info(p, ci);
3428out:
3429 return err;
3430
3431bad_file:
3432 pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
3433 goto out;
3434}
3435
3436
3437
3438
3439
3440void swap_shmem_alloc(swp_entry_t entry)
3441{
3442 __swap_duplicate(entry, SWAP_MAP_SHMEM);
3443}
3444
3445
3446
3447
3448
3449
3450
3451
3452int swap_duplicate(swp_entry_t entry)
3453{
3454 int err = 0;
3455
3456 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
3457 err = add_swap_count_continuation(entry, GFP_ATOMIC);
3458 return err;
3459}
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469int swapcache_prepare(swp_entry_t entry)
3470{
3471 return __swap_duplicate(entry, SWAP_HAS_CACHE);
3472}
3473
3474struct swap_info_struct *swp_swap_info(swp_entry_t entry)
3475{
3476 return swap_info[swp_type(entry)];
3477}
3478
3479struct swap_info_struct *page_swap_info(struct page *page)
3480{
3481 swp_entry_t entry = { .val = page_private(page) };
3482 return swp_swap_info(entry);
3483}
3484
3485
3486
3487
3488struct address_space *__page_file_mapping(struct page *page)
3489{
3490 return page_swap_info(page)->swap_file->f_mapping;
3491}
3492EXPORT_SYMBOL_GPL(__page_file_mapping);
3493
3494pgoff_t __page_file_index(struct page *page)
3495{
3496 swp_entry_t swap = { .val = page_private(page) };
3497 return swp_offset(swap);
3498}
3499EXPORT_SYMBOL_GPL(__page_file_index);
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
3517{
3518 struct swap_info_struct *si;
3519 struct swap_cluster_info *ci;
3520 struct page *head;
3521 struct page *page;
3522 struct page *list_page;
3523 pgoff_t offset;
3524 unsigned char count;
3525
3526
3527
3528
3529
3530 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
3531
3532 si = swap_info_get(entry);
3533 if (!si) {
3534
3535
3536
3537
3538
3539 goto outer;
3540 }
3541
3542 offset = swp_offset(entry);
3543
3544 ci = lock_cluster(si, offset);
3545
3546 count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
3547
3548 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
3549
3550
3551
3552
3553
3554 goto out;
3555 }
3556
3557 if (!page) {
3558 unlock_cluster(ci);
3559 spin_unlock(&si->lock);
3560 return -ENOMEM;
3561 }
3562
3563
3564
3565
3566
3567
3568 head = vmalloc_to_page(si->swap_map + offset);
3569 offset &= ~PAGE_MASK;
3570
3571 spin_lock(&si->cont_lock);
3572
3573
3574
3575
3576 if (!page_private(head)) {
3577 BUG_ON(count & COUNT_CONTINUED);
3578 INIT_LIST_HEAD(&head->lru);
3579 set_page_private(head, SWP_CONTINUED);
3580 si->flags |= SWP_CONTINUED;
3581 }
3582
3583 list_for_each_entry(list_page, &head->lru, lru) {
3584 unsigned char *map;
3585
3586
3587
3588
3589
3590 if (!(count & COUNT_CONTINUED))
3591 goto out_unlock_cont;
3592
3593 map = kmap_atomic(list_page) + offset;
3594 count = *map;
3595 kunmap_atomic(map);
3596
3597
3598
3599
3600
3601 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
3602 goto out_unlock_cont;
3603 }
3604
3605 list_add_tail(&page->lru, &head->lru);
3606 page = NULL;
3607out_unlock_cont:
3608 spin_unlock(&si->cont_lock);
3609out:
3610 unlock_cluster(ci);
3611 spin_unlock(&si->lock);
3612outer:
3613 if (page)
3614 __free_page(page);
3615 return 0;
3616}
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627static bool swap_count_continued(struct swap_info_struct *si,
3628 pgoff_t offset, unsigned char count)
3629{
3630 struct page *head;
3631 struct page *page;
3632 unsigned char *map;
3633 bool ret;
3634
3635 head = vmalloc_to_page(si->swap_map + offset);
3636 if (page_private(head) != SWP_CONTINUED) {
3637 BUG_ON(count & COUNT_CONTINUED);
3638 return false;
3639 }
3640
3641 spin_lock(&si->cont_lock);
3642 offset &= ~PAGE_MASK;
3643 page = list_entry(head->lru.next, struct page, lru);
3644 map = kmap_atomic(page) + offset;
3645
3646 if (count == SWAP_MAP_MAX)
3647 goto init_map;
3648
3649 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) {
3650
3651
3652
3653 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
3654 kunmap_atomic(map);
3655 page = list_entry(page->lru.next, struct page, lru);
3656 BUG_ON(page == head);
3657 map = kmap_atomic(page) + offset;
3658 }
3659 if (*map == SWAP_CONT_MAX) {
3660 kunmap_atomic(map);
3661 page = list_entry(page->lru.next, struct page, lru);
3662 if (page == head) {
3663 ret = false;
3664 goto out;
3665 }
3666 map = kmap_atomic(page) + offset;
3667init_map: *map = 0;
3668 }
3669 *map += 1;
3670 kunmap_atomic(map);
3671 page = list_entry(page->lru.prev, struct page, lru);
3672 while (page != head) {
3673 map = kmap_atomic(page) + offset;
3674 *map = COUNT_CONTINUED;
3675 kunmap_atomic(map);
3676 page = list_entry(page->lru.prev, struct page, lru);
3677 }
3678 ret = true;
3679
3680 } else {
3681
3682
3683
3684 BUG_ON(count != COUNT_CONTINUED);
3685 while (*map == COUNT_CONTINUED) {
3686 kunmap_atomic(map);
3687 page = list_entry(page->lru.next, struct page, lru);
3688 BUG_ON(page == head);
3689 map = kmap_atomic(page) + offset;
3690 }
3691 BUG_ON(*map == 0);
3692 *map -= 1;
3693 if (*map == 0)
3694 count = 0;
3695 kunmap_atomic(map);
3696 page = list_entry(page->lru.prev, struct page, lru);
3697 while (page != head) {
3698 map = kmap_atomic(page) + offset;
3699 *map = SWAP_CONT_MAX | count;
3700 count = COUNT_CONTINUED;
3701 kunmap_atomic(map);
3702 page = list_entry(page->lru.prev, struct page, lru);
3703 }
3704 ret = count == COUNT_CONTINUED;
3705 }
3706out:
3707 spin_unlock(&si->cont_lock);
3708 return ret;
3709}
3710
3711
3712
3713
3714
3715static void free_swap_count_continuations(struct swap_info_struct *si)
3716{
3717 pgoff_t offset;
3718
3719 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
3720 struct page *head;
3721 head = vmalloc_to_page(si->swap_map + offset);
3722 if (page_private(head)) {
3723 struct page *page, *next;
3724
3725 list_for_each_entry_safe(page, next, &head->lru, lru) {
3726 list_del(&page->lru);
3727 __free_page(page);
3728 }
3729 }
3730 }
3731}
3732
3733static int __init swapfile_init(void)
3734{
3735 int nid;
3736
3737 swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
3738 GFP_KERNEL);
3739 if (!swap_avail_heads) {
3740 pr_emerg("Not enough memory for swap heads, swap is disabled\n");
3741 return -ENOMEM;
3742 }
3743
3744 for_each_node(nid)
3745 plist_head_init(&swap_avail_heads[nid]);
3746
3747 return 0;
3748}
3749subsys_initcall(swapfile_init);
3750