/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

8#include <linux/mm.h>
9#include <linux/sched/mm.h>
10#include <linux/sched/task.h>
11#include <linux/hugetlb.h>
12#include <linux/mman.h>
13#include <linux/slab.h>
14#include <linux/kernel_stat.h>
15#include <linux/swap.h>
16#include <linux/vmalloc.h>
17#include <linux/pagemap.h>
18#include <linux/namei.h>
19#include <linux/shmem_fs.h>
20#include <linux/blkdev.h>
21#include <linux/random.h>
22#include <linux/writeback.h>
23#include <linux/proc_fs.h>
24#include <linux/seq_file.h>
25#include <linux/init.h>
26#include <linux/ksm.h>
27#include <linux/rmap.h>
28#include <linux/security.h>
29#include <linux/backing-dev.h>
30#include <linux/mutex.h>
31#include <linux/capability.h>
32#include <linux/syscalls.h>
33#include <linux/memcontrol.h>
34#include <linux/poll.h>
35#include <linux/oom.h>
36#include <linux/frontswap.h>
37#include <linux/swapfile.h>
38#include <linux/export.h>
39#include <linux/swap_slots.h>
40#include <linux/sort.h>
41
42#include <asm/pgtable.h>
43#include <asm/tlbflush.h>
44#include <linux/swapops.h>
45#include <linux/swap_cgroup.h>
46
47static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
48 unsigned char);
49static void free_swap_count_continuations(struct swap_info_struct *);
50static sector_t map_swap_entry(swp_entry_t, struct block_device**);
51
52DEFINE_SPINLOCK(swap_lock);
53static unsigned int nr_swapfiles;
54atomic_long_t nr_swap_pages;
/*
 * Some modules use swappable objects and may try to swap them out under
 * memory pressure (via the shrinker). Before doing so, they may wish to
 * check to see if any swap space is available.
 */
60EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
62long total_swap_pages;
63static int least_priority = -1;
64
65static const char Bad_file[] = "Bad swap file entry ";
66static const char Unused_file[] = "Unused swap file entry ";
67static const char Bad_offset[] = "Bad swap offset entry ";
68static const char Unused_offset[] = "Unused swap offset entry ";
69
/*
 * all active swap_info_structs
 * protected with swap_lock, and ordered by priority.
 */
74PLIST_HEAD(swap_active_head);
75
/*
 * all available (active, not full) swap_info_structs
 * protected with swap_avail_lock, ordered by priority.
 * This is used by get_swap_pages() instead of swap_active_head
 * because swap_active_head includes all swap_info_structs,
 * but get_swap_pages() doesn't need to look at full ones.
 * This uses its own lock instead of swap_lock because when a
 * swap_info_struct changes between not-full/full, it needs to
 * add/remove itself to/from this list, but the swap_info_struct->lock
 * is already held at that point, so a separate lock is used for
 * the not-full/full transitions.
 */
88struct plist_head *swap_avail_heads;
89static DEFINE_SPINLOCK(swap_avail_lock);
90
91struct swap_info_struct *swap_info[MAX_SWAPFILES];
92
93static DEFINE_MUTEX(swapon_mutex);
94
95static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
96
97static atomic_t proc_poll_event = ATOMIC_INIT(0);
98
99atomic_t nr_rotate_swap = ATOMIC_INIT(0);
100
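/* Strip SWAP_HAS_CACHE: return only the usage count encoded in a swap_map entry. */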
101static inline unsigned char swap_count(unsigned char ent)
102{
103 return ent & ~SWAP_HAS_CACHE;
104}
105
/* returns 1 if swap entry is freed */
107static int
108__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
109{
110 swp_entry_t entry = swp_entry(si->type, offset);
111 struct page *page;
112 int ret = 0;
113
114 page = find_get_page(swap_address_space(entry), swp_offset(entry));
115 if (!page)
116 return 0;
 /*
  * This function is called from scan_swap_map() and it's called
  * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
  * We have to use trylock for avoiding deadlock. This is a special
  * case and you should use try_to_free_swap() with explicit lock_page()
  * in usual operations.
  */
124 if (trylock_page(page)) {
125 ret = try_to_free_swap(page);
126 unlock_page(page);
127 }
128 put_page(page);
129 return ret;
130}
131
/*
 * swapon tell device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
136static int discard_swap(struct swap_info_struct *si)
137{
138 struct swap_extent *se;
139 sector_t start_block;
140 sector_t nr_blocks;
141 int err = 0;
142
143
144 se = &si->first_swap_extent;
145 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
146 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
147 if (nr_blocks) {
148 err = blkdev_issue_discard(si->bdev, start_block,
149 nr_blocks, GFP_KERNEL, 0);
150 if (err)
151 return err;
152 cond_resched();
153 }
154
155 list_for_each_entry(se, &si->first_swap_extent.list, list) {
156 start_block = se->start_block << (PAGE_SHIFT - 9);
157 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
158
159 err = blkdev_issue_discard(si->bdev, start_block,
160 nr_blocks, GFP_KERNEL, 0);
161 if (err)
162 break;
163
164 cond_resched();
165 }
166 return err;
167}
168
/*
 * swap allocation tell device that a cluster of swap can now be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
173static void discard_swap_cluster(struct swap_info_struct *si,
174 pgoff_t start_page, pgoff_t nr_pages)
175{
176 struct swap_extent *se = si->curr_swap_extent;
177 int found_extent = 0;
178
179 while (nr_pages) {
180 if (se->start_page <= start_page &&
181 start_page < se->start_page + se->nr_pages) {
182 pgoff_t offset = start_page - se->start_page;
183 sector_t start_block = se->start_block + offset;
184 sector_t nr_blocks = se->nr_pages - offset;
185
186 if (nr_blocks > nr_pages)
187 nr_blocks = nr_pages;
188 start_page += nr_blocks;
189 nr_pages -= nr_blocks;
190
191 if (!found_extent++)
192 si->curr_swap_extent = se;
193
194 start_block <<= PAGE_SHIFT - 9;
195 nr_blocks <<= PAGE_SHIFT - 9;
196 if (blkdev_issue_discard(si->bdev, start_block,
197 nr_blocks, GFP_NOIO, 0))
198 break;
199 }
200
201 se = list_next_entry(se, list);
202 }
203}
204
205#ifdef CONFIG_THP_SWAP
206#define SWAPFILE_CLUSTER HPAGE_PMD_NR
207#else
208#define SWAPFILE_CLUSTER 256
209#endif
210#define LATENCY_LIMIT 256
211
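/*
 * The helpers below manipulate struct swap_cluster_info.  Its "data" field is
 * overloaded: it holds the usage count of an allocated cluster, or the index
 * of the next cluster while the cluster sits on a free/discard list.
 */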
212static inline void cluster_set_flag(struct swap_cluster_info *info,
213 unsigned int flag)
214{
215 info->flags = flag;
216}
217
218static inline unsigned int cluster_count(struct swap_cluster_info *info)
219{
220 return info->data;
221}
222
223static inline void cluster_set_count(struct swap_cluster_info *info,
224 unsigned int c)
225{
226 info->data = c;
227}
228
229static inline void cluster_set_count_flag(struct swap_cluster_info *info,
230 unsigned int c, unsigned int f)
231{
232 info->flags = f;
233 info->data = c;
234}
235
236static inline unsigned int cluster_next(struct swap_cluster_info *info)
237{
238 return info->data;
239}
240
241static inline void cluster_set_next(struct swap_cluster_info *info,
242 unsigned int n)
243{
244 info->data = n;
245}
246
247static inline void cluster_set_next_flag(struct swap_cluster_info *info,
248 unsigned int n, unsigned int f)
249{
250 info->flags = f;
251 info->data = n;
252}
253
254static inline bool cluster_is_free(struct swap_cluster_info *info)
255{
256 return info->flags & CLUSTER_FLAG_FREE;
257}
258
259static inline bool cluster_is_null(struct swap_cluster_info *info)
260{
261 return info->flags & CLUSTER_FLAG_NEXT_NULL;
262}
263
264static inline void cluster_set_null(struct swap_cluster_info *info)
265{
266 info->flags = CLUSTER_FLAG_NEXT_NULL;
267 info->data = 0;
268}
269
270static inline bool cluster_is_huge(struct swap_cluster_info *info)
271{
272 return info->flags & CLUSTER_FLAG_HUGE;
273}
274
275static inline void cluster_clear_huge(struct swap_cluster_info *info)
276{
277 info->flags &= ~CLUSTER_FLAG_HUGE;
278}
279
280static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
281 unsigned long offset)
282{
283 struct swap_cluster_info *ci;
284
285 ci = si->cluster_info;
286 if (ci) {
287 ci += offset / SWAPFILE_CLUSTER;
288 spin_lock(&ci->lock);
289 }
290 return ci;
291}
292
293static inline void unlock_cluster(struct swap_cluster_info *ci)
294{
295 if (ci)
296 spin_unlock(&ci->lock);
297}
298
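/*
 * Lock the cluster covering @offset when per-cluster locking (SSD) is in
 * use, otherwise fall back to taking si->lock; paired with
 * unlock_cluster_or_swap_info() below.
 */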
299static inline struct swap_cluster_info *lock_cluster_or_swap_info(
300 struct swap_info_struct *si,
301 unsigned long offset)
302{
303 struct swap_cluster_info *ci;
304
305 ci = lock_cluster(si, offset);
306 if (!ci)
307 spin_lock(&si->lock);
308
309 return ci;
310}
311
312static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
313 struct swap_cluster_info *ci)
314{
315 if (ci)
316 unlock_cluster(ci);
317 else
318 spin_unlock(&si->lock);
319}
320
321static inline bool cluster_list_empty(struct swap_cluster_list *list)
322{
323 return cluster_is_null(&list->head);
324}
325
326static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
327{
328 return cluster_next(&list->head);
329}
330
331static void cluster_list_init(struct swap_cluster_list *list)
332{
333 cluster_set_null(&list->head);
334 cluster_set_null(&list->tail);
335}
336
337static void cluster_list_add_tail(struct swap_cluster_list *list,
338 struct swap_cluster_info *ci,
339 unsigned int idx)
340{
341 if (cluster_list_empty(list)) {
342 cluster_set_next_flag(&list->head, idx, 0);
343 cluster_set_next_flag(&list->tail, idx, 0);
344 } else {
345 struct swap_cluster_info *ci_tail;
346 unsigned int tail = cluster_next(&list->tail);
347
 /*
  * Nested cluster lock, but both cluster locks are
  * only acquired when we held swap_info_struct->lock
  */
352 ci_tail = ci + tail;
353 spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
354 cluster_set_next(ci_tail, idx);
355 spin_unlock(&ci_tail->lock);
356 cluster_set_next_flag(&list->tail, idx, 0);
357 }
358}
359
360static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
361 struct swap_cluster_info *ci)
362{
363 unsigned int idx;
364
365 idx = cluster_next(&list->head);
366 if (cluster_next(&list->tail) == idx) {
367 cluster_set_null(&list->head);
368 cluster_set_null(&list->tail);
369 } else
370 cluster_set_next_flag(&list->head,
371 cluster_next(&ci[idx]), 0);
372
373 return idx;
374}
375
/* Add a cluster to discard list and schedule it to do discard */
377static void swap_cluster_schedule_discard(struct swap_info_struct *si,
378 unsigned int idx)
379{
 /*
  * If scan_swap_map() can't find a free cluster, it will check
  * si->swap_map directly. To make sure the discarding cluster isn't
  * taken by scan_swap_map(), mark the swap entries bad (occupied). It
  * will be cleared after discard
  */
386 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
387 SWAP_MAP_BAD, SWAPFILE_CLUSTER);
388
389 cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
390
391 schedule_work(&si->discard_work);
392}
393
394static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
395{
396 struct swap_cluster_info *ci = si->cluster_info;
397
398 cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
399 cluster_list_add_tail(&si->free_clusters, ci, idx);
400}
401
/*
 * Doing discard actually. After a cluster discard is finished, the cluster
 * will be added to free cluster list. caller should hold si->lock.
 */
406static void swap_do_scheduled_discard(struct swap_info_struct *si)
407{
408 struct swap_cluster_info *info, *ci;
409 unsigned int idx;
410
411 info = si->cluster_info;
412
413 while (!cluster_list_empty(&si->discard_clusters)) {
414 idx = cluster_list_del_first(&si->discard_clusters, info);
415 spin_unlock(&si->lock);
416
417 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
418 SWAPFILE_CLUSTER);
419
420 spin_lock(&si->lock);
421 ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
422 __free_cluster(si, idx);
423 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
424 0, SWAPFILE_CLUSTER);
425 unlock_cluster(ci);
426 }
427}
428
429static void swap_discard_work(struct work_struct *work)
430{
431 struct swap_info_struct *si;
432
433 si = container_of(work, struct swap_info_struct, discard_work);
434
435 spin_lock(&si->lock);
436 swap_do_scheduled_discard(si);
437 spin_unlock(&si->lock);
438}
439
440static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
441{
442 struct swap_cluster_info *ci = si->cluster_info;
443
444 VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
445 cluster_list_del_first(&si->free_clusters, ci);
446 cluster_set_count_flag(ci + idx, 0, 0);
447}
448
449static void free_cluster(struct swap_info_struct *si, unsigned long idx)
450{
451 struct swap_cluster_info *ci = si->cluster_info + idx;
452
453 VM_BUG_ON(cluster_count(ci) != 0);
454
 /*
  * If the swap is discardable, prepare discard the cluster
  * instead of freeing it immediately. The cluster will be freed after discard.
  */
459 if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
460 (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
461 swap_cluster_schedule_discard(si, idx);
462 return;
463 }
464
465 __free_cluster(si, idx);
466}
467
/*
 * The cluster corresponding to page_nr will be used. The cluster will be
 * removed from free cluster list and its usage counter will be increased.
 */
472static void inc_cluster_info_page(struct swap_info_struct *p,
473 struct swap_cluster_info *cluster_info, unsigned long page_nr)
474{
475 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
476
477 if (!cluster_info)
478 return;
479 if (cluster_is_free(&cluster_info[idx]))
480 alloc_cluster(p, idx);
481
482 VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
483 cluster_set_count(&cluster_info[idx],
484 cluster_count(&cluster_info[idx]) + 1);
485}
486
/*
 * The cluster corresponding to page_nr decreases one usage. If the usage
 * counter becomes 0, which means no page in the cluster is in using, we can
 * optionally discard the cluster and add it to free cluster list.
 */
492static void dec_cluster_info_page(struct swap_info_struct *p,
493 struct swap_cluster_info *cluster_info, unsigned long page_nr)
494{
495 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
496
497 if (!cluster_info)
498 return;
499
500 VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
501 cluster_set_count(&cluster_info[idx],
502 cluster_count(&cluster_info[idx]) - 1);
503
504 if (cluster_count(&cluster_info[idx]) == 0)
505 free_cluster(p, idx);
506}
507
/*
 * It's possible scan_swap_map() uses a free cluster in the middle of free
 * cluster list. Avoiding such abuse to avoid list corruption.
 */
512static bool
513scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
514 unsigned long offset)
515{
516 struct percpu_cluster *percpu_cluster;
517 bool conflict;
518
519 offset /= SWAPFILE_CLUSTER;
520 conflict = !cluster_list_empty(&si->free_clusters) &&
521 offset != cluster_list_first(&si->free_clusters) &&
522 cluster_is_free(&si->cluster_info[offset]);
523
524 if (!conflict)
525 return false;
526
527 percpu_cluster = this_cpu_ptr(si->percpu_cluster);
528 cluster_set_null(&percpu_cluster->index);
529 return true;
530}
531
/*
 * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
 * might involve allocating a new cluster for current CPU too.
 */
536static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
537 unsigned long *offset, unsigned long *scan_base)
538{
539 struct percpu_cluster *cluster;
540 struct swap_cluster_info *ci;
541 bool found_free;
542 unsigned long tmp, max;
543
544new_cluster:
545 cluster = this_cpu_ptr(si->percpu_cluster);
546 if (cluster_is_null(&cluster->index)) {
547 if (!cluster_list_empty(&si->free_clusters)) {
548 cluster->index = si->free_clusters.head;
549 cluster->next = cluster_next(&cluster->index) *
550 SWAPFILE_CLUSTER;
551 } else if (!cluster_list_empty(&si->discard_clusters)) {
 /*
  * we don't have free cluster but have some clusters in
  * discarding, do discard now and reclaim them
  */
556 swap_do_scheduled_discard(si);
557 *scan_base = *offset = si->cluster_next;
558 goto new_cluster;
559 } else
560 return false;
561 }
562
563 found_free = false;

 /*
  * Other CPUs can use our cluster if they can't find a free cluster,
  * check if there is still free entry in the cluster
  */
569 tmp = cluster->next;
570 max = min_t(unsigned long, si->max,
571 (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
572 if (tmp >= max) {
573 cluster_set_null(&cluster->index);
574 goto new_cluster;
575 }
576 ci = lock_cluster(si, tmp);
577 while (tmp < max) {
578 if (!si->swap_map[tmp]) {
579 found_free = true;
580 break;
581 }
582 tmp++;
583 }
584 unlock_cluster(ci);
585 if (!found_free) {
586 cluster_set_null(&cluster->index);
587 goto new_cluster;
588 }
589 cluster->next = tmp + 1;
590 *offset = tmp;
591 *scan_base = tmp;
592 return found_free;
593}
594
595static void __del_from_avail_list(struct swap_info_struct *p)
596{
597 int nid;
598
599 for_each_node(nid)
600 plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
601}
602
603static void del_from_avail_list(struct swap_info_struct *p)
604{
605 spin_lock(&swap_avail_lock);
606 __del_from_avail_list(p);
607 spin_unlock(&swap_avail_lock);
608}
609
610static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
611 unsigned int nr_entries)
612{
613 unsigned int end = offset + nr_entries - 1;
614
615 if (offset == si->lowest_bit)
616 si->lowest_bit += nr_entries;
617 if (end == si->highest_bit)
618 si->highest_bit -= nr_entries;
619 si->inuse_pages += nr_entries;
620 if (si->inuse_pages == si->pages) {
621 si->lowest_bit = si->max;
622 si->highest_bit = 0;
623 del_from_avail_list(si);
624 }
625}
626
627static void add_to_avail_list(struct swap_info_struct *p)
628{
629 int nid;
630
631 spin_lock(&swap_avail_lock);
632 for_each_node(nid) {
633 WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
634 plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
635 }
636 spin_unlock(&swap_avail_lock);
637}
638
639static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
640 unsigned int nr_entries)
641{
642 unsigned long end = offset + nr_entries - 1;
643 void (*swap_slot_free_notify)(struct block_device *, unsigned long);
644
645 if (offset < si->lowest_bit)
646 si->lowest_bit = offset;
647 if (end > si->highest_bit) {
648 bool was_full = !si->highest_bit;
649
650 si->highest_bit = end;
651 if (was_full && (si->flags & SWP_WRITEOK))
652 add_to_avail_list(si);
653 }
654 atomic_long_add(nr_entries, &nr_swap_pages);
655 si->inuse_pages -= nr_entries;
656 if (si->flags & SWP_BLKDEV)
657 swap_slot_free_notify =
658 si->bdev->bd_disk->fops->swap_slot_free_notify;
659 else
660 swap_slot_free_notify = NULL;
661 while (offset <= end) {
662 frontswap_invalidate_page(si->type, offset);
663 if (swap_slot_free_notify)
664 swap_slot_free_notify(si->bdev, offset);
665 offset++;
666 }
667}
668
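/*
 * Scan the swap map for free slots, preferring the per-cpu cluster on SSDs,
 * and record up to @nr allocated entries in @slots[].  Returns the number of
 * slots obtained; called and returns with si->lock held.
 */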
669static int scan_swap_map_slots(struct swap_info_struct *si,
670 unsigned char usage, int nr,
671 swp_entry_t slots[])
672{
673 struct swap_cluster_info *ci;
674 unsigned long offset;
675 unsigned long scan_base;
676 unsigned long last_in_cluster = 0;
677 int latency_ration = LATENCY_LIMIT;
678 int n_ret = 0;
679
680 if (nr > SWAP_BATCH)
681 nr = SWAP_BATCH;
682
 /*
  * We try to cluster swap pages by allocating them sequentially
  * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
  * way, however, we resort to first-free allocation, starting
  * a new cluster.  This prevents us from scattering swap pages
  * all over the entire swap partition, so that we reduce the
  * overall disk seek times between swap pages.  -- sct
  * But we do now try to find an empty cluster.  -Andrea
  * And we let swap pages go all over an SSD partition.  Hugh
  */

694 si->flags += SWP_SCANNING;
695 scan_base = offset = si->cluster_next;
696
 /* SSD algorithm */
698 if (si->cluster_info) {
699 if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
700 goto checks;
701 else
702 goto scan;
703 }
704
705 if (unlikely(!si->cluster_nr--)) {
706 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
707 si->cluster_nr = SWAPFILE_CLUSTER - 1;
708 goto checks;
709 }
710
711 spin_unlock(&si->lock);

 /*
  * If seek is expensive, start searching for new cluster from
  * start of partition, to minimize the span of allocated swap.
  * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
  * case, just handled by scan_swap_map_try_ssd_cluster() above.
  */
719 scan_base = offset = si->lowest_bit;
720 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
721
722
723 for (; last_in_cluster <= si->highest_bit; offset++) {
724 if (si->swap_map[offset])
725 last_in_cluster = offset + SWAPFILE_CLUSTER;
726 else if (offset == last_in_cluster) {
727 spin_lock(&si->lock);
728 offset -= SWAPFILE_CLUSTER - 1;
729 si->cluster_next = offset;
730 si->cluster_nr = SWAPFILE_CLUSTER - 1;
731 goto checks;
732 }
733 if (unlikely(--latency_ration < 0)) {
734 cond_resched();
735 latency_ration = LATENCY_LIMIT;
736 }
737 }
738
739 offset = scan_base;
740 spin_lock(&si->lock);
741 si->cluster_nr = SWAPFILE_CLUSTER - 1;
742 }
743
744checks:
745 if (si->cluster_info) {
746 while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
747
748 if (n_ret)
749 goto done;
750 if (!scan_swap_map_try_ssd_cluster(si, &offset,
751 &scan_base))
752 goto scan;
753 }
754 }
755 if (!(si->flags & SWP_WRITEOK))
756 goto no_page;
757 if (!si->highest_bit)
758 goto no_page;
759 if (offset > si->highest_bit)
760 scan_base = offset = si->lowest_bit;
761
762 ci = lock_cluster(si, offset);
763
764 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
765 int swap_was_freed;
766 unlock_cluster(ci);
767 spin_unlock(&si->lock);
768 swap_was_freed = __try_to_reclaim_swap(si, offset);
769 spin_lock(&si->lock);
770
771 if (swap_was_freed)
772 goto checks;
773 goto scan;
774 }
775
776 if (si->swap_map[offset]) {
777 unlock_cluster(ci);
778 if (!n_ret)
779 goto scan;
780 else
781 goto done;
782 }
783 si->swap_map[offset] = usage;
784 inc_cluster_info_page(si, si->cluster_info, offset);
785 unlock_cluster(ci);
786
787 swap_range_alloc(si, offset, 1);
788 si->cluster_next = offset + 1;
789 slots[n_ret++] = swp_entry(si->type, offset);
790
791
792 if ((n_ret == nr) || (offset >= si->highest_bit))
793 goto done;
794
795
796
797
798 if (unlikely(--latency_ration < 0)) {
799 if (n_ret)
800 goto done;
801 spin_unlock(&si->lock);
802 cond_resched();
803 spin_lock(&si->lock);
804 latency_ration = LATENCY_LIMIT;
805 }
806
807
808 if (si->cluster_info) {
809 if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
810 goto checks;
811 else
812 goto done;
813 }
814
815 ++offset;
816
817
818 if (si->cluster_nr && !si->swap_map[offset]) {
819 --si->cluster_nr;
820 goto checks;
821 }
822
823done:
824 si->flags -= SWP_SCANNING;
825 return n_ret;
826
827scan:
828 spin_unlock(&si->lock);
829 while (++offset <= si->highest_bit) {
830 if (!si->swap_map[offset]) {
831 spin_lock(&si->lock);
832 goto checks;
833 }
834 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
835 spin_lock(&si->lock);
836 goto checks;
837 }
838 if (unlikely(--latency_ration < 0)) {
839 cond_resched();
840 latency_ration = LATENCY_LIMIT;
841 }
842 }
843 offset = si->lowest_bit;
844 while (offset < scan_base) {
845 if (!si->swap_map[offset]) {
846 spin_lock(&si->lock);
847 goto checks;
848 }
849 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
850 spin_lock(&si->lock);
851 goto checks;
852 }
853 if (unlikely(--latency_ration < 0)) {
854 cond_resched();
855 latency_ration = LATENCY_LIMIT;
856 }
857 offset++;
858 }
859 spin_lock(&si->lock);
860
861no_page:
862 si->flags -= SWP_SCANNING;
863 return n_ret;
864}
865
866#ifdef CONFIG_THP_SWAP
867static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
868{
869 unsigned long idx;
870 struct swap_cluster_info *ci;
871 unsigned long offset, i;
872 unsigned char *map;
873
874 if (cluster_list_empty(&si->free_clusters))
875 return 0;
876
877 idx = cluster_list_first(&si->free_clusters);
878 offset = idx * SWAPFILE_CLUSTER;
879 ci = lock_cluster(si, offset);
880 alloc_cluster(si, idx);
881 cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
882
883 map = si->swap_map + offset;
884 for (i = 0; i < SWAPFILE_CLUSTER; i++)
885 map[i] = SWAP_HAS_CACHE;
886 unlock_cluster(ci);
887 swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
888 *slot = swp_entry(si->type, offset);
889
890 return 1;
891}
892
893static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
894{
895 unsigned long offset = idx * SWAPFILE_CLUSTER;
896 struct swap_cluster_info *ci;
897
898 ci = lock_cluster(si, offset);
899 cluster_set_count_flag(ci, 0, 0);
900 free_cluster(si, idx);
901 unlock_cluster(ci);
902 swap_range_free(si, offset, SWAPFILE_CLUSTER);
903}
904#else
905static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
906{
907 VM_WARN_ON_ONCE(1);
908 return 0;
909}
910#endif
911
912static unsigned long scan_swap_map(struct swap_info_struct *si,
913 unsigned char usage)
914{
915 swp_entry_t entry;
916 int n_ret;
917
918 n_ret = scan_swap_map_slots(si, usage, 1, &entry);
919
920 if (n_ret)
921 return swp_offset(entry);
922 else
923 return 0;
924
925}
926
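/*
 * Allocate up to @n_goal swap entries from the highest-priority device with
 * free space, or a single THP-sized cluster when @cluster is true.  Returns
 * the number of entries stored in @swp_entries[].
 */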
927int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
928{
929 unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1;
930 struct swap_info_struct *si, *next;
931 long avail_pgs;
932 int n_ret = 0;
933 int node;
934
 /* Only single cluster request supported */
936 WARN_ON_ONCE(n_goal > 1 && cluster);
937
938 avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages;
939 if (avail_pgs <= 0)
940 goto noswap;
941
942 if (n_goal > SWAP_BATCH)
943 n_goal = SWAP_BATCH;
944
945 if (n_goal > avail_pgs)
946 n_goal = avail_pgs;
947
948 atomic_long_sub(n_goal * nr_pages, &nr_swap_pages);
949
950 spin_lock(&swap_avail_lock);
951
952start_over:
953 node = numa_node_id();
954 plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
955
956 plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
957 spin_unlock(&swap_avail_lock);
958 spin_lock(&si->lock);
959 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
960 spin_lock(&swap_avail_lock);
961 if (plist_node_empty(&si->avail_lists[node])) {
962 spin_unlock(&si->lock);
963 goto nextsi;
964 }
965 WARN(!si->highest_bit,
966 "swap_info %d in list but !highest_bit\n",
967 si->type);
968 WARN(!(si->flags & SWP_WRITEOK),
969 "swap_info %d in list but !SWP_WRITEOK\n",
970 si->type);
971 __del_from_avail_list(si);
972 spin_unlock(&si->lock);
973 goto nextsi;
974 }
975 if (cluster) {
976 if (!(si->flags & SWP_FILE))
977 n_ret = swap_alloc_cluster(si, swp_entries);
978 } else
979 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
980 n_goal, swp_entries);
981 spin_unlock(&si->lock);
982 if (n_ret || cluster)
983 goto check_out;
984 pr_debug("scan_swap_map of si %d failed to find offset\n",
985 si->type);
986
987 spin_lock(&swap_avail_lock);
988nextsi:
 /*
  * if we got here, it's likely that si was almost full before,
  * and since scan_swap_map() can drop the si->lock, multiple
  * callers probably all tried to get a page from the same si
  * and it filled up before we could get one; or, the si filled
  * up between us dropping swap_avail_lock and taking si->lock.
  * Since we dropped the swap_avail_lock, the swap_avail_head
  * list may have been modified; so if next is still in the
  * swap_avail_head list then try it, otherwise start over
  * if we have not gotten any slots.
  */
1000 if (plist_node_empty(&next->avail_lists[node]))
1001 goto start_over;
1002 }
1003
1004 spin_unlock(&swap_avail_lock);
1005
1006check_out:
1007 if (n_ret < n_goal)
1008 atomic_long_add((long)(n_goal - n_ret) * nr_pages,
1009 &nr_swap_pages);
1010noswap:
1011 return n_ret;
1012}
1013
/* The only caller of this function is now suspend routine */
1015swp_entry_t get_swap_page_of_type(int type)
1016{
1017 struct swap_info_struct *si;
1018 pgoff_t offset;
1019
1020 si = swap_info[type];
1021 spin_lock(&si->lock);
1022 if (si && (si->flags & SWP_WRITEOK)) {
1023 atomic_long_dec(&nr_swap_pages);
1024
1025 offset = scan_swap_map(si, 1);
1026 if (offset) {
1027 spin_unlock(&si->lock);
1028 return swp_entry(type, offset);
1029 }
1030 atomic_long_inc(&nr_swap_pages);
1031 }
1032 spin_unlock(&si->lock);
1033 return (swp_entry_t) {0};
1034}
1035
1036static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
1037{
1038 struct swap_info_struct *p;
1039 unsigned long offset, type;
1040
1041 if (!entry.val)
1042 goto out;
1043 type = swp_type(entry);
1044 if (type >= nr_swapfiles)
1045 goto bad_nofile;
1046 p = swap_info[type];
1047 if (!(p->flags & SWP_USED))
1048 goto bad_device;
1049 offset = swp_offset(entry);
1050 if (offset >= p->max)
1051 goto bad_offset;
1052 return p;
1053
1054bad_offset:
1055 pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
1056 goto out;
1057bad_device:
1058 pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
1059 goto out;
1060bad_nofile:
1061 pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
1062out:
1063 return NULL;
1064}
1065
1066static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
1067{
1068 struct swap_info_struct *p;
1069
1070 p = __swap_info_get(entry);
1071 if (!p)
1072 goto out;
1073 if (!p->swap_map[swp_offset(entry)])
1074 goto bad_free;
1075 return p;
1076
1077bad_free:
1078 pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
1079 goto out;
1080out:
1081 return NULL;
1082}
1083
1084static struct swap_info_struct *swap_info_get(swp_entry_t entry)
1085{
1086 struct swap_info_struct *p;
1087
1088 p = _swap_info_get(entry);
1089 if (p)
1090 spin_lock(&p->lock);
1091 return p;
1092}
1093
1094static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
1095 struct swap_info_struct *q)
1096{
1097 struct swap_info_struct *p;
1098
1099 p = _swap_info_get(entry);
1100
1101 if (p != q) {
1102 if (q != NULL)
1103 spin_unlock(&q->lock);
1104 if (p != NULL)
1105 spin_lock(&p->lock);
1106 }
1107 return p;
1108}
1109
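/*
 * Drop one reference of kind @usage from @entry and return the remaining
 * count/flags.  When the last reference goes away the slot is parked as
 * SWAP_HAS_CACHE so it is not reused before free_swap_slot() completes the
 * free via swap_entry_free().
 */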
1110static unsigned char __swap_entry_free(struct swap_info_struct *p,
1111 swp_entry_t entry, unsigned char usage)
1112{
1113 struct swap_cluster_info *ci;
1114 unsigned long offset = swp_offset(entry);
1115 unsigned char count;
1116 unsigned char has_cache;
1117
1118 ci = lock_cluster_or_swap_info(p, offset);
1119
1120 count = p->swap_map[offset];
1121
1122 has_cache = count & SWAP_HAS_CACHE;
1123 count &= ~SWAP_HAS_CACHE;
1124
1125 if (usage == SWAP_HAS_CACHE) {
1126 VM_BUG_ON(!has_cache);
1127 has_cache = 0;
1128 } else if (count == SWAP_MAP_SHMEM) {
 /*
  * Or we could insist on shmem.c using a special
  * swap_shmem_free() and free_shmem_swap_and_cache()...
  */
1133 count = 0;
1134 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
1135 if (count == COUNT_CONTINUED) {
1136 if (swap_count_continued(p, offset, count))
1137 count = SWAP_MAP_MAX | COUNT_CONTINUED;
1138 else
1139 count = SWAP_MAP_MAX;
1140 } else
1141 count--;
1142 }
1143
1144 usage = count | has_cache;
1145 p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
1146
1147 unlock_cluster_or_swap_info(p, ci);
1148
1149 return usage;
1150}
1151
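/* Final free of a swap slot: clear its swap_map entry and return it to the device. */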
1152static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
1153{
1154 struct swap_cluster_info *ci;
1155 unsigned long offset = swp_offset(entry);
1156 unsigned char count;
1157
1158 ci = lock_cluster(p, offset);
1159 count = p->swap_map[offset];
1160 VM_BUG_ON(count != SWAP_HAS_CACHE);
1161 p->swap_map[offset] = 0;
1162 dec_cluster_info_page(p, p->cluster_info, offset);
1163 unlock_cluster(ci);
1164
1165 mem_cgroup_uncharge_swap(entry, 1);
1166 swap_range_free(p, offset, 1);
1167}
1168
/*
 * Caller has made sure that the swap device corresponding to entry
 * is still around or has not been recycled.
 */
1173void swap_free(swp_entry_t entry)
1174{
1175 struct swap_info_struct *p;
1176
1177 p = _swap_info_get(entry);
1178 if (p) {
1179 if (!__swap_entry_free(p, entry, 1))
1180 free_swap_slot(entry);
1181 }
1182}
1183
/*
 * Called after dropping swapcache to decrease refcnt to swap entries.
 */
1187static void swapcache_free(swp_entry_t entry)
1188{
1189 struct swap_info_struct *p;
1190
1191 p = _swap_info_get(entry);
1192 if (p) {
1193 if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
1194 free_swap_slot(entry);
1195 }
1196}
1197
1198#ifdef CONFIG_THP_SWAP
1199static void swapcache_free_cluster(swp_entry_t entry)
1200{
1201 unsigned long offset = swp_offset(entry);
1202 unsigned long idx = offset / SWAPFILE_CLUSTER;
1203 struct swap_cluster_info *ci;
1204 struct swap_info_struct *si;
1205 unsigned char *map;
1206 unsigned int i, free_entries = 0;
1207 unsigned char val;
1208
1209 si = _swap_info_get(entry);
1210 if (!si)
1211 return;
1212
1213 ci = lock_cluster(si, offset);
1214 VM_BUG_ON(!cluster_is_huge(ci));
1215 map = si->swap_map + offset;
1216 for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1217 val = map[i];
1218 VM_BUG_ON(!(val & SWAP_HAS_CACHE));
1219 if (val == SWAP_HAS_CACHE)
1220 free_entries++;
1221 }
1222 if (!free_entries) {
1223 for (i = 0; i < SWAPFILE_CLUSTER; i++)
1224 map[i] &= ~SWAP_HAS_CACHE;
1225 }
1226 cluster_clear_huge(ci);
1227 unlock_cluster(ci);
1228 if (free_entries == SWAPFILE_CLUSTER) {
1229 spin_lock(&si->lock);
1230 ci = lock_cluster(si, offset);
1231 memset(map, 0, SWAPFILE_CLUSTER);
1232 unlock_cluster(ci);
1233 mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
1234 swap_free_cluster(si, idx);
1235 spin_unlock(&si->lock);
1236 } else if (free_entries) {
1237 for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
1238 if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
1239 free_swap_slot(entry);
1240 }
1241 }
1242}
1243
1244int split_swap_cluster(swp_entry_t entry)
1245{
1246 struct swap_info_struct *si;
1247 struct swap_cluster_info *ci;
1248 unsigned long offset = swp_offset(entry);
1249
1250 si = _swap_info_get(entry);
1251 if (!si)
1252 return -EBUSY;
1253 ci = lock_cluster(si, offset);
1254 cluster_clear_huge(ci);
1255 unlock_cluster(ci);
1256 return 0;
1257}
1258#else
1259static inline void swapcache_free_cluster(swp_entry_t entry)
1260{
1261}
1262#endif
1263
1264void put_swap_page(struct page *page, swp_entry_t entry)
1265{
1266 if (!PageTransHuge(page))
1267 swapcache_free(entry);
1268 else
1269 swapcache_free_cluster(entry);
1270}
1271
1272static int swp_entry_cmp(const void *ent1, const void *ent2)
1273{
1274 const swp_entry_t *e1 = ent1, *e2 = ent2;
1275
1276 return (int)swp_type(*e1) - (int)swp_type(*e2);
1277}
1278
1279void swapcache_free_entries(swp_entry_t *entries, int n)
1280{
1281 struct swap_info_struct *p, *prev;
1282 int i;
1283
1284 if (n <= 0)
1285 return;
1286
1287 prev = NULL;
1288 p = NULL;
1289
 /*
  * Sort swap entries by swap device, so each lock is only taken once.
  * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
  * so low that it isn't necessary to optimize further.
  */
1295 if (nr_swapfiles > 1)
1296 sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
1297 for (i = 0; i < n; ++i) {
1298 p = swap_info_get_cont(entries[i], prev);
1299 if (p)
1300 swap_entry_free(p, entries[i]);
1301 prev = p;
1302 }
1303 if (p)
1304 spin_unlock(&p->lock);
1305}
1306
/*
 * How many references to page are currently swapped out?
 * This does not give an exact answer when swap count is continued,
 * but does include the high COUNT_CONTINUED flag to allow for that.
 */
1312int page_swapcount(struct page *page)
1313{
1314 int count = 0;
1315 struct swap_info_struct *p;
1316 struct swap_cluster_info *ci;
1317 swp_entry_t entry;
1318 unsigned long offset;
1319
1320 entry.val = page_private(page);
1321 p = _swap_info_get(entry);
1322 if (p) {
1323 offset = swp_offset(entry);
1324 ci = lock_cluster_or_swap_info(p, offset);
1325 count = swap_count(p->swap_map[offset]);
1326 unlock_cluster_or_swap_info(p, ci);
1327 }
1328 return count;
1329}
1330
1331static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
1332{
1333 int count = 0;
1334 pgoff_t offset = swp_offset(entry);
1335 struct swap_cluster_info *ci;
1336
1337 ci = lock_cluster_or_swap_info(si, offset);
1338 count = swap_count(si->swap_map[offset]);
1339 unlock_cluster_or_swap_info(si, ci);
1340 return count;
1341}
1342
/*
 * How many references to @entry are currently swapped out?
 * This does not give an exact answer when swap count is continued,
 * but does include the high COUNT_CONTINUED flag to allow for that.
 */
1348int __swp_swapcount(swp_entry_t entry)
1349{
1350 int count = 0;
1351 struct swap_info_struct *si;
1352
1353 si = __swap_info_get(entry);
1354 if (si)
1355 count = swap_swapcount(si, entry);
1356 return count;
1357}
1358
/*
 * How many references to @entry are currently swapped out?
 * This considers COUNT_CONTINUED so it returns exact answer.
 */
1363int swp_swapcount(swp_entry_t entry)
1364{
1365 int count, tmp_count, n;
1366 struct swap_info_struct *p;
1367 struct swap_cluster_info *ci;
1368 struct page *page;
1369 pgoff_t offset;
1370 unsigned char *map;
1371
1372 p = _swap_info_get(entry);
1373 if (!p)
1374 return 0;
1375
1376 offset = swp_offset(entry);
1377
1378 ci = lock_cluster_or_swap_info(p, offset);
1379
1380 count = swap_count(p->swap_map[offset]);
1381 if (!(count & COUNT_CONTINUED))
1382 goto out;
1383
1384 count &= ~COUNT_CONTINUED;
1385 n = SWAP_MAP_MAX + 1;
1386
1387 page = vmalloc_to_page(p->swap_map + offset);
1388 offset &= ~PAGE_MASK;
1389 VM_BUG_ON(page_private(page) != SWP_CONTINUED);
1390
1391 do {
1392 page = list_next_entry(page, lru);
1393 map = kmap_atomic(page);
1394 tmp_count = map[offset];
1395 kunmap_atomic(map);
1396
1397 count += (tmp_count & ~COUNT_CONTINUED) * n;
1398 n *= (SWAP_CONT_MAX + 1);
1399 } while (tmp_count & COUNT_CONTINUED);
1400out:
1401 unlock_cluster_or_swap_info(p, ci);
1402 return count;
1403}
1404
1405#ifdef CONFIG_THP_SWAP
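/*
 * Return true if any entry in the THP-sized cluster backing @entry still has
 * a reference other than SWAP_HAS_CACHE.
 */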
1406static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
1407 swp_entry_t entry)
1408{
1409 struct swap_cluster_info *ci;
1410 unsigned char *map = si->swap_map;
1411 unsigned long roffset = swp_offset(entry);
1412 unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
1413 int i;
1414 bool ret = false;
1415
1416 ci = lock_cluster_or_swap_info(si, offset);
1417 if (!ci || !cluster_is_huge(ci)) {
1418 if (map[roffset] != SWAP_HAS_CACHE)
1419 ret = true;
1420 goto unlock_out;
1421 }
1422 for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1423 if (map[offset + i] != SWAP_HAS_CACHE) {
1424 ret = true;
1425 break;
1426 }
1427 }
1428unlock_out:
1429 unlock_cluster_or_swap_info(si, ci);
1430 return ret;
1431}
1432
1433static bool page_swapped(struct page *page)
1434{
1435 swp_entry_t entry;
1436 struct swap_info_struct *si;
1437
1438 if (likely(!PageTransCompound(page)))
1439 return page_swapcount(page) != 0;
1440
1441 page = compound_head(page);
1442 entry.val = page_private(page);
1443 si = _swap_info_get(entry);
1444 if (si)
1445 return swap_page_trans_huge_swapped(si, entry);
1446 return false;
1447}
1448
1449static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
1450 int *total_swapcount)
1451{
1452 int i, map_swapcount, _total_mapcount, _total_swapcount;
1453 unsigned long offset = 0;
1454 struct swap_info_struct *si;
1455 struct swap_cluster_info *ci = NULL;
1456 unsigned char *map = NULL;
1457 int mapcount, swapcount = 0;
1458
1459
1460 VM_BUG_ON_PAGE(PageHuge(page), page);
1461
1462 if (likely(!PageTransCompound(page))) {
1463 mapcount = atomic_read(&page->_mapcount) + 1;
1464 if (total_mapcount)
1465 *total_mapcount = mapcount;
1466 if (PageSwapCache(page))
1467 swapcount = page_swapcount(page);
1468 if (total_swapcount)
1469 *total_swapcount = swapcount;
1470 return mapcount + swapcount;
1471 }
1472
1473 page = compound_head(page);
1474
1475 _total_mapcount = _total_swapcount = map_swapcount = 0;
1476 if (PageSwapCache(page)) {
1477 swp_entry_t entry;
1478
1479 entry.val = page_private(page);
1480 si = _swap_info_get(entry);
1481 if (si) {
1482 map = si->swap_map;
1483 offset = swp_offset(entry);
1484 }
1485 }
1486 if (map)
1487 ci = lock_cluster(si, offset);
1488 for (i = 0; i < HPAGE_PMD_NR; i++) {
1489 mapcount = atomic_read(&page[i]._mapcount) + 1;
1490 _total_mapcount += mapcount;
1491 if (map) {
1492 swapcount = swap_count(map[offset + i]);
1493 _total_swapcount += swapcount;
1494 }
1495 map_swapcount = max(map_swapcount, mapcount + swapcount);
1496 }
1497 unlock_cluster(ci);
1498 if (PageDoubleMap(page)) {
1499 map_swapcount -= 1;
1500 _total_mapcount -= HPAGE_PMD_NR;
1501 }
1502 mapcount = compound_mapcount(page);
1503 map_swapcount += mapcount;
1504 _total_mapcount += mapcount;
1505 if (total_mapcount)
1506 *total_mapcount = _total_mapcount;
1507 if (total_swapcount)
1508 *total_swapcount = _total_swapcount;
1509
1510 return map_swapcount;
1511}
1512#else
1513#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry)
1514#define page_swapped(page) (page_swapcount(page) != 0)
1515
1516static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
1517 int *total_swapcount)
1518{
1519 int mapcount, swapcount = 0;
1520
1521
1522 VM_BUG_ON_PAGE(PageHuge(page), page);
1523
1524 mapcount = page_trans_huge_mapcount(page, total_mapcount);
1525 if (PageSwapCache(page))
1526 swapcount = page_swapcount(page);
1527 if (total_swapcount)
1528 *total_swapcount = swapcount;
1529 return mapcount + swapcount;
1530}
1531#endif
1532
/*
 * We can write to an anon page without COW if there are no other references
 * to it.  And as a side-effect, free up its swap: because the old content
 * on disk will never be read, and seeking back there to write new content
 * later would only waste time away from clustering.
 *
 * NOTE: total_map_swapcount should not be relied upon by the caller if
 * reuse_swap_page() returns false, but it may be always overwritten
 * (see the other implementation for CONFIG_SWAP=n).
 */
1543bool reuse_swap_page(struct page *page, int *total_map_swapcount)
1544{
1545 int count, total_mapcount, total_swapcount;
1546
1547 VM_BUG_ON_PAGE(!PageLocked(page), page);
1548 if (unlikely(PageKsm(page)))
1549 return false;
1550 count = page_trans_huge_map_swapcount(page, &total_mapcount,
1551 &total_swapcount);
1552 if (total_map_swapcount)
1553 *total_map_swapcount = total_mapcount + total_swapcount;
1554 if (count == 1 && PageSwapCache(page) &&
1555 (likely(!PageTransCompound(page)) ||
1556
1557 total_swapcount == page_swapcount(page))) {
1558 if (!PageWriteback(page)) {
1559 page = compound_head(page);
1560 delete_from_swap_cache(page);
1561 SetPageDirty(page);
1562 } else {
1563 swp_entry_t entry;
1564 struct swap_info_struct *p;
1565
1566 entry.val = page_private(page);
1567 p = swap_info_get(entry);
1568 if (p->flags & SWP_STABLE_WRITES) {
1569 spin_unlock(&p->lock);
1570 return false;
1571 }
1572 spin_unlock(&p->lock);
1573 }
1574 }
1575
1576 return count <= 1;
1577}
1578
/*
 * If swap is getting full, or if there are no more mappings of this page,
 * then try_to_free_swap is called to free its swap space.
 */
1583int try_to_free_swap(struct page *page)
1584{
1585 VM_BUG_ON_PAGE(!PageLocked(page), page);
1586
1587 if (!PageSwapCache(page))
1588 return 0;
1589 if (PageWriteback(page))
1590 return 0;
1591 if (page_swapped(page))
1592 return 0;
1593
 /*
  * Once hibernation has begun to create its image of memory,
  * there's a danger that one of the calls to try_to_free_swap()
  * - most probably a call from __try_to_reclaim_swap() while
  * hibernation is allocating its own swap pages for the image,
  * but conceivably even a call from memory reclaim - will free
  * the swap from a page which has already been recorded in the
  * image as a clean swapcache page, and then reuse its swap for
  * another page of the image.  On waking from hibernation, the
  * original page might be freed under memory pressure, then
  * later read back in from swap, now with the wrong data.
  *
  * Hibernation suspends storage while it is writing the image
  * to disk so check that here.
  */
1609 if (pm_suspended_storage())
1610 return 0;
1611
1612 page = compound_head(page);
1613 delete_from_swap_cache(page);
1614 SetPageDirty(page);
1615 return 1;
1616}
1617
/*
 * Free the swap entry like above, but also try to
 * free the page cache entry if it is the last user.
 */
1622int free_swap_and_cache(swp_entry_t entry)
1623{
1624 struct swap_info_struct *p;
1625 struct page *page = NULL;
1626 unsigned char count;
1627
1628 if (non_swap_entry(entry))
1629 return 1;
1630
1631 p = _swap_info_get(entry);
1632 if (p) {
1633 count = __swap_entry_free(p, entry, 1);
1634 if (count == SWAP_HAS_CACHE &&
1635 !swap_page_trans_huge_swapped(p, entry)) {
1636 page = find_get_page(swap_address_space(entry),
1637 swp_offset(entry));
1638 if (page && !trylock_page(page)) {
1639 put_page(page);
1640 page = NULL;
1641 }
1642 } else if (!count)
1643 free_swap_slot(entry);
1644 }
1645 if (page) {
 /*
  * Not mapped elsewhere, or swap space full? Free it!
  * Also recheck PageSwapCache now page is locked (above).
  */
1650 if (PageSwapCache(page) && !PageWriteback(page) &&
1651 (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
1652 !swap_page_trans_huge_swapped(p, entry)) {
1653 page = compound_head(page);
1654 delete_from_swap_cache(page);
1655 SetPageDirty(page);
1656 }
1657 unlock_page(page);
1658 put_page(page);
1659 }
1660 return p != NULL;
1661}
1662
1663#ifdef CONFIG_HIBERNATION
/*
 * Find the swap type that corresponds to given device (if any).
 *
 * @offset - number of the PAGE_SIZE-sized block of the device, starting
 * from 0, in which the swap header is expected to be located.
 *
 * This is needed for the suspend to disk (aka swsusp).
 */
1672int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
1673{
1674 struct block_device *bdev = NULL;
1675 int type;
1676
1677 if (device)
1678 bdev = bdget(device);
1679
1680 spin_lock(&swap_lock);
1681 for (type = 0; type < nr_swapfiles; type++) {
1682 struct swap_info_struct *sis = swap_info[type];
1683
1684 if (!(sis->flags & SWP_WRITEOK))
1685 continue;
1686
1687 if (!bdev) {
1688 if (bdev_p)
1689 *bdev_p = bdgrab(sis->bdev);
1690
1691 spin_unlock(&swap_lock);
1692 return type;
1693 }
1694 if (bdev == sis->bdev) {
1695 struct swap_extent *se = &sis->first_swap_extent;
1696
1697 if (se->start_block == offset) {
1698 if (bdev_p)
1699 *bdev_p = bdgrab(sis->bdev);
1700
1701 spin_unlock(&swap_lock);
1702 bdput(bdev);
1703 return type;
1704 }
1705 }
1706 }
1707 spin_unlock(&swap_lock);
1708 if (bdev)
1709 bdput(bdev);
1710
1711 return -ENODEV;
1712}
1713
/*
 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
 * corresponding to given index in swap_info (swap type).
 */
1718sector_t swapdev_block(int type, pgoff_t offset)
1719{
1720 struct block_device *bdev;
1721
1722 if ((unsigned int)type >= nr_swapfiles)
1723 return 0;
1724 if (!(swap_info[type]->flags & SWP_WRITEOK))
1725 return 0;
1726 return map_swap_entry(swp_entry(type, offset), &bdev);
1727}
1728
/*
 * Return either the total number of swap pages of given type, or the number
 * of free pages of that type (depending on @free)
 *
 * This is needed for software suspend
 */
1735unsigned int count_swap_pages(int type, int free)
1736{
1737 unsigned int n = 0;
1738
1739 spin_lock(&swap_lock);
1740 if ((unsigned int)type < nr_swapfiles) {
1741 struct swap_info_struct *sis = swap_info[type];
1742
1743 spin_lock(&sis->lock);
1744 if (sis->flags & SWP_WRITEOK) {
1745 n = sis->pages;
1746 if (free)
1747 n -= sis->inuse_pages;
1748 }
1749 spin_unlock(&sis->lock);
1750 }
1751 spin_unlock(&swap_lock);
1752 return n;
1753}
1754#endif
1755
1756static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
1757{
1758 return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
1759}
1760
/*
 * No need to decide whether this PTE shares the swap entry with others,
 * just let do_wp_page work it out if a write is requested later - to
 * force COW, vm_page_prot omits write permission from any private vma.
 */
1766static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
1767 unsigned long addr, swp_entry_t entry, struct page *page)
1768{
1769 struct page *swapcache;
1770 struct mem_cgroup *memcg;
1771 spinlock_t *ptl;
1772 pte_t *pte;
1773 int ret = 1;
1774
1775 swapcache = page;
1776 page = ksm_might_need_to_copy(page, vma, addr);
1777 if (unlikely(!page))
1778 return -ENOMEM;
1779
1780 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
1781 &memcg, false)) {
1782 ret = -ENOMEM;
1783 goto out_nolock;
1784 }
1785
1786 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1787 if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
1788 mem_cgroup_cancel_charge(page, memcg, false);
1789 ret = 0;
1790 goto out;
1791 }
1792
1793 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
1794 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
1795 get_page(page);
1796 set_pte_at(vma->vm_mm, addr, pte,
1797 pte_mkold(mk_pte(page, vma->vm_page_prot)));
1798 if (page == swapcache) {
1799 page_add_anon_rmap(page, vma, addr, false);
1800 mem_cgroup_commit_charge(page, memcg, true, false);
1801 } else {
1802 page_add_new_anon_rmap(page, vma, addr, false);
1803 mem_cgroup_commit_charge(page, memcg, false, false);
1804 lru_cache_add_active_or_unevictable(page, vma);
1805 }
1806 swap_free(entry);
 /*
  * Move the page to the active list so it is not
  * immediately swapped out again after swapon.
  */
1811 activate_page(page);
1812out:
1813 pte_unmap_unlock(pte, ptl);
1814out_nolock:
1815 if (page != swapcache) {
1816 unlock_page(page);
1817 put_page(page);
1818 }
1819 return ret;
1820}
1821
1822static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1823 unsigned long addr, unsigned long end,
1824 swp_entry_t entry, struct page *page)
1825{
1826 pte_t swp_pte = swp_entry_to_pte(entry);
1827 pte_t *pte;
1828 int ret = 0;
1829
 /*
  * We don't need to take the pte lock for this scan: a quick
  * unlocked check against swp_pte is enough to skip ptes that
  * cannot match, and unuse_pte() retakes the pte lock and
  * re-checks the entry before touching anything.  Taking the
  * lock for every pte would make swapoff far more expensive.
  */
1839 pte = pte_offset_map(pmd, addr);
1840 do {
 /*
  * swapoff spends a _lot_ of time in this loop!
  * Test inline before going to call unuse_pte.
  */
1845 if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
1846 pte_unmap(pte);
1847 ret = unuse_pte(vma, pmd, addr, entry, page);
1848 if (ret)
1849 goto out;
1850 pte = pte_offset_map(pmd, addr);
1851 }
1852 } while (pte++, addr += PAGE_SIZE, addr != end);
1853 pte_unmap(pte - 1);
1854out:
1855 return ret;
1856}
1857
1858static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
1859 unsigned long addr, unsigned long end,
1860 swp_entry_t entry, struct page *page)
1861{
1862 pmd_t *pmd;
1863 unsigned long next;
1864 int ret;
1865
1866 pmd = pmd_offset(pud, addr);
1867 do {
1868 cond_resched();
1869 next = pmd_addr_end(addr, end);
1870 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1871 continue;
1872 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
1873 if (ret)
1874 return ret;
1875 } while (pmd++, addr = next, addr != end);
1876 return 0;
1877}
1878
1879static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
1880 unsigned long addr, unsigned long end,
1881 swp_entry_t entry, struct page *page)
1882{
1883 pud_t *pud;
1884 unsigned long next;
1885 int ret;
1886
1887 pud = pud_offset(p4d, addr);
1888 do {
1889 next = pud_addr_end(addr, end);
1890 if (pud_none_or_clear_bad(pud))
1891 continue;
1892 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
1893 if (ret)
1894 return ret;
1895 } while (pud++, addr = next, addr != end);
1896 return 0;
1897}
1898
1899static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
1900 unsigned long addr, unsigned long end,
1901 swp_entry_t entry, struct page *page)
1902{
1903 p4d_t *p4d;
1904 unsigned long next;
1905 int ret;
1906
1907 p4d = p4d_offset(pgd, addr);
1908 do {
1909 next = p4d_addr_end(addr, end);
1910 if (p4d_none_or_clear_bad(p4d))
1911 continue;
1912 ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
1913 if (ret)
1914 return ret;
1915 } while (p4d++, addr = next, addr != end);
1916 return 0;
1917}
1918
1919static int unuse_vma(struct vm_area_struct *vma,
1920 swp_entry_t entry, struct page *page)
1921{
1922 pgd_t *pgd;
1923 unsigned long addr, end, next;
1924 int ret;
1925
1926 if (page_anon_vma(page)) {
1927 addr = page_address_in_vma(page, vma);
1928 if (addr == -EFAULT)
1929 return 0;
1930 else
1931 end = addr + PAGE_SIZE;
1932 } else {
1933 addr = vma->vm_start;
1934 end = vma->vm_end;
1935 }
1936
1937 pgd = pgd_offset(vma->vm_mm, addr);
1938 do {
1939 next = pgd_addr_end(addr, end);
1940 if (pgd_none_or_clear_bad(pgd))
1941 continue;
1942 ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
1943 if (ret)
1944 return ret;
1945 } while (pgd++, addr = next, addr != end);
1946 return 0;
1947}
1948
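/* Replace every pte in @mm that holds @entry with a mapping of @page. */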
1949static int unuse_mm(struct mm_struct *mm,
1950 swp_entry_t entry, struct page *page)
1951{
1952 struct vm_area_struct *vma;
1953 int ret = 0;
1954
1955 if (!down_read_trylock(&mm->mmap_sem)) {
 /*
  * Activate page so shrink_inactive_list is unlikely to unmap
  * its ptes while lock is dropped, so swapoff can make progress.
  */
1960 activate_page(page);
1961 unlock_page(page);
1962 down_read(&mm->mmap_sem);
1963 lock_page(page);
1964 }
1965 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1966 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1967 break;
1968 cond_resched();
1969 }
1970 up_read(&mm->mmap_sem);
1971 return (ret < 0)? ret: 0;
1972}
1973
/*
 * Scan swap_map (or frontswap_map if frontswap parameter is true)
 * from current position to next entry still in use.
 * Recycle to start on reaching the end, returning 0 when empty.
 */
1979static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1980 unsigned int prev, bool frontswap)
1981{
1982 unsigned int max = si->max;
1983 unsigned int i = prev;
1984 unsigned char count;
1985
 /*
  * No need for swap_lock here: we're just looking
  * for whether an entry is in use, not modifying it; false
  * hits are okay, and sys_swapoff() has already prevented new
  * allocations from this area (while holding swap_lock).
  */
1992 for (;;) {
1993 if (++i >= max) {
1994 if (!prev) {
1995 i = 0;
1996 break;
1997 }
 /*
  * No entries in use at top of swap_map,
  * loop back to start and recheck there.
  */
2002 max = prev + 1;
2003 prev = 0;
2004 i = 1;
2005 }
2006 count = READ_ONCE(si->swap_map[i]);
2007 if (count && swap_count(count) != SWAP_MAP_BAD)
2008 if (!frontswap || frontswap_test(si, i))
2009 break;
2010 if ((i % LATENCY_LIMIT) == 0)
2011 cond_resched();
2012 }
2013 return i;
2014}
2015
/*
 * We completely avoid races by reading each swap page in advance,
 * and then search for the process using it.  All the necessary
 * page table adjustments can then be made atomically.
 *
 * if the boolean frontswap is true, only unuse pages_to_unuse pages;
 * pages_to_unuse==0 means all pages; ignored if frontswap is false
 */
2024int try_to_unuse(unsigned int type, bool frontswap,
2025 unsigned long pages_to_unuse)
2026{
2027 struct swap_info_struct *si = swap_info[type];
2028 struct mm_struct *start_mm;
2029 volatile unsigned char *swap_map;
2030
2031
2032
2033
2034 unsigned char swcount;
2035 struct page *page;
2036 swp_entry_t entry;
2037 unsigned int i = 0;
2038 int retval = 0;
2039
 /*
  * When searching mms for an entry, a good strategy is to
  * start at the first mm we freed the previous entry from
  * (though actually we don't notice whether we or coincidence
  * freed the entry).  Initialize this start_mm with a hold.
  *
  * A simpler strategy would be to start at the last mm we
  * freed the previous entry from; but that would take less
  * advantage of mmlist ordering, which clusters forked mms
  * together, child after parent.  If we race with dup_mmap(), we
  * prefer to resolve parent before child, lest we miss entries
  * duplicated after we scanned child: using last mm would invert
  * that.
  */
2054 start_mm = &init_mm;
2055 mmget(&init_mm);
2056
 /*
  * Keep on scanning until all entries have gone.  Usually,
  * one pass through swap_map is enough, but not necessarily:
  * there are races when an instance of an entry might be missed.
  */
2062 while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
2063 if (signal_pending(current)) {
2064 retval = -EINTR;
2065 break;
2066 }
2067
 /*
  * Get a page for the entry, using the existing swap
  * cache page if there is one.  Otherwise, get a clean
  * page and read the swap into it.
  */
2073 swap_map = &si->swap_map[i];
2074 entry = swp_entry(type, i);
2075 page = read_swap_cache_async(entry,
2076 GFP_HIGHUSER_MOVABLE, NULL, 0, false);
2077 if (!page) {
 /*
  * Either swap_duplicate() failed because entry
  * has been freed independently, and will not be
  * reused since sys_swapoff() already disabled
  * allocation from here, or alloc_page() failed.
  */
2084 swcount = *swap_map;
 /*
  * We don't hold lock here, so the swap entry could be
  * SWAP_MAP_BAD (when the cluster is discarding).
  * Instead of failing out, we can just skip the swap
  * entry because swapoff will wait for discarding
  * to finish anyway.
  */
2092 if (!swcount || swcount == SWAP_MAP_BAD)
2093 continue;
2094 retval = -ENOMEM;
2095 break;
2096 }
2097
 /*
  * Don't hold on to start_mm if it looks like exiting.
  */
2101 if (atomic_read(&start_mm->mm_users) == 1) {
2102 mmput(start_mm);
2103 start_mm = &init_mm;
2104 mmget(&init_mm);
2105 }
2106
 /*
  * Wait for and lock page.  When do_swap_page races with
  * try_to_unuse, do_swap_page can handle the fault much
  * faster than try_to_unuse can locate the entry.  This
  * apparently redundant "wait_on_page_locked" lets try_to_unuse
  * defer to do_swap_page in such a case - in some tests,
  * do_swap_page and try_to_unuse repeatedly compete.
  */
2115 wait_on_page_locked(page);
2116 wait_on_page_writeback(page);
2117 lock_page(page);
2118 wait_on_page_writeback(page);
2119
 /*
  * Remove all references to entry.
  */
2123 swcount = *swap_map;
2124 if (swap_count(swcount) == SWAP_MAP_SHMEM) {
2125 retval = shmem_unuse(entry, page);
2126
2127 if (retval < 0)
2128 break;
2129 continue;
2130 }
2131 if (swap_count(swcount) && start_mm != &init_mm)
2132 retval = unuse_mm(start_mm, entry, page);
2133
2134 if (swap_count(*swap_map)) {
2135 int set_start_mm = (*swap_map >= swcount);
2136 struct list_head *p = &start_mm->mmlist;
2137 struct mm_struct *new_start_mm = start_mm;
2138 struct mm_struct *prev_mm = start_mm;
2139 struct mm_struct *mm;
2140
2141 mmget(new_start_mm);
2142 mmget(prev_mm);
2143 spin_lock(&mmlist_lock);
2144 while (swap_count(*swap_map) && !retval &&
2145 (p = p->next) != &start_mm->mmlist) {
2146 mm = list_entry(p, struct mm_struct, mmlist);
2147 if (!mmget_not_zero(mm))
2148 continue;
2149 spin_unlock(&mmlist_lock);
2150 mmput(prev_mm);
2151 prev_mm = mm;
2152
2153 cond_resched();
2154
2155 swcount = *swap_map;
2156 if (!swap_count(swcount))
2157 ;
2158 else if (mm == &init_mm)
2159 set_start_mm = 1;
2160 else
2161 retval = unuse_mm(mm, entry, page);
2162
2163 if (set_start_mm && *swap_map < swcount) {
2164 mmput(new_start_mm);
2165 mmget(mm);
2166 new_start_mm = mm;
2167 set_start_mm = 0;
2168 }
2169 spin_lock(&mmlist_lock);
2170 }
2171 spin_unlock(&mmlist_lock);
2172 mmput(prev_mm);
2173 mmput(start_mm);
2174 start_mm = new_start_mm;
2175 }
2176 if (retval) {
2177 unlock_page(page);
2178 put_page(page);
2179 break;
2180 }
2181
 /*
  * If a reference remains (rare), we would like to leave
  * the page in the swap cache; but try_to_unmap could
  * then re-duplicate the entry once we drop page lock,
  * so we might loop indefinitely; also, that page could
  * not be swapped out to other storage meanwhile.  So:
  * delete from cache even if there's another reference,
  * after ensuring that the data has been saved to disk -
  * since if the reference remains (rarer), it will be
  * read from disk into another page.  Splitting into two
  * pages would be incorrect if swap supported "shared
  * private" pages, but they are handled by tmpfs files.
  *
  * Given how unuse_vma() targets one particular offset
  * in an anon_vma, once the anon_vma has been determined,
  * this splitting happens to be just what is needed to
  * handle where KSM pages have been swapped out: re-reading
  * is unexpectedly slow, but KSM is rare enough that it
  * should not matter.
  */
2201 if (swap_count(*swap_map) &&
2202 PageDirty(page) && PageSwapCache(page)) {
2203 struct writeback_control wbc = {
2204 .sync_mode = WB_SYNC_NONE,
2205 };
2206
2207 swap_writepage(compound_head(page), &wbc);
2208 lock_page(page);
2209 wait_on_page_writeback(page);
2210 }
2211
 /*
  * It is conceivable that a racing task removed this page from
  * swap cache just before we acquired the page lock at the top,
  * or while we dropped it in unuse_mm().  The page might even
  * be back in swap cache on another swap area: that we must not
  * delete, since it may not have been written out to swap yet.
  */
2219 if (PageSwapCache(page) &&
2220 likely(page_private(page) == entry.val) &&
2221 !page_swapped(page))
2222 delete_from_swap_cache(compound_head(page));
2223
 /*
  * So we could skip searching mms once swap count went
  * to 1, we did not mark any present ptes as dirty: must
  * mark page dirty so shrink_page_list will preserve it.
  */
2229 SetPageDirty(page);
2230 unlock_page(page);
2231 put_page(page);
2232
 /*
  * Make sure that we aren't completely killing
  * interactive performance.
  */
2237 cond_resched();
2238 if (frontswap && pages_to_unuse > 0) {
2239 if (!--pages_to_unuse)
2240 break;
2241 }
2242 }
2243
2244 mmput(start_mm);
2245 return retval;
2246}
2247
/*
 * After a successful try_to_unuse, if no swap is now in use, we know
 * we can empty the mmlist.  swap_lock must be held on entry and exit.
 * Note that mmlist_lock nests inside swap_lock, and an mm must be
 * added to the mmlist just after page_duplicate - before would be racy.
 */
2254static void drain_mmlist(void)
2255{
2256 struct list_head *p, *next;
2257 unsigned int type;
2258
2259 for (type = 0; type < nr_swapfiles; type++)
2260 if (swap_info[type]->inuse_pages)
2261 return;
2262 spin_lock(&mmlist_lock);
2263 list_for_each_safe(p, next, &init_mm.mmlist)
2264 list_del_init(p);
2265 spin_unlock(&mmlist_lock);
2266}
2267
/*
 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
 * corresponds to page offset for the specified swap entry.
 * Note that the type of this function is sector_t, but it returns page offset
 * into the bdev, not sector offset.
 */
2274static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
2275{
2276 struct swap_info_struct *sis;
2277 struct swap_extent *start_se;
2278 struct swap_extent *se;
2279 pgoff_t offset;
2280
2281 sis = swap_info[swp_type(entry)];
2282 *bdev = sis->bdev;
2283
2284 offset = swp_offset(entry);
2285 start_se = sis->curr_swap_extent;
2286 se = start_se;
2287
2288 for ( ; ; ) {
2289 if (se->start_page <= offset &&
2290 offset < (se->start_page + se->nr_pages)) {
2291 return se->start_block + (offset - se->start_page);
2292 }
2293 se = list_next_entry(se, list);
2294 sis->curr_swap_extent = se;
2295 BUG_ON(se == start_se);
2296 }
2297}
2298
/*
 * Returns the page offset into bdev for the specified page's swap entry.
 */
2302sector_t map_swap_page(struct page *page, struct block_device **bdev)
2303{
2304 swp_entry_t entry;
2305 entry.val = page_private(page);
2306 return map_swap_entry(entry, bdev);
2307}
2308
/*
 * Free all of a swapdev's extent information
 */
2312static void destroy_swap_extents(struct swap_info_struct *sis)
2313{
2314 while (!list_empty(&sis->first_swap_extent.list)) {
2315 struct swap_extent *se;
2316
2317 se = list_first_entry(&sis->first_swap_extent.list,
2318 struct swap_extent, list);
2319 list_del(&se->list);
2320 kfree(se);
2321 }
2322
2323 if (sis->flags & SWP_FILE) {
2324 struct file *swap_file = sis->swap_file;
2325 struct address_space *mapping = swap_file->f_mapping;
2326
2327 sis->flags &= ~SWP_FILE;
2328 mapping->a_ops->swap_deactivate(swap_file);
2329 }
2330}
2331
/*
 * Add a block range (and the corresponding page range) into this swapdev's
 * extent list.  The extent list is kept sorted in page order.
 *
 * This function rather assumes that it is called in ascending page order.
 */
2338int
2339add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
2340 unsigned long nr_pages, sector_t start_block)
2341{
2342 struct swap_extent *se;
2343 struct swap_extent *new_se;
2344 struct list_head *lh;
2345
2346 if (start_page == 0) {
2347 se = &sis->first_swap_extent;
2348 sis->curr_swap_extent = se;
2349 se->start_page = 0;
2350 se->nr_pages = nr_pages;
2351 se->start_block = start_block;
2352 return 1;
2353 } else {
2354 lh = sis->first_swap_extent.list.prev;
2355 se = list_entry(lh, struct swap_extent, list);
2356 BUG_ON(se->start_page + se->nr_pages != start_page);
2357 if (se->start_block + se->nr_pages == start_block) {
2358
2359 se->nr_pages += nr_pages;
2360 return 0;
2361 }
2362 }
2363
 /*
  * No merge.  Insert a new extent, preserving ordering.
  */
2367 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
2368 if (new_se == NULL)
2369 return -ENOMEM;
2370 new_se->start_page = start_page;
2371 new_se->nr_pages = nr_pages;
2372 new_se->start_block = start_block;
2373
2374 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
2375 return 1;
2376}
2377
/*
 * A `swap extent' is a simple thing which maps a contiguous range of pages
 * onto a contiguous range of disk blocks.  An ordered list of swap extents
 * is built at swapon time and is then used at swap_writepage/swap_readpage
 * time for locating where on disk a page belongs.
 *
 * If the swapfile is an S_ISBLK block device, a single extent is installed.
 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
 * swap files identically.
 *
 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
 * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
 * swapfiles are set up with a block size of PAGE_SIZE.  This affects the
 * maximum swap device size on machines where PAGE_SIZE is larger than the
 * blockdevice's hard blocksize.
 *
 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
 * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
 * requirements, they are simply tossed out - we will never use those blocks
 * for swapping.
 *
 * The amount of disk space which a single swap extent represents varies.
 * Typically it is in the 1-4 megabyte range.  Because the list can contain
 * hundreds of extents, map_swap_entry() caches the most recently used extent
 * in curr_swap_extent and starts new searches from there.
 */
2409static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
2410{
2411 struct file *swap_file = sis->swap_file;
2412 struct address_space *mapping = swap_file->f_mapping;
2413 struct inode *inode = mapping->host;
2414 int ret;
2415
2416 if (S_ISBLK(inode->i_mode)) {
2417 ret = add_swap_extent(sis, 0, sis->max, 0);
2418 *span = sis->pages;
2419 return ret;
2420 }
2421
2422 if (mapping->a_ops->swap_activate) {
2423 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
2424 if (!ret) {
2425 sis->flags |= SWP_FILE;
2426 ret = add_swap_extent(sis, 0, sis->max, 0);
2427 *span = sis->pages;
2428 }
2429 return ret;
2430 }
2431
2432 return generic_swapfile_activate(sis, swap_file, span);
2433}
2434
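/* NUMA node of the block device backing this swap area, if it can be determined. */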
2435static int swap_node(struct swap_info_struct *p)
2436{
2437 struct block_device *bdev;
2438
2439 if (p->bdev)
2440 bdev = p->bdev;
2441 else
2442 bdev = p->swap_file->f_inode->i_sb->s_bdev;
2443
2444 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
2445}
2446
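/*
 * Insert an enabled swap device into the active and available plists at the
 * requested priority; a negative @prio assigns the next lowest auto priority.
 * Called with swap_lock and p->lock held.
 */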
2447static void _enable_swap_info(struct swap_info_struct *p, int prio,
2448 unsigned char *swap_map,
2449 struct swap_cluster_info *cluster_info)
2450{
2451 int i;
2452
2453 if (prio >= 0)
2454 p->prio = prio;
2455 else
2456 p->prio = --least_priority;
2457
 /*
  * the plist prio is negated because plist ordering is
  * low-to-high, while swap ordering is high-to-low
  */
2461 p->list.prio = -p->prio;
2462 for_each_node(i) {
2463 if (p->prio >= 0)
2464 p->avail_lists[i].prio = -p->prio;
2465 else {
2466 if (swap_node(p) == i)
2467 p->avail_lists[i].prio = 1;
2468 else
2469 p->avail_lists[i].prio = -p->prio;
2470 }
2471 }
2472 p->swap_map = swap_map;
2473 p->cluster_info = cluster_info;
2474 p->flags |= SWP_WRITEOK;
2475 atomic_long_add(p->pages, &nr_swap_pages);
2476 total_swap_pages += p->pages;
2477
2478 assert_spin_locked(&swap_lock);
2479
 /*
  * both lists are plists, and thus priority ordered.
  * swap_active_head needs to be priority ordered for swapoff(),
  * which on removal of any swap_info_struct with an auto-assigned
  * (i.e. negative) priority increments the auto-assigned priority
  * of any lower-priority swap_info_structs.
  * swap_avail_head needs to be priority ordered for get_swap_page(),
  * which allocates swap pages from the highest available priority
  * swap_info_struct.
  */
2489 plist_add(&p->list, &swap_active_head);
2490 add_to_avail_list(p);
2491}
2492
2493static void enable_swap_info(struct swap_info_struct *p, int prio,
2494 unsigned char *swap_map,
2495 struct swap_cluster_info *cluster_info,
2496 unsigned long *frontswap_map)
2497{
2498 frontswap_init(p->type, frontswap_map);
2499 spin_lock(&swap_lock);
2500 spin_lock(&p->lock);
2501 _enable_swap_info(p, prio, swap_map, cluster_info);
2502 spin_unlock(&p->lock);
2503 spin_unlock(&swap_lock);
2504}
2505
2506static void reinsert_swap_info(struct swap_info_struct *p)
2507{
2508 spin_lock(&swap_lock);
2509 spin_lock(&p->lock);
2510 _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
2511 spin_unlock(&p->lock);
2512 spin_unlock(&swap_lock);
2513}
2514
2515bool has_usable_swap(void)
2516{
2517 bool ret = true;
2518
2519 spin_lock(&swap_lock);
2520 if (plist_head_empty(&swap_active_head))
2521 ret = false;
2522 spin_unlock(&swap_lock);
2523 return ret;
2524}
2525
2526SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2527{
2528 struct swap_info_struct *p = NULL;
2529 unsigned char *swap_map;
2530 struct swap_cluster_info *cluster_info;
2531 unsigned long *frontswap_map;
2532 struct file *swap_file, *victim;
2533 struct address_space *mapping;
2534 struct inode *inode;
2535 struct filename *pathname;
2536 int err, found = 0;
2537 unsigned int old_block_size;
2538
2539 if (!capable(CAP_SYS_ADMIN))
2540 return -EPERM;
2541
2542 BUG_ON(!current->mm);
2543
2544 pathname = getname(specialfile);
2545 if (IS_ERR(pathname))
2546 return PTR_ERR(pathname);
2547
2548 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
2549 err = PTR_ERR(victim);
2550 if (IS_ERR(victim))
2551 goto out;
2552
2553 mapping = victim->f_mapping;
2554 spin_lock(&swap_lock);
2555 plist_for_each_entry(p, &swap_active_head, list) {
2556 if (p->flags & SWP_WRITEOK) {
2557 if (p->swap_file->f_mapping == mapping) {
2558 found = 1;
2559 break;
2560 }
2561 }
2562 }
2563 if (!found) {
2564 err = -EINVAL;
2565 spin_unlock(&swap_lock);
2566 goto out_dput;
2567 }
2568 if (!security_vm_enough_memory_mm(current->mm, p->pages))
2569 vm_unacct_memory(p->pages);
2570 else {
2571 err = -ENOMEM;
2572 spin_unlock(&swap_lock);
2573 goto out_dput;
2574 }
2575 del_from_avail_list(p);
2576 spin_lock(&p->lock);
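	/*
	 * If this area had an auto-assigned (negative) priority, bump the
	 * priority of every remaining auto-assigned area so that range
	 * stays contiguous; per-node entries pinned at prio 1 are left alone.
	 */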
2577 if (p->prio < 0) {
2578 struct swap_info_struct *si = p;
2579 int nid;
2580
2581 plist_for_each_entry_continue(si, &swap_active_head, list) {
2582 si->prio++;
2583 si->list.prio--;
2584 for_each_node(nid) {
2585 if (si->avail_lists[nid].prio != 1)
2586 si->avail_lists[nid].prio--;
2587 }
2588 }
2589 least_priority++;
2590 }
2591 plist_del(&p->list, &swap_active_head);
2592 atomic_long_sub(p->pages, &nr_swap_pages);
2593 total_swap_pages -= p->pages;
2594 p->flags &= ~SWP_WRITEOK;
2595 spin_unlock(&p->lock);
2596 spin_unlock(&swap_lock);
2597
2598 disable_swap_slots_cache_lock();
2599
2600 set_current_oom_origin();
	err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
2602 clear_current_oom_origin();
2603
2604 if (err) {
		/* re-insert swap space back into swap_list */
2606 reinsert_swap_info(p);
2607 reenable_swap_slots_cache_unlock();
2608 goto out_dput;
2609 }
2610
2611 reenable_swap_slots_cache_unlock();
2612
2613 flush_work(&p->discard_work);
2614
2615 destroy_swap_extents(p);
2616 if (p->flags & SWP_CONTINUED)
2617 free_swap_count_continuations(p);
2618
2619 if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
2620 atomic_dec(&nr_rotate_swap);
2621
2622 mutex_lock(&swapon_mutex);
2623 spin_lock(&swap_lock);
2624 spin_lock(&p->lock);
2625 drain_mmlist();

	/* wait for anyone still in scan_swap_map */
	p->highest_bit = 0;		/* cuts scans short */
2629 while (p->flags >= SWP_SCANNING) {
2630 spin_unlock(&p->lock);
2631 spin_unlock(&swap_lock);
2632 schedule_timeout_uninterruptible(1);
2633 spin_lock(&swap_lock);
2634 spin_lock(&p->lock);
2635 }
2636
2637 swap_file = p->swap_file;
2638 old_block_size = p->old_block_size;
2639 p->swap_file = NULL;
2640 p->max = 0;
2641 swap_map = p->swap_map;
2642 p->swap_map = NULL;
2643 cluster_info = p->cluster_info;
2644 p->cluster_info = NULL;
2645 frontswap_map = frontswap_map_get(p);
2646 spin_unlock(&p->lock);
2647 spin_unlock(&swap_lock);
2648 frontswap_invalidate_area(p->type);
2649 frontswap_map_set(p, NULL);
2650 mutex_unlock(&swapon_mutex);
2651 free_percpu(p->percpu_cluster);
2652 p->percpu_cluster = NULL;
2653 vfree(swap_map);
2654 kvfree(cluster_info);
2655 kvfree(frontswap_map);
2656
2657 swap_cgroup_swapoff(p->type);
2658 exit_swap_address_space(p->type);
2659
2660 inode = mapping->host;
2661 if (S_ISBLK(inode->i_mode)) {
2662 struct block_device *bdev = I_BDEV(inode);
2663 set_blocksize(bdev, old_block_size);
2664 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2665 } else {
2666 inode_lock(inode);
2667 inode->i_flags &= ~S_SWAPFILE;
2668 inode_unlock(inode);
2669 }
2670 filp_close(swap_file, NULL);
2671

	/*
	 * Clear the SWP_USED flag after all resources are freed so that swapon
	 * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
	 * not hold p->lock after we cleared its SWP_USED flag.
	 */
2677 spin_lock(&swap_lock);
2678 p->flags = 0;
2679 spin_unlock(&swap_lock);
2680
2681 err = 0;
2682 atomic_inc(&proc_poll_event);
2683 wake_up_interruptible(&proc_poll_wait);
2684
2685out_dput:
2686 filp_close(victim, NULL);
2687out:
2688 putname(pathname);
2689 return err;
2690}
2691
2692#ifdef CONFIG_PROC_FS
2693static unsigned swaps_poll(struct file *file, poll_table *wait)
2694{
2695 struct seq_file *seq = file->private_data;
2696
2697 poll_wait(file, &proc_poll_wait, wait);
2698
2699 if (seq->poll_event != atomic_read(&proc_poll_event)) {
2700 seq->poll_event = atomic_read(&proc_poll_event);
2701 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
2702 }
2703
2704 return POLLIN | POLLRDNORM;
2705}
2706
/* iterator for /proc/swaps */
2708static void *swap_start(struct seq_file *swap, loff_t *pos)
2709{
2710 struct swap_info_struct *si;
2711 int type;
2712 loff_t l = *pos;
2713
2714 mutex_lock(&swapon_mutex);
2715
2716 if (!l)
2717 return SEQ_START_TOKEN;
2718
2719 for (type = 0; type < nr_swapfiles; type++) {
		smp_rmb();	/* read nr_swapfiles before swap_info[type] */
2721 si = swap_info[type];
2722 if (!(si->flags & SWP_USED) || !si->swap_map)
2723 continue;
2724 if (!--l)
2725 return si;
2726 }
2727
2728 return NULL;
2729}
2730
2731static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
2732{
2733 struct swap_info_struct *si = v;
2734 int type;
2735
2736 if (v == SEQ_START_TOKEN)
2737 type = 0;
2738 else
2739 type = si->type + 1;
2740
2741 for (; type < nr_swapfiles; type++) {
		smp_rmb();	/* read nr_swapfiles before swap_info[type] */
2743 si = swap_info[type];
2744 if (!(si->flags & SWP_USED) || !si->swap_map)
2745 continue;
2746 ++*pos;
2747 return si;
2748 }
2749
2750 return NULL;
2751}
2752
2753static void swap_stop(struct seq_file *swap, void *v)
2754{
2755 mutex_unlock(&swapon_mutex);
2756}
2757
2758static int swap_show(struct seq_file *swap, void *v)
2759{
2760 struct swap_info_struct *si = v;
2761 struct file *file;
2762 int len;
2763
2764 if (si == SEQ_START_TOKEN) {
		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
2766 return 0;
2767 }
2768
2769 file = si->swap_file;
2770 len = seq_file_path(swap, file, " \t\n\\");
2771 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
2772 len < 40 ? 40 - len : 1, " ",
2773 S_ISBLK(file_inode(file)->i_mode) ?
2774 "partition" : "file\t",
2775 si->pages << (PAGE_SHIFT - 10),
2776 si->inuse_pages << (PAGE_SHIFT - 10),
2777 si->prio);
2778 return 0;
2779}
2780
2781static const struct seq_operations swaps_op = {
2782 .start = swap_start,
2783 .next = swap_next,
2784 .stop = swap_stop,
2785 .show = swap_show
2786};
2787
2788static int swaps_open(struct inode *inode, struct file *file)
2789{
2790 struct seq_file *seq;
2791 int ret;
2792
2793 ret = seq_open(file, &swaps_op);
2794 if (ret)
2795 return ret;
2796
2797 seq = file->private_data;
2798 seq->poll_event = atomic_read(&proc_poll_event);
2799 return 0;
2800}
2801
2802static const struct file_operations proc_swaps_operations = {
2803 .open = swaps_open,
2804 .read = seq_read,
2805 .llseek = seq_lseek,
2806 .release = seq_release,
2807 .poll = swaps_poll,
2808};
2809
2810static int __init procswaps_init(void)
2811{
2812 proc_create("swaps", 0, NULL, &proc_swaps_operations);
2813 return 0;
2814}
2815__initcall(procswaps_init);
2816#endif
2817
2818#ifdef MAX_SWAPFILES_CHECK
2819static int __init max_swapfiles_check(void)
2820{
2821 MAX_SWAPFILES_CHECK();
2822 return 0;
2823}
2824late_initcall(max_swapfiles_check);
2825#endif
2826
2827static struct swap_info_struct *alloc_swap_info(void)
2828{
2829 struct swap_info_struct *p;
2830 unsigned int type;
2831 int i;
2832
2833 p = kzalloc(sizeof(*p), GFP_KERNEL);
2834 if (!p)
2835 return ERR_PTR(-ENOMEM);
2836
2837 spin_lock(&swap_lock);
2838 for (type = 0; type < nr_swapfiles; type++) {
2839 if (!(swap_info[type]->flags & SWP_USED))
2840 break;
2841 }
2842 if (type >= MAX_SWAPFILES) {
2843 spin_unlock(&swap_lock);
2844 kfree(p);
2845 return ERR_PTR(-EPERM);
2846 }
2847 if (type >= nr_swapfiles) {
2848 p->type = type;
2849 swap_info[type] = p;
		/*
		 * Write swap_info[type] before nr_swapfiles, in case a
		 * racing procfs swap_start() or swap_next() is reading them.
		 * (We never shrink nr_swapfiles, we never free this entry.)
		 */
2855 smp_wmb();
2856 nr_swapfiles++;
2857 } else {
2858 kfree(p);
2859 p = swap_info[type];
		/*
		 * Do not memset this entry: a racing procfs swap_next()
		 * would be relying on p->type to remain valid.
		 */
2864 }
2865 INIT_LIST_HEAD(&p->first_swap_extent.list);
2866 plist_node_init(&p->list, 0);
2867 for_each_node(i)
2868 plist_node_init(&p->avail_lists[i], 0);
2869 p->flags = SWP_USED;
2870 spin_unlock(&swap_lock);
2871 spin_lock_init(&p->lock);
2872 spin_lock_init(&p->cont_lock);
2873
2874 return p;
2875}
2876
2877static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
2878{
2879 int error;
2880
2881 if (S_ISBLK(inode->i_mode)) {
2882 p->bdev = bdgrab(I_BDEV(inode));
2883 error = blkdev_get(p->bdev,
2884 FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
2885 if (error < 0) {
2886 p->bdev = NULL;
2887 return error;
2888 }
2889 p->old_block_size = block_size(p->bdev);
2890 error = set_blocksize(p->bdev, PAGE_SIZE);
2891 if (error < 0)
2892 return error;
2893 p->flags |= SWP_BLKDEV;
2894 } else if (S_ISREG(inode->i_mode)) {
2895 p->bdev = inode->i_sb->s_bdev;
2896 inode_lock(inode);
2897 if (IS_SWAPFILE(inode))
2898 return -EBUSY;
2899 } else
2900 return -EINVAL;
2901
2902 return 0;
2903}
2904
2905static unsigned long read_swap_header(struct swap_info_struct *p,
2906 union swap_header *swap_header,
2907 struct inode *inode)
2908{
2909 int i;
2910 unsigned long maxpages;
2911 unsigned long swapfilepages;
2912 unsigned long last_page;
2913
2914 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
2915 pr_err("Unable to find swap-space signature\n");
2916 return 0;
2917 }
2918
	/* handle a swap header written with the opposite endianness */
2920 if (swab32(swap_header->info.version) == 1) {
2921 swab32s(&swap_header->info.version);
2922 swab32s(&swap_header->info.last_page);
2923 swab32s(&swap_header->info.nr_badpages);
2924 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2925 return 0;
2926 for (i = 0; i < swap_header->info.nr_badpages; i++)
2927 swab32s(&swap_header->info.badpages[i]);
2928 }

	/* Check the swap header's sub-version */
2930 if (swap_header->info.version != 1) {
2931 pr_warn("Unable to handle swap header version %d\n",
2932 swap_header->info.version);
2933 return 0;
2934 }
2935
2936 p->lowest_bit = 1;
2937 p->cluster_next = 1;
2938 p->cluster_nr = 0;
2939
	/*
	 * Find out how many pages are allowed for a single swap
	 * device. There are two limiting factors: 1) the number
	 * of bits for the swap offset in the swp_entry_t type, and
	 * 2) the number of bits in the swap pte as defined by the
	 * different architectures. In order to find the
	 * largest possible bit mask, a swap entry with swap type 0
	 * and swap offset ~0UL is created, encoded to a swap pte,
	 * decoded to a swp_entry_t again, and finally the swap
	 * offset is extracted. This will mask all the bits from
	 * the initial ~0UL mask that can't be encoded in either
	 * the swp_entry_t or the architecture definition of a
	 * swap pte.
	 */
2954 maxpages = swp_offset(pte_to_swp_entry(
2955 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
2956 last_page = swap_header->info.last_page;
2957 if (last_page > maxpages) {
2958 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
2959 maxpages << (PAGE_SHIFT - 10),
2960 last_page << (PAGE_SHIFT - 10));
2961 }
2962 if (maxpages > last_page) {
2963 maxpages = last_page + 1;
2964
2965 if ((unsigned int)maxpages == 0)
2966 maxpages = UINT_MAX;
2967 }
2968 p->highest_bit = maxpages - 1;
2969
2970 if (!maxpages)
2971 return 0;
2972 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
2973 if (swapfilepages && maxpages > swapfilepages) {
2974 pr_warn("Swap area shorter than signature indicates\n");
2975 return 0;
2976 }
2977 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
2978 return 0;
2979 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2980 return 0;
2981
2982 return maxpages;
2983}
2984
2985#define SWAP_CLUSTER_INFO_COLS \
2986 DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
2987#define SWAP_CLUSTER_SPACE_COLS \
2988 DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
2989#define SWAP_CLUSTER_COLS \
2990 max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
2991
2992static int setup_swap_map_and_extents(struct swap_info_struct *p,
2993 union swap_header *swap_header,
2994 unsigned char *swap_map,
2995 struct swap_cluster_info *cluster_info,
2996 unsigned long maxpages,
2997 sector_t *span)
2998{
2999 unsigned int j, k;
3000 unsigned int nr_good_pages;
3001 int nr_extents;
3002 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
3003 unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
3004 unsigned long i, idx;
3005
	nr_good_pages = maxpages - 1;	/* omit header page */
3007
3008 cluster_list_init(&p->free_clusters);
3009 cluster_list_init(&p->discard_clusters);
3010
3011 for (i = 0; i < swap_header->info.nr_badpages; i++) {
3012 unsigned int page_nr = swap_header->info.badpages[i];
3013 if (page_nr == 0 || page_nr > swap_header->info.last_page)
3014 return -EINVAL;
3015 if (page_nr < maxpages) {
3016 swap_map[page_nr] = SWAP_MAP_BAD;
3017 nr_good_pages--;
3018
			/*
			 * The cluster free list isn't set up yet, so this
			 * only bumps the cluster's page count.
			 */
3022 inc_cluster_info_page(p, cluster_info, page_nr);
3023 }
3024 }
3025
	/* account the tail of the final, partial cluster as in use */
3027 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
3028 inc_cluster_info_page(p, cluster_info, i);
3029
3030 if (nr_good_pages) {
3031 swap_map[0] = SWAP_MAP_BAD;
		/*
		 * Page 0 holds the swap header; mark it bad so it is never
		 * handed out.  No cluster list operation is needed yet.
		 */
3036 inc_cluster_info_page(p, cluster_info, 0);
3037 p->max = maxpages;
3038 p->pages = nr_good_pages;
3039 nr_extents = setup_swap_extents(p, span);
3040 if (nr_extents < 0)
3041 return nr_extents;
3042 nr_good_pages = p->pages;
3043 }
3044 if (!nr_good_pages) {
3045 pr_warn("Empty swap-file\n");
3046 return -EINVAL;
3047 }
3048
3049 if (!cluster_info)
3050 return nr_extents;
3051
	/*
	 * Build the free cluster list in column-interleaved order to reduce
	 * false cache line sharing between cluster_info entries used by
	 * different swap address spaces.
	 */
3057 for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
3058 j = (k + col) % SWAP_CLUSTER_COLS;
3059 for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
3060 idx = i * SWAP_CLUSTER_COLS + j;
3061 if (idx >= nr_clusters)
3062 continue;
3063 if (cluster_count(&cluster_info[idx]))
3064 continue;
3065 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
3066 cluster_list_add_tail(&p->free_clusters, cluster_info,
3067 idx);
3068 }
3069 }
3070 return nr_extents;
3071}
3072
/*
 * Helper to sys_swapon determining if a given swap
 * backing device queue supports DISCARD operations.
 */
3077static bool swap_discardable(struct swap_info_struct *si)
3078{
3079 struct request_queue *q = bdev_get_queue(si->bdev);
3080
3081 if (!q || !blk_queue_discard(q))
3082 return false;
3083
3084 return true;
3085}
3086
3087SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
3088{
3089 struct swap_info_struct *p;
3090 struct filename *name;
3091 struct file *swap_file = NULL;
3092 struct address_space *mapping;
3093 int prio;
3094 int error;
3095 union swap_header *swap_header;
3096 int nr_extents;
3097 sector_t span;
3098 unsigned long maxpages;
3099 unsigned char *swap_map = NULL;
3100 struct swap_cluster_info *cluster_info = NULL;
3101 unsigned long *frontswap_map = NULL;
3102 struct page *page = NULL;
3103 struct inode *inode = NULL;
3104
3105 if (swap_flags & ~SWAP_FLAGS_VALID)
3106 return -EINVAL;
3107
3108 if (!capable(CAP_SYS_ADMIN))
3109 return -EPERM;
3110
3111 if (!swap_avail_heads)
3112 return -ENOMEM;
3113
3114 p = alloc_swap_info();
3115 if (IS_ERR(p))
3116 return PTR_ERR(p);
3117
3118 INIT_WORK(&p->discard_work, swap_discard_work);
3119
3120 name = getname(specialfile);
3121 if (IS_ERR(name)) {
3122 error = PTR_ERR(name);
3123 name = NULL;
3124 goto bad_swap;
3125 }
3126 swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
3127 if (IS_ERR(swap_file)) {
3128 error = PTR_ERR(swap_file);
3129 swap_file = NULL;
3130 goto bad_swap;
3131 }
3132
3133 p->swap_file = swap_file;
3134 mapping = swap_file->f_mapping;
3135 inode = mapping->host;
3136
	/* claim_swapfile() takes inode_lock(inode) for regular files */
3138 error = claim_swapfile(p, inode);
3139 if (unlikely(error))
3140 goto bad_swap;
3141
	/*
	 * Read the swap header.
	 */
3145 if (!mapping->a_ops->readpage) {
3146 error = -EINVAL;
3147 goto bad_swap;
3148 }
3149 page = read_mapping_page(mapping, 0, swap_file);
3150 if (IS_ERR(page)) {
3151 error = PTR_ERR(page);
3152 goto bad_swap;
3153 }
3154 swap_header = kmap(page);
3155
3156 maxpages = read_swap_header(p, swap_header, inode);
3157 if (unlikely(!maxpages)) {
3158 error = -EINVAL;
3159 goto bad_swap;
3160 }
3161
	/* OK, set up the swap map and apply the bad block list */
3163 swap_map = vzalloc(maxpages);
3164 if (!swap_map) {
3165 error = -ENOMEM;
3166 goto bad_swap;
3167 }
3168
3169 if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
3170 p->flags |= SWP_STABLE_WRITES;
3171
3172 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
3173 int cpu;
3174 unsigned long ci, nr_cluster;
3175
3176 p->flags |= SWP_SOLIDSTATE;
3177
		/*
		 * Select a random position to start from, to help with SSD
		 * wear levelling.
		 */
3181 p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
3182 nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
3183
3184 cluster_info = kvzalloc(nr_cluster * sizeof(*cluster_info),
3185 GFP_KERNEL);
3186 if (!cluster_info) {
3187 error = -ENOMEM;
3188 goto bad_swap;
3189 }
3190
3191 for (ci = 0; ci < nr_cluster; ci++)
3192 spin_lock_init(&((cluster_info + ci)->lock));
3193
3194 p->percpu_cluster = alloc_percpu(struct percpu_cluster);
3195 if (!p->percpu_cluster) {
3196 error = -ENOMEM;
3197 goto bad_swap;
3198 }
3199 for_each_possible_cpu(cpu) {
3200 struct percpu_cluster *cluster;
3201 cluster = per_cpu_ptr(p->percpu_cluster, cpu);
3202 cluster_set_null(&cluster->index);
3203 }
3204 } else
3205 atomic_inc(&nr_rotate_swap);
3206
3207 error = swap_cgroup_swapon(p->type, maxpages);
3208 if (error)
3209 goto bad_swap;
3210
3211 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
3212 cluster_info, maxpages, &span);
3213 if (unlikely(nr_extents < 0)) {
3214 error = nr_extents;
3215 goto bad_swap;
3216 }
3217
3218 if (IS_ENABLED(CONFIG_FRONTSWAP))
3219 frontswap_map = kvzalloc(BITS_TO_LONGS(maxpages) * sizeof(long),
3220 GFP_KERNEL);
3221
	if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
		/*
		 * When discard is enabled for swap with no particular
		 * policy flagged, we set all swap discard flags here in
		 * order to sustain backward compatibility with older
		 * swapon(8) releases.
		 */
3229 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
3230 SWP_PAGE_DISCARD);
3231
		/*
		 * By flagging sys_swapon, a sysadmin can tell us to
		 * either do single-time area discards only, or to just
		 * perform discards for released swap page-clusters.
		 * Now it's time to adjust the p->flags accordingly.
		 */
3238 if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
3239 p->flags &= ~SWP_PAGE_DISCARD;
3240 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
3241 p->flags &= ~SWP_AREA_DISCARD;
3242
		/* issue a swapon-time discard if it's still required */
3244 if (p->flags & SWP_AREA_DISCARD) {
3245 int err = discard_swap(p);
3246 if (unlikely(err))
3247 pr_err("swapon: discard_swap(%p): %d\n",
3248 p, err);
3249 }
3250 }
3251
3252 error = init_swap_address_space(p->type, maxpages);
3253 if (error)
3254 goto bad_swap;
3255
3256 mutex_lock(&swapon_mutex);
3257 prio = -1;
3258 if (swap_flags & SWAP_FLAG_PREFER)
3259 prio =
3260 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
3261 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
3262
3263 pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
3264 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
3265 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
3266 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
3267 (p->flags & SWP_DISCARDABLE) ? "D" : "",
3268 (p->flags & SWP_AREA_DISCARD) ? "s" : "",
3269 (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
3270 (frontswap_map) ? "FS" : "");
3271
3272 mutex_unlock(&swapon_mutex);
3273 atomic_inc(&proc_poll_event);
3274 wake_up_interruptible(&proc_poll_wait);
3275
3276 if (S_ISREG(inode->i_mode))
3277 inode->i_flags |= S_SWAPFILE;
3278 error = 0;
3279 goto out;
3280bad_swap:
3281 free_percpu(p->percpu_cluster);
3282 p->percpu_cluster = NULL;
3283 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
3284 set_blocksize(p->bdev, p->old_block_size);
3285 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3286 }
3287 destroy_swap_extents(p);
3288 swap_cgroup_swapoff(p->type);
3289 spin_lock(&swap_lock);
3290 p->swap_file = NULL;
3291 p->flags = 0;
3292 spin_unlock(&swap_lock);
3293 vfree(swap_map);
3294 kvfree(cluster_info);
3295 kvfree(frontswap_map);
3296 if (swap_file) {
3297 if (inode && S_ISREG(inode->i_mode)) {
3298 inode_unlock(inode);
3299 inode = NULL;
3300 }
3301 filp_close(swap_file, NULL);
3302 }
3303out:
3304 if (page && !IS_ERR(page)) {
3305 kunmap(page);
3306 put_page(page);
3307 }
3308 if (name)
3309 putname(name);
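	/* drop the inode lock taken by claim_swapfile() for regular files */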
3310 if (inode && S_ISREG(inode->i_mode))
3311 inode_unlock(inode);
3312 if (!error)
3313 enable_swap_slots_cache();
3314 return error;
3315}
3316
3317void si_swapinfo(struct sysinfo *val)
3318{
3319 unsigned int type;
3320 unsigned long nr_to_be_unused = 0;
3321
3322 spin_lock(&swap_lock);
3323 for (type = 0; type < nr_swapfiles; type++) {
3324 struct swap_info_struct *si = swap_info[type];
3325
3326 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
3327 nr_to_be_unused += si->inuse_pages;
3328 }
3329 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
3330 val->totalswap = total_swap_pages + nr_to_be_unused;
3331 spin_unlock(&swap_lock);
3332}
3333
/*
 * Verify that a swap entry is valid and increment its swap map count.
 *
 * Returns error code in following case.
 * - success -> 0
 * - swp_entry is invalid -> EINVAL
 * - swp_entry is migration entry -> EINVAL
 * - swap-cache reference is requested but there is already one. -> EEXIST
 * - swap-cache reference is requested but the entry is not used. -> ENOENT
 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
 */
3345static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
3346{
3347 struct swap_info_struct *p;
3348 struct swap_cluster_info *ci;
3349 unsigned long offset, type;
3350 unsigned char count;
3351 unsigned char has_cache;
3352 int err = -EINVAL;
3353
3354 if (non_swap_entry(entry))
3355 goto out;
3356
3357 type = swp_type(entry);
3358 if (type >= nr_swapfiles)
3359 goto bad_file;
3360 p = swap_info[type];
3361 offset = swp_offset(entry);
3362 if (unlikely(offset >= p->max))
3363 goto out;
3364
3365 ci = lock_cluster_or_swap_info(p, offset);
3366
3367 count = p->swap_map[offset];

	/*
	 * swapin_readahead() doesn't check if a swap entry is valid, so the
	 * swap entry could be SWAP_MAP_BAD.  Check here with the lock held.
	 */
3373 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
3374 err = -ENOENT;
3375 goto unlock_out;
3376 }
3377
3378 has_cache = count & SWAP_HAS_CACHE;
3379 count &= ~SWAP_HAS_CACHE;
3380 err = 0;
3381
3382 if (usage == SWAP_HAS_CACHE) {

		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
3385 if (!has_cache && count)
3386 has_cache = SWAP_HAS_CACHE;
3387 else if (has_cache)
3388 err = -EEXIST;
3389 else
3390 err = -ENOENT;
3391
3392 } else if (count || has_cache) {
3393
3394 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
3395 count += usage;
3396 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
3397 err = -EINVAL;
3398 else if (swap_count_continued(p, offset, count))
3399 count = COUNT_CONTINUED;
3400 else
3401 err = -ENOMEM;
3402 } else
		err = -ENOENT;			/* unused swap entry */
3404
3405 p->swap_map[offset] = count | has_cache;
3406
3407unlock_out:
3408 unlock_cluster_or_swap_info(p, ci);
3409out:
3410 return err;
3411
3412bad_file:
3413 pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
3414 goto out;
3415}
3416
/*
 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
 * (in which case its reference count is never incremented).
 */
3421void swap_shmem_alloc(swp_entry_t entry)
3422{
3423 __swap_duplicate(entry, SWAP_MAP_SHMEM);
3424}
3425
/*
 * Increase reference count of swap entry by 1.
 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
 * but could not be atomically allocated.  Returns 0, just as if it succeeded,
 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
 * might occur if a page table entry has got corrupted.
 */
3433int swap_duplicate(swp_entry_t entry)
3434{
3435 int err = 0;
3436
3437 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
3438 err = add_swap_count_continuation(entry, GFP_ATOMIC);
3439 return err;
3440}
3441
/*
 * @entry: swap entry for which we allocate swap cache.
 *
 * Called when allocating swap cache for an existing swap entry.
 * This can return error codes.  Returns 0 at success.
 * -EEXIST means there is already a swap cache for the entry.
 * Note: return code is different from swap_duplicate().
 */
3450int swapcache_prepare(swp_entry_t entry)
3451{
3452 return __swap_duplicate(entry, SWAP_HAS_CACHE);
3453}
3454
3455struct swap_info_struct *page_swap_info(struct page *page)
3456{
3457 swp_entry_t swap = { .val = page_private(page) };
3458 return swap_info[swp_type(swap)];
3459}
3460
/*
 * out-of-line __page_file_ methods to avoid include hell.
 */
3464struct address_space *__page_file_mapping(struct page *page)
3465{
3466 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
3467 return page_swap_info(page)->swap_file->f_mapping;
3468}
3469EXPORT_SYMBOL_GPL(__page_file_mapping);
3470
3471pgoff_t __page_file_index(struct page *page)
3472{
3473 swp_entry_t swap = { .val = page_private(page) };
3474 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
3475 return swp_offset(swap);
3476}
3477EXPORT_SYMBOL_GPL(__page_file_index);
3478
/*
 * add_swap_count_continuation - called when a swap count is duplicated
 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
 * page of the original vmalloc'ed swap_map, to hold the continuation count
 * (for that entry and for its neighbouring PAGE_SIZE swap entries).
 *
 * These continuation pages are seldom referenced: the common paths all work
 * on the original swap_map, only referring to a continuation page when the
 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
 */
3494int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
3495{
3496 struct swap_info_struct *si;
3497 struct swap_cluster_info *ci;
3498 struct page *head;
3499 struct page *page;
3500 struct page *list_page;
3501 pgoff_t offset;
3502 unsigned char count;
3503
	/*
	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
	 */
3508 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
3509
3510 si = swap_info_get(entry);
3511 if (!si) {
		/*
		 * An acceptable race has occurred since the failing
		 * __swap_duplicate(): the swap device may have been
		 * swapped off entirely, and the entry is no longer in use.
		 */
3517 goto outer;
3518 }
3519
3520 offset = swp_offset(entry);
3521
3522 ci = lock_cluster(si, offset);
3523
3524 count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
3525
3526 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
		/*
		 * The higher the swap count, the more likely it is that tasks
		 * will race to add swap count continuation: we need to avoid
		 * over-provisioning.
		 */
3532 goto out;
3533 }
3534
3535 if (!page) {
3536 unlock_cluster(ci);
3537 spin_unlock(&si->lock);
3538 return -ENOMEM;
3539 }
3540
	/*
	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
	 * no architecture is using highmem pages for kernel page tables: so it
	 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
	 */
3546 head = vmalloc_to_page(si->swap_map + offset);
3547 offset &= ~PAGE_MASK;
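	/* offset is now the index of this entry within its swap_map page */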
3548
3549 spin_lock(&si->cont_lock);
3550
	/*
	 * Page allocation does not initialize the page's lru field,
	 * but it does always reset its private field.
	 */
3554 if (!page_private(head)) {
3555 BUG_ON(count & COUNT_CONTINUED);
3556 INIT_LIST_HEAD(&head->lru);
3557 set_page_private(head, SWP_CONTINUED);
3558 si->flags |= SWP_CONTINUED;
3559 }
3560
3561 list_for_each_entry(list_page, &head->lru, lru) {
3562 unsigned char *map;
3563
		/*
		 * If the previous map said no continuation, but we've found
		 * a continuation page, free our allocation and use this one.
		 */
3568 if (!(count & COUNT_CONTINUED))
3569 goto out_unlock_cont;
3570
3571 map = kmap_atomic(list_page) + offset;
3572 count = *map;
3573 kunmap_atomic(map);
3574
		/*
		 * If this continuation count now has some space in it,
		 * free our allocation and use this one.
		 */
3579 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
3580 goto out_unlock_cont;
3581 }
3582
3583 list_add_tail(&page->lru, &head->lru);
3584 page = NULL;
3585out_unlock_cont:
3586 spin_unlock(&si->cont_lock);
3587out:
3588 unlock_cluster(ci);
3589 spin_unlock(&si->lock);
3590outer:
3591 if (page)
3592 __free_page(page);
3593 return 0;
3594}
3595
/*
 * swap_count_continued - when the original swap_map count is incremented
 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
 * into, carry if so, or else fail until a new continuation page is allocated;
 * when the original swap_map count is decremented from 0 with continuation,
 * borrow from the continuation and report whether it still holds more.
 * Called while __swap_duplicate() or swap_entry_free() holds the swap or
 * cluster lock.
 */
3605static bool swap_count_continued(struct swap_info_struct *si,
3606 pgoff_t offset, unsigned char count)
3607{
3608 struct page *head;
3609 struct page *page;
3610 unsigned char *map;
3611 bool ret;
3612
3613 head = vmalloc_to_page(si->swap_map + offset);
3614 if (page_private(head) != SWP_CONTINUED) {
3615 BUG_ON(count & COUNT_CONTINUED);
3616 return false;
3617 }
3618
3619 spin_lock(&si->cont_lock);
3620 offset &= ~PAGE_MASK;
3621 page = list_entry(head->lru.next, struct page, lru);
3622 map = kmap_atomic(page) + offset;
3623
	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
		goto init_map;		/* jump over SWAP_CONT_MAX checks */
3626
	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
		/*
		 * Think of how you add 1 to 999: walk forward while each
		 * continuation "digit" is already at its maximum.
		 */
3631 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
3632 kunmap_atomic(map);
3633 page = list_entry(page->lru.next, struct page, lru);
3634 BUG_ON(page == head);
3635 map = kmap_atomic(page) + offset;
3636 }
3637 if (*map == SWAP_CONT_MAX) {
3638 kunmap_atomic(map);
3639 page = list_entry(page->lru.next, struct page, lru);
3640 if (page == head) {
3641 ret = false;
3642 goto out;
3643 }
3644 map = kmap_atomic(page) + offset;
3645init_map: *map = 0;
3646 }
3647 *map += 1;
3648 kunmap_atomic(map);
3649 page = list_entry(page->lru.prev, struct page, lru);
3650 while (page != head) {
3651 map = kmap_atomic(page) + offset;
3652 *map = COUNT_CONTINUED;
3653 kunmap_atomic(map);
3654 page = list_entry(page->lru.prev, struct page, lru);
3655 }
3656 ret = true;
3657
	} else {				/* decrementing */
		/*
		 * Think of how you subtract 1 from 1000: walk forward while
		 * each continuation "digit" is zero (COUNT_CONTINUED only).
		 */
3662 BUG_ON(count != COUNT_CONTINUED);
3663 while (*map == COUNT_CONTINUED) {
3664 kunmap_atomic(map);
3665 page = list_entry(page->lru.next, struct page, lru);
3666 BUG_ON(page == head);
3667 map = kmap_atomic(page) + offset;
3668 }
3669 BUG_ON(*map == 0);
3670 *map -= 1;
3671 if (*map == 0)
3672 count = 0;
3673 kunmap_atomic(map);
3674 page = list_entry(page->lru.prev, struct page, lru);
3675 while (page != head) {
3676 map = kmap_atomic(page) + offset;
3677 *map = SWAP_CONT_MAX | count;
3678 count = COUNT_CONTINUED;
3679 kunmap_atomic(map);
3680 page = list_entry(page->lru.prev, struct page, lru);
3681 }
3682 ret = count == COUNT_CONTINUED;
3683 }
3684out:
3685 spin_unlock(&si->cont_lock);
3686 return ret;
3687}
3688
/*
 * free_swap_count_continuations - swapoff frees all the continuation pages
 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
 */
3693static void free_swap_count_continuations(struct swap_info_struct *si)
3694{
3695 pgoff_t offset;
3696
3697 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
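		/* one continuation list can hang off each swap_map page */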
3698 struct page *head;
3699 head = vmalloc_to_page(si->swap_map + offset);
3700 if (page_private(head)) {
3701 struct page *page, *next;
3702
3703 list_for_each_entry_safe(page, next, &head->lru, lru) {
3704 list_del(&page->lru);
3705 __free_page(page);
3706 }
3707 }
3708 }
3709}
3710
3711static int __init swapfile_init(void)
3712{
3713 int nid;
3714
3715 swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
3716 GFP_KERNEL);
3717 if (!swap_avail_heads) {
3718 pr_emerg("Not enough memory for swap heads, swap is disabled\n");
3719 return -ENOMEM;
3720 }
3721
3722 for_each_node(nid)
3723 plist_head_init(&swap_avail_heads[nid]);
3724
3725 return 0;
3726}
3727subsys_initcall(swapfile_init);
3728