1
2
3
4
5
6
7
8
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
#include <linux/frontswap.h>
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
47
48static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
49 unsigned char);
50static void free_swap_count_continuations(struct swap_info_struct *);
51static sector_t map_swap_entry(swp_entry_t, struct block_device**);
52
53DEFINE_SPINLOCK(swap_lock);
54static unsigned int nr_swapfiles;
55atomic_long_t nr_swap_pages;
56
57
58
59
60
61EXPORT_SYMBOL_GPL(nr_swap_pages);
62
63long total_swap_pages;
64static int least_priority = -1;
65
66static const char Bad_file[] = "Bad swap file entry ";
67static const char Unused_file[] = "Unused swap file entry ";
68static const char Bad_offset[] = "Bad swap offset entry ";
69static const char Unused_offset[] = "Unused swap offset entry ";
70
71
72
73
74
75PLIST_HEAD(swap_active_head);
76
77
78
79
80
81
82
83
84
85
86
87
88
89static struct plist_head *swap_avail_heads;
90static DEFINE_SPINLOCK(swap_avail_lock);
91
92struct swap_info_struct *swap_info[MAX_SWAPFILES];
93
94static DEFINE_MUTEX(swapon_mutex);
95
96static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
97
98static atomic_t proc_poll_event = ATOMIC_INIT(0);
99
100atomic_t nr_rotate_swap = ATOMIC_INIT(0);
101
102static struct swap_info_struct *swap_type_to_swap_info(int type)
103{
104 if (type >= READ_ONCE(nr_swapfiles))
105 return NULL;
106
107 smp_rmb();
108 return READ_ONCE(swap_info[type]);
109}
110
111static inline unsigned char swap_count(unsigned char ent)
112{
113 return ent & ~SWAP_HAS_CACHE;
114}
115
116
117#define TTRS_ANYWAY 0x1
118
119
120
121
122#define TTRS_UNMAPPED 0x2
123
124#define TTRS_FULL 0x4
125
126
127static int __try_to_reclaim_swap(struct swap_info_struct *si,
128 unsigned long offset, unsigned long flags)
129{
130 swp_entry_t entry = swp_entry(si->type, offset);
131 struct page *page;
132 int ret = 0;
133
134 page = find_get_page(swap_address_space(entry), offset);
135 if (!page)
136 return 0;
137
138
139
140
141
142
143
144 if (trylock_page(page)) {
145 if ((flags & TTRS_ANYWAY) ||
146 ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
147 ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
148 ret = try_to_free_swap(page);
149 unlock_page(page);
150 }
151 put_page(page);
152 return ret;
153}
154
155
156
157
158
159static int discard_swap(struct swap_info_struct *si)
160{
161 struct swap_extent *se;
162 sector_t start_block;
163 sector_t nr_blocks;
164 int err = 0;
165
166
167 se = &si->first_swap_extent;
168 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
169 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
170 if (nr_blocks) {
171 err = blkdev_issue_discard(si->bdev, start_block,
172 nr_blocks, GFP_KERNEL, 0);
173 if (err)
174 return err;
175 cond_resched();
176 }
177
178 list_for_each_entry(se, &si->first_swap_extent.list, list) {
179 start_block = se->start_block << (PAGE_SHIFT - 9);
180 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
181
182 err = blkdev_issue_discard(si->bdev, start_block,
183 nr_blocks, GFP_KERNEL, 0);
184 if (err)
185 break;
186
187 cond_resched();
188 }
189 return err;
190}
191
192
193
194
195
196static void discard_swap_cluster(struct swap_info_struct *si,
197 pgoff_t start_page, pgoff_t nr_pages)
198{
199 struct swap_extent *se = si->curr_swap_extent;
200 int found_extent = 0;
201
202 while (nr_pages) {
203 if (se->start_page <= start_page &&
204 start_page < se->start_page + se->nr_pages) {
205 pgoff_t offset = start_page - se->start_page;
206 sector_t start_block = se->start_block + offset;
207 sector_t nr_blocks = se->nr_pages - offset;
208
209 if (nr_blocks > nr_pages)
210 nr_blocks = nr_pages;
211 start_page += nr_blocks;
212 nr_pages -= nr_blocks;
213
214 if (!found_extent++)
215 si->curr_swap_extent = se;
216
217 start_block <<= PAGE_SHIFT - 9;
218 nr_blocks <<= PAGE_SHIFT - 9;
219 if (blkdev_issue_discard(si->bdev, start_block,
220 nr_blocks, GFP_NOIO, 0))
221 break;
222 }
223
224 se = list_next_entry(se, list);
225 }
226}
227
#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER	HPAGE_PMD_NR

#define swap_entry_size(size)	(size)
#else
#define SWAPFILE_CLUSTER	256

/*
 * Constant swap_entry_size lets the compiler drop the cluster-sized
 * allocation paths entirely when !CONFIG_THP_SWAP.
 */
#define swap_entry_size(size)	1
#endif
#define LATENCY_LIMIT		256
243static inline void cluster_set_flag(struct swap_cluster_info *info,
244 unsigned int flag)
245{
246 info->flags = flag;
247}
248
249static inline unsigned int cluster_count(struct swap_cluster_info *info)
250{
251 return info->data;
252}
253
254static inline void cluster_set_count(struct swap_cluster_info *info,
255 unsigned int c)
256{
257 info->data = c;
258}
259
260static inline void cluster_set_count_flag(struct swap_cluster_info *info,
261 unsigned int c, unsigned int f)
262{
263 info->flags = f;
264 info->data = c;
265}
266
267static inline unsigned int cluster_next(struct swap_cluster_info *info)
268{
269 return info->data;
270}
271
272static inline void cluster_set_next(struct swap_cluster_info *info,
273 unsigned int n)
274{
275 info->data = n;
276}
277
278static inline void cluster_set_next_flag(struct swap_cluster_info *info,
279 unsigned int n, unsigned int f)
280{
281 info->flags = f;
282 info->data = n;
283}
284
285static inline bool cluster_is_free(struct swap_cluster_info *info)
286{
287 return info->flags & CLUSTER_FLAG_FREE;
288}
289
290static inline bool cluster_is_null(struct swap_cluster_info *info)
291{
292 return info->flags & CLUSTER_FLAG_NEXT_NULL;
293}
294
295static inline void cluster_set_null(struct swap_cluster_info *info)
296{
297 info->flags = CLUSTER_FLAG_NEXT_NULL;
298 info->data = 0;
299}
300
301static inline bool cluster_is_huge(struct swap_cluster_info *info)
302{
303 if (IS_ENABLED(CONFIG_THP_SWAP))
304 return info->flags & CLUSTER_FLAG_HUGE;
305 return false;
306}
307
308static inline void cluster_clear_huge(struct swap_cluster_info *info)
309{
310 info->flags &= ~CLUSTER_FLAG_HUGE;
311}
312
313static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
314 unsigned long offset)
315{
316 struct swap_cluster_info *ci;
317
318 ci = si->cluster_info;
319 if (ci) {
320 ci += offset / SWAPFILE_CLUSTER;
321 spin_lock(&ci->lock);
322 }
323 return ci;
324}
325
326static inline void unlock_cluster(struct swap_cluster_info *ci)
327{
328 if (ci)
329 spin_unlock(&ci->lock);
330}
331
332
333
334
335
336static inline struct swap_cluster_info *lock_cluster_or_swap_info(
337 struct swap_info_struct *si, unsigned long offset)
338{
339 struct swap_cluster_info *ci;
340
341
342 ci = lock_cluster(si, offset);
343
344 if (!ci)
345 spin_lock(&si->lock);
346
347 return ci;
348}
349
350static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
351 struct swap_cluster_info *ci)
352{
353 if (ci)
354 unlock_cluster(ci);
355 else
356 spin_unlock(&si->lock);
357}
358
359static inline bool cluster_list_empty(struct swap_cluster_list *list)
360{
361 return cluster_is_null(&list->head);
362}
363
364static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
365{
366 return cluster_next(&list->head);
367}
368
369static void cluster_list_init(struct swap_cluster_list *list)
370{
371 cluster_set_null(&list->head);
372 cluster_set_null(&list->tail);
373}
374
375static void cluster_list_add_tail(struct swap_cluster_list *list,
376 struct swap_cluster_info *ci,
377 unsigned int idx)
378{
379 if (cluster_list_empty(list)) {
380 cluster_set_next_flag(&list->head, idx, 0);
381 cluster_set_next_flag(&list->tail, idx, 0);
382 } else {
383 struct swap_cluster_info *ci_tail;
384 unsigned int tail = cluster_next(&list->tail);
385
386
387
388
389
390 ci_tail = ci + tail;
391 spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
392 cluster_set_next(ci_tail, idx);
393 spin_unlock(&ci_tail->lock);
394 cluster_set_next_flag(&list->tail, idx, 0);
395 }
396}
397
398static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
399 struct swap_cluster_info *ci)
400{
401 unsigned int idx;
402
403 idx = cluster_next(&list->head);
404 if (cluster_next(&list->tail) == idx) {
405 cluster_set_null(&list->head);
406 cluster_set_null(&list->tail);
407 } else
408 cluster_set_next_flag(&list->head,
409 cluster_next(&ci[idx]), 0);
410
411 return idx;
412}
413
414
415static void swap_cluster_schedule_discard(struct swap_info_struct *si,
416 unsigned int idx)
417{
418
419
420
421
422
423
424 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
425 SWAP_MAP_BAD, SWAPFILE_CLUSTER);
426
427 cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
428
429 schedule_work(&si->discard_work);
430}
431
432static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
433{
434 struct swap_cluster_info *ci = si->cluster_info;
435
436 cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
437 cluster_list_add_tail(&si->free_clusters, ci, idx);
438}
439
440
441
442
443
444static void swap_do_scheduled_discard(struct swap_info_struct *si)
445{
446 struct swap_cluster_info *info, *ci;
447 unsigned int idx;
448
449 info = si->cluster_info;
450
451 while (!cluster_list_empty(&si->discard_clusters)) {
452 idx = cluster_list_del_first(&si->discard_clusters, info);
453 spin_unlock(&si->lock);
454
455 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
456 SWAPFILE_CLUSTER);
457
458 spin_lock(&si->lock);
459 ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
460 __free_cluster(si, idx);
461 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
462 0, SWAPFILE_CLUSTER);
463 unlock_cluster(ci);
464 }
465}
466
467static void swap_discard_work(struct work_struct *work)
468{
469 struct swap_info_struct *si;
470
471 si = container_of(work, struct swap_info_struct, discard_work);
472
473 spin_lock(&si->lock);
474 swap_do_scheduled_discard(si);
475 spin_unlock(&si->lock);
476}
477
478static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
479{
480 struct swap_cluster_info *ci = si->cluster_info;
481
482 VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
483 cluster_list_del_first(&si->free_clusters, ci);
484 cluster_set_count_flag(ci + idx, 0, 0);
485}
486
487static void free_cluster(struct swap_info_struct *si, unsigned long idx)
488{
489 struct swap_cluster_info *ci = si->cluster_info + idx;
490
491 VM_BUG_ON(cluster_count(ci) != 0);
492
493
494
495
496
497 if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
498 (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
499 swap_cluster_schedule_discard(si, idx);
500 return;
501 }
502
503 __free_cluster(si, idx);
504}
505
506
507
508
509
510static void inc_cluster_info_page(struct swap_info_struct *p,
511 struct swap_cluster_info *cluster_info, unsigned long page_nr)
512{
513 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
514
515 if (!cluster_info)
516 return;
517 if (cluster_is_free(&cluster_info[idx]))
518 alloc_cluster(p, idx);
519
520 VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
521 cluster_set_count(&cluster_info[idx],
522 cluster_count(&cluster_info[idx]) + 1);
523}
524
525
526
527
528
529
530static void dec_cluster_info_page(struct swap_info_struct *p,
531 struct swap_cluster_info *cluster_info, unsigned long page_nr)
532{
533 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
534
535 if (!cluster_info)
536 return;
537
538 VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
539 cluster_set_count(&cluster_info[idx],
540 cluster_count(&cluster_info[idx]) - 1);
541
542 if (cluster_count(&cluster_info[idx]) == 0)
543 free_cluster(p, idx);
544}
545
546
547
548
549
550static bool
551scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
552 unsigned long offset)
553{
554 struct percpu_cluster *percpu_cluster;
555 bool conflict;
556
557 offset /= SWAPFILE_CLUSTER;
558 conflict = !cluster_list_empty(&si->free_clusters) &&
559 offset != cluster_list_first(&si->free_clusters) &&
560 cluster_is_free(&si->cluster_info[offset]);
561
562 if (!conflict)
563 return false;
564
565 percpu_cluster = this_cpu_ptr(si->percpu_cluster);
566 cluster_set_null(&percpu_cluster->index);
567 return true;
568}
569
570
571
572
573
574static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
575 unsigned long *offset, unsigned long *scan_base)
576{
577 struct percpu_cluster *cluster;
578 struct swap_cluster_info *ci;
579 bool found_free;
580 unsigned long tmp, max;
581
582new_cluster:
583 cluster = this_cpu_ptr(si->percpu_cluster);
584 if (cluster_is_null(&cluster->index)) {
585 if (!cluster_list_empty(&si->free_clusters)) {
586 cluster->index = si->free_clusters.head;
587 cluster->next = cluster_next(&cluster->index) *
588 SWAPFILE_CLUSTER;
589 } else if (!cluster_list_empty(&si->discard_clusters)) {
590
591
592
593
594 swap_do_scheduled_discard(si);
595 *scan_base = *offset = si->cluster_next;
596 goto new_cluster;
597 } else
598 return false;
599 }
600
601 found_free = false;
602
603
604
605
606
607 tmp = cluster->next;
608 max = min_t(unsigned long, si->max,
609 (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
610 if (tmp >= max) {
611 cluster_set_null(&cluster->index);
612 goto new_cluster;
613 }
614 ci = lock_cluster(si, tmp);
615 while (tmp < max) {
616 if (!si->swap_map[tmp]) {
617 found_free = true;
618 break;
619 }
620 tmp++;
621 }
622 unlock_cluster(ci);
623 if (!found_free) {
624 cluster_set_null(&cluster->index);
625 goto new_cluster;
626 }
627 cluster->next = tmp + 1;
628 *offset = tmp;
629 *scan_base = tmp;
630 return found_free;
631}
632
633static void __del_from_avail_list(struct swap_info_struct *p)
634{
635 int nid;
636
637 for_each_node(nid)
638 plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
639}
640
641static void del_from_avail_list(struct swap_info_struct *p)
642{
643 spin_lock(&swap_avail_lock);
644 __del_from_avail_list(p);
645 spin_unlock(&swap_avail_lock);
646}
647
648static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
649 unsigned int nr_entries)
650{
651 unsigned int end = offset + nr_entries - 1;
652
653 if (offset == si->lowest_bit)
654 si->lowest_bit += nr_entries;
655 if (end == si->highest_bit)
656 si->highest_bit -= nr_entries;
657 si->inuse_pages += nr_entries;
658 if (si->inuse_pages == si->pages) {
659 si->lowest_bit = si->max;
660 si->highest_bit = 0;
661 del_from_avail_list(si);
662 }
663}
664
665static void add_to_avail_list(struct swap_info_struct *p)
666{
667 int nid;
668
669 spin_lock(&swap_avail_lock);
670 for_each_node(nid) {
671 WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
672 plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
673 }
674 spin_unlock(&swap_avail_lock);
675}
676
677static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
678 unsigned int nr_entries)
679{
680 unsigned long end = offset + nr_entries - 1;
681 void (*swap_slot_free_notify)(struct block_device *, unsigned long);
682
683 if (offset < si->lowest_bit)
684 si->lowest_bit = offset;
685 if (end > si->highest_bit) {
686 bool was_full = !si->highest_bit;
687
688 si->highest_bit = end;
689 if (was_full && (si->flags & SWP_WRITEOK))
690 add_to_avail_list(si);
691 }
692 atomic_long_add(nr_entries, &nr_swap_pages);
693 si->inuse_pages -= nr_entries;
694 if (si->flags & SWP_BLKDEV)
695 swap_slot_free_notify =
696 si->bdev->bd_disk->fops->swap_slot_free_notify;
697 else
698 swap_slot_free_notify = NULL;
699 while (offset <= end) {
700 frontswap_invalidate_page(si->type, offset);
701 if (swap_slot_free_notify)
702 swap_slot_free_notify(si->bdev, offset);
703 offset++;
704 }
705}
706
707static int scan_swap_map_slots(struct swap_info_struct *si,
708 unsigned char usage, int nr,
709 swp_entry_t slots[])
710{
711 struct swap_cluster_info *ci;
712 unsigned long offset;
713 unsigned long scan_base;
714 unsigned long last_in_cluster = 0;
715 int latency_ration = LATENCY_LIMIT;
716 int n_ret = 0;
717
718 if (nr > SWAP_BATCH)
719 nr = SWAP_BATCH;
720
721
722
723
724
725
726
727
728
729
730
731
732 si->flags += SWP_SCANNING;
733 scan_base = offset = si->cluster_next;
734
735
736 if (si->cluster_info) {
737 if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
738 goto checks;
739 else
740 goto scan;
741 }
742
743 if (unlikely(!si->cluster_nr--)) {
744 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
745 si->cluster_nr = SWAPFILE_CLUSTER - 1;
746 goto checks;
747 }
748
749 spin_unlock(&si->lock);
750
751
752
753
754
755
756
757 scan_base = offset = si->lowest_bit;
758 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
759
760
761 for (; last_in_cluster <= si->highest_bit; offset++) {
762 if (si->swap_map[offset])
763 last_in_cluster = offset + SWAPFILE_CLUSTER;
764 else if (offset == last_in_cluster) {
765 spin_lock(&si->lock);
766 offset -= SWAPFILE_CLUSTER - 1;
767 si->cluster_next = offset;
768 si->cluster_nr = SWAPFILE_CLUSTER - 1;
769 goto checks;
770 }
771 if (unlikely(--latency_ration < 0)) {
772 cond_resched();
773 latency_ration = LATENCY_LIMIT;
774 }
775 }
776
777 offset = scan_base;
778 spin_lock(&si->lock);
779 si->cluster_nr = SWAPFILE_CLUSTER - 1;
780 }
781
782checks:
783 if (si->cluster_info) {
784 while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
785
786 if (n_ret)
787 goto done;
788 if (!scan_swap_map_try_ssd_cluster(si, &offset,
789 &scan_base))
790 goto scan;
791 }
792 }
793 if (!(si->flags & SWP_WRITEOK))
794 goto no_page;
795 if (!si->highest_bit)
796 goto no_page;
797 if (offset > si->highest_bit)
798 scan_base = offset = si->lowest_bit;
799
800 ci = lock_cluster(si, offset);
801
802 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
803 int swap_was_freed;
804 unlock_cluster(ci);
805 spin_unlock(&si->lock);
806 swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
807 spin_lock(&si->lock);
808
809 if (swap_was_freed)
810 goto checks;
811 goto scan;
812 }
813
814 if (si->swap_map[offset]) {
815 unlock_cluster(ci);
816 if (!n_ret)
817 goto scan;
818 else
819 goto done;
820 }
821 si->swap_map[offset] = usage;
822 inc_cluster_info_page(si, si->cluster_info, offset);
823 unlock_cluster(ci);
824
825 swap_range_alloc(si, offset, 1);
826 si->cluster_next = offset + 1;
827 slots[n_ret++] = swp_entry(si->type, offset);
828
829
830 if ((n_ret == nr) || (offset >= si->highest_bit))
831 goto done;
832
833
834
835
836 if (unlikely(--latency_ration < 0)) {
837 if (n_ret)
838 goto done;
839 spin_unlock(&si->lock);
840 cond_resched();
841 spin_lock(&si->lock);
842 latency_ration = LATENCY_LIMIT;
843 }
844
845
846 if (si->cluster_info) {
847 if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
848 goto checks;
849 else
850 goto done;
851 }
852
853 ++offset;
854
855
856 if (si->cluster_nr && !si->swap_map[offset]) {
857 --si->cluster_nr;
858 goto checks;
859 }
860
861done:
862 si->flags -= SWP_SCANNING;
863 return n_ret;
864
865scan:
866 spin_unlock(&si->lock);
867 while (++offset <= si->highest_bit) {
868 if (!si->swap_map[offset]) {
869 spin_lock(&si->lock);
870 goto checks;
871 }
872 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
873 spin_lock(&si->lock);
874 goto checks;
875 }
876 if (unlikely(--latency_ration < 0)) {
877 cond_resched();
878 latency_ration = LATENCY_LIMIT;
879 }
880 }
881 offset = si->lowest_bit;
882 while (offset < scan_base) {
883 if (!si->swap_map[offset]) {
884 spin_lock(&si->lock);
885 goto checks;
886 }
887 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
888 spin_lock(&si->lock);
889 goto checks;
890 }
891 if (unlikely(--latency_ration < 0)) {
892 cond_resched();
893 latency_ration = LATENCY_LIMIT;
894 }
895 offset++;
896 }
897 spin_lock(&si->lock);
898
899no_page:
900 si->flags -= SWP_SCANNING;
901 return n_ret;
902}
903
904static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
905{
906 unsigned long idx;
907 struct swap_cluster_info *ci;
908 unsigned long offset, i;
909 unsigned char *map;
910
911
912
913
914
915 if (!IS_ENABLED(CONFIG_THP_SWAP)) {
916 VM_WARN_ON_ONCE(1);
917 return 0;
918 }
919
920 if (cluster_list_empty(&si->free_clusters))
921 return 0;
922
923 idx = cluster_list_first(&si->free_clusters);
924 offset = idx * SWAPFILE_CLUSTER;
925 ci = lock_cluster(si, offset);
926 alloc_cluster(si, idx);
927 cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
928
929 map = si->swap_map + offset;
930 for (i = 0; i < SWAPFILE_CLUSTER; i++)
931 map[i] = SWAP_HAS_CACHE;
932 unlock_cluster(ci);
933 swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
934 *slot = swp_entry(si->type, offset);
935
936 return 1;
937}
938
939static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
940{
941 unsigned long offset = idx * SWAPFILE_CLUSTER;
942 struct swap_cluster_info *ci;
943
944 ci = lock_cluster(si, offset);
945 memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
946 cluster_set_count_flag(ci, 0, 0);
947 free_cluster(si, idx);
948 unlock_cluster(ci);
949 swap_range_free(si, offset, SWAPFILE_CLUSTER);
950}
951
952static unsigned long scan_swap_map(struct swap_info_struct *si,
953 unsigned char usage)
954{
955 swp_entry_t entry;
956 int n_ret;
957
958 n_ret = scan_swap_map_slots(si, usage, 1, &entry);
959
960 if (n_ret)
961 return swp_offset(entry);
962 else
963 return 0;
964
965}
966
967int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
968{
969 unsigned long size = swap_entry_size(entry_size);
970 struct swap_info_struct *si, *next;
971 long avail_pgs;
972 int n_ret = 0;
973 int node;
974
975
976 WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
977
978 avail_pgs = atomic_long_read(&nr_swap_pages) / size;
979 if (avail_pgs <= 0)
980 goto noswap;
981
982 if (n_goal > SWAP_BATCH)
983 n_goal = SWAP_BATCH;
984
985 if (n_goal > avail_pgs)
986 n_goal = avail_pgs;
987
988 atomic_long_sub(n_goal * size, &nr_swap_pages);
989
990 spin_lock(&swap_avail_lock);
991
992start_over:
993 node = numa_node_id();
994 plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
995
996 plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
997 spin_unlock(&swap_avail_lock);
998 spin_lock(&si->lock);
999 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
1000 spin_lock(&swap_avail_lock);
1001 if (plist_node_empty(&si->avail_lists[node])) {
1002 spin_unlock(&si->lock);
1003 goto nextsi;
1004 }
1005 WARN(!si->highest_bit,
1006 "swap_info %d in list but !highest_bit\n",
1007 si->type);
1008 WARN(!(si->flags & SWP_WRITEOK),
1009 "swap_info %d in list but !SWP_WRITEOK\n",
1010 si->type);
1011 __del_from_avail_list(si);
1012 spin_unlock(&si->lock);
1013 goto nextsi;
1014 }
1015 if (size == SWAPFILE_CLUSTER) {
1016 if (!(si->flags & SWP_FS))
1017 n_ret = swap_alloc_cluster(si, swp_entries);
1018 } else
1019 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
1020 n_goal, swp_entries);
1021 spin_unlock(&si->lock);
1022 if (n_ret || size == SWAPFILE_CLUSTER)
1023 goto check_out;
1024 pr_debug("scan_swap_map of si %d failed to find offset\n",
1025 si->type);
1026
1027 spin_lock(&swap_avail_lock);
1028nextsi:
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040 if (plist_node_empty(&next->avail_lists[node]))
1041 goto start_over;
1042 }
1043
1044 spin_unlock(&swap_avail_lock);
1045
1046check_out:
1047 if (n_ret < n_goal)
1048 atomic_long_add((long)(n_goal - n_ret) * size,
1049 &nr_swap_pages);
1050noswap:
1051 return n_ret;
1052}
1053
1054
1055swp_entry_t get_swap_page_of_type(int type)
1056{
1057 struct swap_info_struct *si = swap_type_to_swap_info(type);
1058 pgoff_t offset;
1059
1060 if (!si)
1061 goto fail;
1062
1063 spin_lock(&si->lock);
1064 if (si->flags & SWP_WRITEOK) {
1065 atomic_long_dec(&nr_swap_pages);
1066
1067 offset = scan_swap_map(si, 1);
1068 if (offset) {
1069 spin_unlock(&si->lock);
1070 return swp_entry(type, offset);
1071 }
1072 atomic_long_inc(&nr_swap_pages);
1073 }
1074 spin_unlock(&si->lock);
1075fail:
1076 return (swp_entry_t) {0};
1077}
1078
1079static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
1080{
1081 struct swap_info_struct *p;
1082 unsigned long offset, type;
1083
1084 if (!entry.val)
1085 goto out;
1086 type = swp_type(entry);
1087 p = swap_type_to_swap_info(type);
1088 if (!p)
1089 goto bad_nofile;
1090 if (!(p->flags & SWP_USED))
1091 goto bad_device;
1092 offset = swp_offset(entry);
1093 if (offset >= p->max)
1094 goto bad_offset;
1095 return p;
1096
1097bad_offset:
1098 pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
1099 goto out;
1100bad_device:
1101 pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
1102 goto out;
1103bad_nofile:
1104 pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
1105out:
1106 return NULL;
1107}
1108
1109static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
1110{
1111 struct swap_info_struct *p;
1112
1113 p = __swap_info_get(entry);
1114 if (!p)
1115 goto out;
1116 if (!p->swap_map[swp_offset(entry)])
1117 goto bad_free;
1118 return p;
1119
1120bad_free:
1121 pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
1122 goto out;
1123out:
1124 return NULL;
1125}
1126
1127static struct swap_info_struct *swap_info_get(swp_entry_t entry)
1128{
1129 struct swap_info_struct *p;
1130
1131 p = _swap_info_get(entry);
1132 if (p)
1133 spin_lock(&p->lock);
1134 return p;
1135}
1136
1137static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
1138 struct swap_info_struct *q)
1139{
1140 struct swap_info_struct *p;
1141
1142 p = _swap_info_get(entry);
1143
1144 if (p != q) {
1145 if (q != NULL)
1146 spin_unlock(&q->lock);
1147 if (p != NULL)
1148 spin_lock(&p->lock);
1149 }
1150 return p;
1151}
1152
1153static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
1154 unsigned long offset,
1155 unsigned char usage)
1156{
1157 unsigned char count;
1158 unsigned char has_cache;
1159
1160 count = p->swap_map[offset];
1161
1162 has_cache = count & SWAP_HAS_CACHE;
1163 count &= ~SWAP_HAS_CACHE;
1164
1165 if (usage == SWAP_HAS_CACHE) {
1166 VM_BUG_ON(!has_cache);
1167 has_cache = 0;
1168 } else if (count == SWAP_MAP_SHMEM) {
1169
1170
1171
1172
1173 count = 0;
1174 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
1175 if (count == COUNT_CONTINUED) {
1176 if (swap_count_continued(p, offset, count))
1177 count = SWAP_MAP_MAX | COUNT_CONTINUED;
1178 else
1179 count = SWAP_MAP_MAX;
1180 } else
1181 count--;
1182 }
1183
1184 usage = count | has_cache;
1185 p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
1186
1187 return usage;
1188}
1189
1190static unsigned char __swap_entry_free(struct swap_info_struct *p,
1191 swp_entry_t entry, unsigned char usage)
1192{
1193 struct swap_cluster_info *ci;
1194 unsigned long offset = swp_offset(entry);
1195
1196 ci = lock_cluster_or_swap_info(p, offset);
1197 usage = __swap_entry_free_locked(p, offset, usage);
1198 unlock_cluster_or_swap_info(p, ci);
1199 if (!usage)
1200 free_swap_slot(entry);
1201
1202 return usage;
1203}
1204
1205static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
1206{
1207 struct swap_cluster_info *ci;
1208 unsigned long offset = swp_offset(entry);
1209 unsigned char count;
1210
1211 ci = lock_cluster(p, offset);
1212 count = p->swap_map[offset];
1213 VM_BUG_ON(count != SWAP_HAS_CACHE);
1214 p->swap_map[offset] = 0;
1215 dec_cluster_info_page(p, p->cluster_info, offset);
1216 unlock_cluster(ci);
1217
1218 mem_cgroup_uncharge_swap(entry, 1);
1219 swap_range_free(p, offset, 1);
1220}
1221
1222
1223
1224
1225
1226void swap_free(swp_entry_t entry)
1227{
1228 struct swap_info_struct *p;
1229
1230 p = _swap_info_get(entry);
1231 if (p)
1232 __swap_entry_free(p, entry, 1);
1233}
1234
1235
1236
1237
1238void put_swap_page(struct page *page, swp_entry_t entry)
1239{
1240 unsigned long offset = swp_offset(entry);
1241 unsigned long idx = offset / SWAPFILE_CLUSTER;
1242 struct swap_cluster_info *ci;
1243 struct swap_info_struct *si;
1244 unsigned char *map;
1245 unsigned int i, free_entries = 0;
1246 unsigned char val;
1247 int size = swap_entry_size(hpage_nr_pages(page));
1248
1249 si = _swap_info_get(entry);
1250 if (!si)
1251 return;
1252
1253 ci = lock_cluster_or_swap_info(si, offset);
1254 if (size == SWAPFILE_CLUSTER) {
1255 VM_BUG_ON(!cluster_is_huge(ci));
1256 map = si->swap_map + offset;
1257 for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1258 val = map[i];
1259 VM_BUG_ON(!(val & SWAP_HAS_CACHE));
1260 if (val == SWAP_HAS_CACHE)
1261 free_entries++;
1262 }
1263 cluster_clear_huge(ci);
1264 if (free_entries == SWAPFILE_CLUSTER) {
1265 unlock_cluster_or_swap_info(si, ci);
1266 spin_lock(&si->lock);
1267 mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
1268 swap_free_cluster(si, idx);
1269 spin_unlock(&si->lock);
1270 return;
1271 }
1272 }
1273 for (i = 0; i < size; i++, entry.val++) {
1274 if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
1275 unlock_cluster_or_swap_info(si, ci);
1276 free_swap_slot(entry);
1277 if (i == size - 1)
1278 return;
1279 lock_cluster_or_swap_info(si, offset);
1280 }
1281 }
1282 unlock_cluster_or_swap_info(si, ci);
1283}
1284
#ifdef CONFIG_THP_SWAP
/*
 * Demote @entry's cluster from a single huge allocation to individually
 * managed entries by clearing its HUGE flag.  Returns 0 on success or
 * -EBUSY when the entry no longer validates.
 */
int split_swap_cluster(swp_entry_t entry)
{
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);

	si = _swap_info_get(entry);
	if (!si)
		return -EBUSY;
	ci = lock_cluster(si, offset);
	cluster_clear_huge(ci);
	unlock_cluster(ci);
	return 0;
}
#endif
1302static int swp_entry_cmp(const void *ent1, const void *ent2)
1303{
1304 const swp_entry_t *e1 = ent1, *e2 = ent2;
1305
1306 return (int)swp_type(*e1) - (int)swp_type(*e2);
1307}
1308
1309void swapcache_free_entries(swp_entry_t *entries, int n)
1310{
1311 struct swap_info_struct *p, *prev;
1312 int i;
1313
1314 if (n <= 0)
1315 return;
1316
1317 prev = NULL;
1318 p = NULL;
1319
1320
1321
1322
1323
1324
1325 if (nr_swapfiles > 1)
1326 sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
1327 for (i = 0; i < n; ++i) {
1328 p = swap_info_get_cont(entries[i], prev);
1329 if (p)
1330 swap_entry_free(p, entries[i]);
1331 prev = p;
1332 }
1333 if (p)
1334 spin_unlock(&p->lock);
1335}
1336
1337
1338
1339
1340
1341
1342int page_swapcount(struct page *page)
1343{
1344 int count = 0;
1345 struct swap_info_struct *p;
1346 struct swap_cluster_info *ci;
1347 swp_entry_t entry;
1348 unsigned long offset;
1349
1350 entry.val = page_private(page);
1351 p = _swap_info_get(entry);
1352 if (p) {
1353 offset = swp_offset(entry);
1354 ci = lock_cluster_or_swap_info(p, offset);
1355 count = swap_count(p->swap_map[offset]);
1356 unlock_cluster_or_swap_info(p, ci);
1357 }
1358 return count;
1359}
1360
1361int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
1362{
1363 pgoff_t offset = swp_offset(entry);
1364
1365 return swap_count(si->swap_map[offset]);
1366}
1367
1368static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
1369{
1370 int count = 0;
1371 pgoff_t offset = swp_offset(entry);
1372 struct swap_cluster_info *ci;
1373
1374 ci = lock_cluster_or_swap_info(si, offset);
1375 count = swap_count(si->swap_map[offset]);
1376 unlock_cluster_or_swap_info(si, ci);
1377 return count;
1378}
1379
1380
1381
1382
1383
1384
/*
 * How many references to @entry are currently swapped out?
 * This does not give an exact answer when swap count is continued,
 * but does include the high COUNT_CONTINUED flag to allow for that.
 * Uses __swap_info_get(), so tolerates a stale/invalid entry by
 * returning 0.
 */
int __swp_swapcount(swp_entry_t entry)
{
	int count = 0;
	struct swap_info_struct *si;

	si = __swap_info_get(entry);
	if (si)
		count = swap_swapcount(si, entry);
	return count;
}
1395
1396
1397
1398
1399
/*
 * How many references to @entry are currently swapped out?
 * This considers COUNT_CONTINUED so it returns exact answer.
 */
int swp_swapcount(swp_entry_t entry)
{
	int count, tmp_count, n;
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	struct page *page;
	pgoff_t offset;
	unsigned char *map;

	p = _swap_info_get(entry);
	if (!p)
		return 0;

	offset = swp_offset(entry);

	ci = lock_cluster_or_swap_info(p, offset);

	count = swap_count(p->swap_map[offset]);
	if (!(count & COUNT_CONTINUED))
		goto out;

	/* walk the continuation pages, accumulating base-(SWAP_CONT_MAX+1)
	 * digits; n is the positional weight of the current digit. */
	count &= ~COUNT_CONTINUED;
	n = SWAP_MAP_MAX + 1;

	page = vmalloc_to_page(p->swap_map + offset);
	offset &= ~PAGE_MASK;
	VM_BUG_ON(page_private(page) != SWP_CONTINUED);

	do {
		page = list_next_entry(page, lru);
		map = kmap_atomic(page);
		tmp_count = map[offset];
		kunmap_atomic(map);

		count += (tmp_count & ~COUNT_CONTINUED) * n;
		n *= (SWAP_CONT_MAX + 1);
	} while (tmp_count & COUNT_CONTINUED);
out:
	unlock_cluster_or_swap_info(p, ci);
	return count;
}
1441
/*
 * Return true if any swap entry of the huge-swap cluster containing
 * @entry still has a non-zero swap count.  For a non-huge cluster (or
 * when cluster info is absent), only @entry itself is checked.
 */
static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
					 swp_entry_t entry)
{
	struct swap_cluster_info *ci;
	unsigned char *map = si->swap_map;
	unsigned long roffset = swp_offset(entry);
	unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
	int i;
	bool ret = false;

	ci = lock_cluster_or_swap_info(si, offset);
	if (!ci || !cluster_is_huge(ci)) {
		/* non-THP case: only the single entry matters */
		if (swap_count(map[roffset]))
			ret = true;
		goto unlock_out;
	}
	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
		if (swap_count(map[offset + i])) {
			ret = true;
			break;
		}
	}
unlock_out:
	unlock_cluster_or_swap_info(si, ci);
	return ret;
}
1468
/*
 * Return true if the page (or, for a THP, any subpage of the compound
 * page) still has swap references.
 */
static bool page_swapped(struct page *page)
{
	swp_entry_t entry;
	struct swap_info_struct *si;

	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
		return page_swapcount(page) != 0;

	page = compound_head(page);
	entry.val = page_private(page);
	si = _swap_info_get(entry);
	if (si)
		return swap_page_trans_huge_swapped(si, entry);
	return false;
}
1484
/*
 * Compute, for @page (possibly a THP), the maximum over all subpages of
 * (mapcount + swapcount), and optionally the totals.  Used by
 * reuse_swap_page() to decide whether the page is exclusively owned.
 */
static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
					 int *total_swapcount)
{
	int i, map_swapcount, _total_mapcount, _total_swapcount;
	unsigned long offset = 0;
	struct swap_info_struct *si;
	struct swap_cluster_info *ci = NULL;
	unsigned char *map = NULL;
	int mapcount, swapcount = 0;

	/* hugetlbfs shouldn't call it */
	VM_BUG_ON_PAGE(PageHuge(page), page);

	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
		/* non-THP fast path */
		mapcount = page_trans_huge_mapcount(page, total_mapcount);
		if (PageSwapCache(page))
			swapcount = page_swapcount(page);
		if (total_swapcount)
			*total_swapcount = swapcount;
		return mapcount + swapcount;
	}

	page = compound_head(page);

	_total_mapcount = _total_swapcount = map_swapcount = 0;
	if (PageSwapCache(page)) {
		swp_entry_t entry;

		entry.val = page_private(page);
		si = _swap_info_get(entry);
		if (si) {
			map = si->swap_map;
			offset = swp_offset(entry);
		}
	}
	if (map)
		ci = lock_cluster(si, offset);
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		/* per-subpage mapcount; _mapcount is stored biased by -1 */
		mapcount = atomic_read(&page[i]._mapcount) + 1;
		_total_mapcount += mapcount;
		if (map) {
			swapcount = swap_count(map[offset + i]);
			_total_swapcount += swapcount;
		}
		map_swapcount = max(map_swapcount, mapcount + swapcount);
	}
	unlock_cluster(ci);
	if (PageDoubleMap(page)) {
		/* PMD mapping double-counted each subpage's _mapcount */
		map_swapcount -= 1;
		_total_mapcount -= HPAGE_PMD_NR;
	}
	mapcount = compound_mapcount(page);
	map_swapcount += mapcount;
	_total_mapcount += mapcount;
	if (total_mapcount)
		*total_mapcount = _total_mapcount;
	if (total_swapcount)
		*total_swapcount = _total_swapcount;

	return map_swapcount;
}
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
/*
 * We can write to an anon page without COW if there are no other references
 * to it.  And as a side-effect, free up its swap: because the old content
 * on disk will never be read, and seeking back there to write new content
 * later would only waste time away from clustering.
 *
 * NOTE: total_map_swapcount should not be relied upon by the caller if
 * reuse_swap_page() returns false.
 */
bool reuse_swap_page(struct page *page, int *total_map_swapcount)
{
	int count, total_mapcount, total_swapcount;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	if (unlikely(PageKsm(page)))
		return false;
	count = page_trans_huge_map_swapcount(page, &total_mapcount,
					      &total_swapcount);
	if (total_map_swapcount)
		*total_map_swapcount = total_mapcount + total_swapcount;
	if (count == 1 && PageSwapCache(page) &&
	    (likely(!PageTransCompound(page)) ||
	     /* The remaining swap count will be freed soon */
	     total_swapcount == page_swapcount(page))) {
		if (!PageWriteback(page)) {
			page = compound_head(page);
			delete_from_swap_cache(page);
			SetPageDirty(page);
		} else {
			swp_entry_t entry;
			struct swap_info_struct *p;

			entry.val = page_private(page);
			p = swap_info_get(entry);
			if (p->flags & SWP_STABLE_WRITES) {
				/* must not modify page under writeback */
				spin_unlock(&p->lock);
				return false;
			}
			spin_unlock(&p->lock);
		}
	}

	return count <= 1;
}
1592
1593
1594
1595
1596
/*
 * If swap is getting full, or if there are no more mappings of this page,
 * then try_to_free_swap is called to free its swap space.
 */
int try_to_free_swap(struct page *page)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (!PageSwapCache(page))
		return 0;
	if (PageWriteback(page))
		return 0;
	if (page_swapped(page))
		return 0;

	/*
	 * Once hibernation has begun to create its image of memory,
	 * there's a danger that one of the calls to try_to_free_swap()
	 * - most probably a call from __try_to_reclaim_swap() while
	 * hibernation is allocating its own swap pages for the image,
	 * but conceivably even a call from memory reclaim - will free
	 * the swap from a page which has already been recorded in the
	 * image as a clean swapcache page, and then reuse its swap for
	 * another page of the image.  On waking from hibernation, the
	 * original page might be freed under memory pressure, then
	 * later read back in from swap, now with the wrong data.
	 *
	 * Hibernation suspends storage while it is writing the image
	 * to disk so check that here.
	 */
	if (pm_suspended_storage())
		return 0;

	page = compound_head(page);
	delete_from_swap_cache(page);
	SetPageDirty(page);
	return 1;
}
1631
1632
1633
1634
1635
/*
 * Free the swap entry like above, but also try to
 * free the page cache entry if it is the last user.
 */
int free_swap_and_cache(swp_entry_t entry)
{
	struct swap_info_struct *p;
	unsigned char count;

	/* migration/hwpoison/etc. entries are not real swap */
	if (non_swap_entry(entry))
		return 1;

	p = _swap_info_get(entry);
	if (p) {
		count = __swap_entry_free(p, entry, 1);
		/* only the swapcache reference left: try to reclaim it */
		if (count == SWAP_HAS_CACHE &&
		    !swap_page_trans_huge_swapped(p, entry))
			__try_to_reclaim_swap(p, swp_offset(entry),
					      TTRS_UNMAPPED | TTRS_FULL);
	}
	return p != NULL;
}
1654
1655#ifdef CONFIG_HIBERNATION
1656
1657
1658
1659
1660
1661
1662
1663
/*
 * Find the swap type that corresponds to given device (if any).
 *
 * @offset - number of the PAGE_SIZE-sized block of the device, starting
 * from 0, in which the swap header is expected to be located.
 *
 * This is needed for the suspend to disk (aka swsusp).
 */
int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
{
	struct block_device *bdev = NULL;
	int type;

	if (device)
		bdev = bdget(device);

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		struct swap_info_struct *sis = swap_info[type];

		if (!(sis->flags & SWP_WRITEOK))
			continue;

		if (!bdev) {
			/* no device given: return first writable area */
			if (bdev_p)
				*bdev_p = bdgrab(sis->bdev);

			spin_unlock(&swap_lock);
			return type;
		}
		if (bdev == sis->bdev) {
			struct swap_extent *se = &sis->first_swap_extent;

			if (se->start_block == offset) {
				if (bdev_p)
					*bdev_p = bdgrab(sis->bdev);

				spin_unlock(&swap_lock);
				bdput(bdev);
				return type;
			}
		}
	}
	spin_unlock(&swap_lock);
	if (bdev)
		bdput(bdev);

	return -ENODEV;
}
1705
1706
1707
1708
1709
/*
 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
 * corresponding to given index in swap_info (swap type).
 */
sector_t swapdev_block(int type, pgoff_t offset)
{
	struct block_device *bdev;
	struct swap_info_struct *si = swap_type_to_swap_info(type);

	if (!si || !(si->flags & SWP_WRITEOK))
		return 0;
	return map_swap_entry(swp_entry(type, offset), &bdev);
}
1719
1720
1721
1722
1723
1724
1725
/*
 * Return either the total or the free number of swap pages of given type,
 * while counting architecture-reserved swap pages.
 * Used by the hibernation code to size its image.
 */
unsigned int count_swap_pages(int type, int free)
{
	unsigned int n = 0;

	spin_lock(&swap_lock);
	if ((unsigned int)type < nr_swapfiles) {
		struct swap_info_struct *sis = swap_info[type];

		spin_lock(&sis->lock);
		if (sis->flags & SWP_WRITEOK) {
			n = sis->pages;
			if (free)
				n -= sis->inuse_pages;
		}
		spin_unlock(&sis->lock);
	}
	spin_unlock(&swap_lock);
	return n;
}
1745#endif
1746
/* Compare a pte with a swap pte, ignoring the soft-dirty bit. */
static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
{
	return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
}
1751
1752
1753
1754
1755
1756
/*
 * No need to decide whether this PTE shares the swap entry with others,
 * just let do_wp_page work it out if a write is requested later - to
 * force COW, ptep_set_access_flags is enough.
 *
 * Returns 1 on success, 0 if the pte changed under us, or -ENOMEM.
 */
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, swp_entry_t entry, struct page *page)
{
	struct page *swapcache;
	struct mem_cgroup *memcg;
	spinlock_t *ptl;
	pte_t *pte;
	int ret = 1;

	swapcache = page;
	/* KSM pages may need a private copy before mapping */
	page = ksm_might_need_to_copy(page, vma, addr);
	if (unlikely(!page))
		return -ENOMEM;

	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
				&memcg, false)) {
		ret = -ENOMEM;
		goto out_nolock;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
		/* pte changed (e.g. raced with another fault): back off */
		mem_cgroup_cancel_charge(page, memcg, false);
		ret = 0;
		goto out;
	}

	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
	get_page(page);
	set_pte_at(vma->vm_mm, addr, pte,
		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
	if (page == swapcache) {
		page_add_anon_rmap(page, vma, addr, false);
		mem_cgroup_commit_charge(page, memcg, true, false);
	} else { /* ksm created a completely new copy */
		page_add_new_anon_rmap(page, vma, addr, false);
		mem_cgroup_commit_charge(page, memcg, false, false);
		lru_cache_add_active_or_unevictable(page, vma);
	}
	swap_free(entry);
	/*
	 * Move the page to the active list so it is not
	 * immediately swapped out again after swapon.
	 */
	activate_page(page);
out:
	pte_unmap_unlock(pte, ptl);
out_nolock:
	if (page != swapcache) {
		unlock_page(page);
		put_page(page);
	}
	return ret;
}
1812
/*
 * Scan the ptes in [addr, end) for swap entries of @type, swap each
 * found page back in and restore its pte via unuse_pte().
 * With @frontswap, only frontswap-backed entries are unused, and
 * *fs_pages_to_unuse (if non-zero) bounds how many to process.
 */
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned int type, bool frontswap,
			unsigned long *fs_pages_to_unuse)
{
	struct page *page;
	swp_entry_t entry;
	pte_t *pte;
	struct swap_info_struct *si;
	unsigned long offset;
	int ret = 0;
	volatile unsigned char *swap_map;

	si = swap_info[type];
	pte = pte_offset_map(pmd, addr);
	do {
		struct vm_fault vmf;

		if (!is_swap_pte(*pte))
			continue;

		entry = pte_to_swp_entry(*pte);
		if (swp_type(entry) != type)
			continue;

		offset = swp_offset(entry);
		if (frontswap && !frontswap_test(si, offset))
			continue;

		/* must drop the pte mapping before possibly sleeping */
		pte_unmap(pte);
		swap_map = &si->swap_map[offset];
		vmf.vma = vma;
		vmf.address = addr;
		vmf.pmd = pmd;
		page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
		if (!page) {
			/* entry may have been freed while we slept */
			if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
				goto try_next;
			return -ENOMEM;
		}

		lock_page(page);
		wait_on_page_writeback(page);
		ret = unuse_pte(vma, pmd, addr, entry, page);
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			goto out;
		}

		try_to_free_swap(page);
		unlock_page(page);
		put_page(page);

		/* frontswap budget exhausted: tell caller we're done */
		if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
			ret = FRONTSWAP_PAGES_UNUSED;
			goto out;
		}
try_next:
		pte = pte_offset_map(pmd, addr);
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap(pte - 1);

	ret = 0;
out:
	return ret;
}
1880
/* Walk the pmds under @pud in [addr, end), descending into pte ranges. */
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned int type, bool frontswap,
				unsigned long *fs_pages_to_unuse)
{
	pmd_t *pmd;
	unsigned long next;
	int ret;

	pmd = pmd_offset(pud, addr);
	do {
		cond_resched();
		next = pmd_addr_end(addr, end);
		/* huge or empty pmds have no pte-level swap entries */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			continue;
		ret = unuse_pte_range(vma, pmd, addr, next, type,
				      frontswap, fs_pages_to_unuse);
		if (ret)
			return ret;
	} while (pmd++, addr = next, addr != end);
	return 0;
}
1903
/* Walk the puds under @p4d in [addr, end), descending into pmd ranges. */
static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				unsigned int type, bool frontswap,
				unsigned long *fs_pages_to_unuse)
{
	pud_t *pud;
	unsigned long next;
	int ret;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		ret = unuse_pmd_range(vma, pud, addr, next, type,
				      frontswap, fs_pages_to_unuse);
		if (ret)
			return ret;
	} while (pud++, addr = next, addr != end);
	return 0;
}
1925
/* Walk the p4ds under @pgd in [addr, end), descending into pud ranges. */
static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned int type, bool frontswap,
				unsigned long *fs_pages_to_unuse)
{
	p4d_t *p4d;
	unsigned long next;
	int ret;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		ret = unuse_pud_range(vma, p4d, addr, next, type,
				      frontswap, fs_pages_to_unuse);
		if (ret)
			return ret;
	} while (p4d++, addr = next, addr != end);
	return 0;
}
1947
/* Unuse all swap entries of @type mapped anywhere within @vma. */
static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
		     bool frontswap, unsigned long *fs_pages_to_unuse)
{
	pgd_t *pgd;
	unsigned long addr, end, next;
	int ret;

	addr = vma->vm_start;
	end = vma->vm_end;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		ret = unuse_p4d_range(vma, pgd, addr, next, type,
				      frontswap, fs_pages_to_unuse);
		if (ret)
			return ret;
	} while (pgd++, addr = next, addr != end);
	return 0;
}
1970
/* Unuse all swap entries of @type mapped by any anon vma of @mm. */
static int unuse_mm(struct mm_struct *mm, unsigned int type,
		    bool frontswap, unsigned long *fs_pages_to_unuse)
{
	struct vm_area_struct *vma;
	int ret = 0;

	down_read(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		/* only anon vmas can hold private swap entries */
		if (vma->anon_vma) {
			ret = unuse_vma(vma, type, frontswap,
					fs_pages_to_unuse);
			if (ret)
				break;
		}
		cond_resched();
	}
	up_read(&mm->mmap_sem);
	return ret;
}
1990
1991
1992
1993
1994
1995
/*
 * Scan swap_map (or, with @frontswap, the frontswap map) from current
 * position @prev+1 for the first still-in-use entry.  Return 0 when no
 * more entries remain.
 */
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
					unsigned int prev, bool frontswap)
{
	unsigned int i;
	unsigned char count;

	/*
	 * No need for swap_lock here: we're just looking
	 * for whether an entry is in use, not modifying it; false
	 * hits are okay, and sys_swapoff() has already prevented new
	 * allocations from this area (while holding swap_lock).
	 */
	for (i = prev + 1; i < si->max; i++) {
		count = READ_ONCE(si->swap_map[i]);
		if (count && swap_count(count) != SWAP_MAP_BAD)
			if (!frontswap || frontswap_test(si, i))
				break;
		if ((i % LATENCY_LIMIT) == 0)
			cond_resched();
	}

	/* ran off the end: signal "no more" */
	if (i == si->max)
		i = 0;

	return i;
}
2022
2023
2024
2025
2026
/*
 * If the boolean frontswap is true, only unuse pages_to_unuse pages;
 * pages_to_unuse==0 means all pages; ignored if frontswap is false
 */
int try_to_unuse(unsigned int type, bool frontswap,
		 unsigned long pages_to_unuse)
{
	struct mm_struct *prev_mm;
	struct mm_struct *mm;
	struct list_head *p;
	int retval = 0;
	struct swap_info_struct *si = swap_info[type];
	struct page *page;
	swp_entry_t entry;
	unsigned int i;

	if (!si->inuse_pages)
		return 0;

	if (!frontswap)
		pages_to_unuse = 0;

retry:
	retval = shmem_unuse(type, frontswap, &pages_to_unuse);
	if (retval)
		goto out;

	prev_mm = &init_mm;
	mmget(prev_mm);

	spin_lock(&mmlist_lock);
	p = &init_mm.mmlist;
	while (si->inuse_pages &&
	       !signal_pending(current) &&
	       (p = p->next) != &init_mm.mmlist) {

		mm = list_entry(p, struct mm_struct, mmlist);
		if (!mmget_not_zero(mm))
			continue;
		spin_unlock(&mmlist_lock);
		mmput(prev_mm);
		prev_mm = mm;
		retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);

		if (retval) {
			mmput(prev_mm);
			goto out;
		}

		/*
		 * Make sure that we aren't completely killing
		 * interactive performance.
		 */
		cond_resched();
		spin_lock(&mmlist_lock);
	}
	spin_unlock(&mmlist_lock);

	mmput(prev_mm);

	i = 0;
	while (si->inuse_pages &&
	       !signal_pending(current) &&
	       (i = find_next_to_unuse(si, i, frontswap)) != 0) {

		entry = swp_entry(type, i);
		page = find_get_page(swap_address_space(entry), i);
		if (!page)
			continue;

		/*
		 * It is conceivable that a racing task removed this page from
		 * swap cache just before we acquired the page lock. The page
		 * might even be back in swap cache on another swap area. But
		 * that is okay, try_to_free_swap() only removes stale pages.
		 */
		lock_page(page);
		wait_on_page_writeback(page);
		try_to_free_swap(page);
		unlock_page(page);
		put_page(page);

		/*
		 * For frontswap, we just need to unuse pages_to_unuse, if
		 * it was specified. Need not check frontswap again here as
		 * we already zeroed out pages_to_unuse if not frontswap.
		 */
		if (pages_to_unuse && --pages_to_unuse == 0)
			goto out;
	}

	/*
	 * Lets check again to see if there are still swap entries in the map.
	 * If yes, we would need to do retry the unuse logic again.
	 * Under global memory pressure, swap entries can be reinserted back
	 * into process space after the mmlist loop above passes over them.
	 *
	 * Limit the number of retries? No: when mmget_not_zero() above fails,
	 * that mm is likely to be freeing swap from exit_mmap(), which
	 * proceeds at its own independent pace; and even shmem_writepage()
	 * could have been preempted after get_swap_page(), temporarily
	 * hiding that swap.  It's easy and robust (though cpu-intensive)
	 * just to keep retrying.
	 */
	if (si->inuse_pages) {
		if (!signal_pending(current))
			goto retry;
		retval = -EINTR;
	}
out:
	return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
}
2134
2135
2136
2137
2138
2139
2140
/*
 * After a successful try_to_unuse, if no swap is now in use, we know
 * we can empty the mmlist.  swap_lock must be held on entry and exit.
 * Note that mmlist_lock nests inside swap_lock, and an mm must be
 * added to the mmlist just after page_duplicate - before would be racy.
 */
static void drain_mmlist(void)
{
	struct list_head *p, *next;
	unsigned int type;

	for (type = 0; type < nr_swapfiles; type++)
		if (swap_info[type]->inuse_pages)
			return;
	spin_lock(&mmlist_lock);
	list_for_each_safe(p, next, &init_mm.mmlist)
		list_del_init(p);
	spin_unlock(&mmlist_lock);
}
2154
2155
2156
2157
2158
2159
2160
/*
 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
 * corresponds to page offset for the specified swap entry.
 * Note that the type of this function is sector_t, but it returns page
 * offset into the bdev, not sector offset.
 */
static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
{
	struct swap_info_struct *sis;
	struct swap_extent *start_se;
	struct swap_extent *se;
	pgoff_t offset;

	sis = swp_swap_info(entry);
	*bdev = sis->bdev;

	offset = swp_offset(entry);
	/* start from the last extent used, as a locality heuristic */
	start_se = sis->curr_swap_extent;
	se = start_se;

	for ( ; ; ) {
		if (se->start_page <= offset &&
				offset < (se->start_page + se->nr_pages)) {
			return se->start_block + (offset - se->start_page);
		}
		se = list_next_entry(se, list);
		sis->curr_swap_extent = se;
		/* wrapped all the way around without a hit: corrupt extents */
		BUG_ON(se == start_se);
	}
}
2185
2186
2187
2188
/*
 * Returns the page offset into bdev for the specified page's swap entry.
 */
sector_t map_swap_page(struct page *page, struct block_device **bdev)
{
	swp_entry_t entry;
	entry.val = page_private(page);
	return map_swap_entry(entry, bdev);
}
2195
2196
2197
2198
/*
 * Free all of a swapdev's extent information, and let the filesystem
 * deactivate the swap file if it had activated it.
 */
static void destroy_swap_extents(struct swap_info_struct *sis)
{
	while (!list_empty(&sis->first_swap_extent.list)) {
		struct swap_extent *se;

		se = list_first_entry(&sis->first_swap_extent.list,
				struct swap_extent, list);
		list_del(&se->list);
		kfree(se);
	}

	if (sis->flags & SWP_ACTIVATED) {
		struct file *swap_file = sis->swap_file;
		struct address_space *mapping = swap_file->f_mapping;

		sis->flags &= ~SWP_ACTIVATED;
		if (mapping->a_ops->swap_deactivate)
			mapping->a_ops->swap_deactivate(swap_file);
	}
}
2219
2220
2221
2222
2223
2224
2225
/*
 * Add a block range (and the corresponding page range) into this swapdev's
 * extent list.  The extent list is kept sorted in page order.
 *
 * This function rather assumes that it is called in ascending page order.
 */
int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
		unsigned long nr_pages, sector_t start_block)
{
	struct swap_extent *se;
	struct swap_extent *new_se;
	struct list_head *lh;

	if (start_page == 0) {
		/* first extent lives embedded in the swap_info_struct */
		se = &sis->first_swap_extent;
		sis->curr_swap_extent = se;
		se->start_page = 0;
		se->nr_pages = nr_pages;
		se->start_block = start_block;
		return 1;
	} else {
		lh = sis->first_swap_extent.list.prev;	/* Highest extent */
		se = list_entry(lh, struct swap_extent, list);
		BUG_ON(se->start_page + se->nr_pages != start_page);
		if (se->start_block + se->nr_pages == start_block) {
			/* Merge it */
			se->nr_pages += nr_pages;
			return 0;
		}
	}

	/*
	 * No merge.  Insert a new extent, preserving ordering.
	 */
	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
	if (new_se == NULL)
		return -ENOMEM;
	new_se->start_page = start_page;
	new_se->nr_pages = nr_pages;
	new_se->start_block = start_block;

	list_add_tail(&new_se->list, &sis->first_swap_extent.list);
	return 1;
}
2265EXPORT_SYMBOL_GPL(add_swap_extent);
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
/*
 * Build the swap extent list for @sis.  For a block device or a
 * filesystem with ->swap_activate, one identity extent covers the
 * whole area; otherwise fall back to bmap-based extent discovery.
 * On success *span is set to the number of device pages spanned.
 */
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
	struct file *swap_file = sis->swap_file;
	struct address_space *mapping = swap_file->f_mapping;
	struct inode *inode = mapping->host;
	int ret;

	if (S_ISBLK(inode->i_mode)) {
		ret = add_swap_extent(sis, 0, sis->max, 0);
		*span = sis->pages;
		return ret;
	}

	if (mapping->a_ops->swap_activate) {
		ret = mapping->a_ops->swap_activate(sis, swap_file, span);
		if (ret >= 0)
			sis->flags |= SWP_ACTIVATED;
		if (!ret) {
			/* filesystem handles the I/O itself */
			sis->flags |= SWP_FS;
			ret = add_swap_extent(sis, 0, sis->max, 0);
			*span = sis->pages;
		}
		return ret;
	}

	return generic_swapfile_activate(sis, swap_file, span);
}
2325
/* NUMA node of the block device backing this swap area, if known. */
static int swap_node(struct swap_info_struct *p)
{
	struct block_device *bdev;

	if (p->bdev)
		bdev = p->bdev;
	else
		bdev = p->swap_file->f_inode->i_sb->s_bdev;

	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
}
2337
/*
 * Make a swap area usable: assign its priority, install its maps, mark
 * it writable and insert it into the active and per-node avail lists.
 * Caller holds swap_lock and p->lock.
 */
static void _enable_swap_info(struct swap_info_struct *p, int prio,
				unsigned char *swap_map,
				struct swap_cluster_info *cluster_info)
{
	int i;

	if (prio >= 0)
		p->prio = prio;
	else
		p->prio = --least_priority;
	/*
	 * the plist prio is negated because plist ordering is
	 * low-to-high, while swap ordering is high-to-low
	 */
	p->list.prio = -p->prio;
	for_each_node(i) {
		if (p->prio >= 0)
			p->avail_lists[i].prio = -p->prio;
		else {
			/* auto-priority: prefer the local node's device */
			if (swap_node(p) == i)
				p->avail_lists[i].prio = 1;
			else
				p->avail_lists[i].prio = -p->prio;
		}
	}
	p->swap_map = swap_map;
	p->cluster_info = cluster_info;
	p->flags |= SWP_WRITEOK;
	atomic_long_add(p->pages, &nr_swap_pages);
	total_swap_pages += p->pages;

	assert_spin_locked(&swap_lock);
	/*
	 * both lists are plists, and thus priority ordered.
	 * swap_active_head needs to be priority ordered for swapoff(),
	 * which on removal of any swap_info_struct with an auto-assigned
	 * (i.e. negative) priority increments the auto-assigned priority
	 * of any lower-priority swap_info_structs.
	 * swap_avail_head needs to be priority ordered for get_swap_page(),
	 * which allocates an entry from the highest priority available
	 * swap_info_struct.
	 */
	plist_add(&p->list, &swap_active_head);
	add_to_avail_list(p);
}
2383
/* Enable a freshly-set-up swap area (swapon path). */
static void enable_swap_info(struct swap_info_struct *p, int prio,
				unsigned char *swap_map,
				struct swap_cluster_info *cluster_info,
				unsigned long *frontswap_map)
{
	frontswap_init(p->type, frontswap_map);
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	_enable_swap_info(p, prio, swap_map, cluster_info);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
}
2396
/* Re-enable a swap area after a failed swapoff, keeping its old state. */
static void reinsert_swap_info(struct swap_info_struct *p)
{
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	_enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
}
2405
2406bool has_usable_swap(void)
2407{
2408 bool ret = true;
2409
2410 spin_lock(&swap_lock);
2411 if (plist_head_empty(&swap_active_head))
2412 ret = false;
2413 spin_unlock(&swap_lock);
2414 return ret;
2415}
2416
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
	struct swap_info_struct *p = NULL;
	unsigned char *swap_map;
	struct swap_cluster_info *cluster_info;
	unsigned long *frontswap_map;
	struct file *swap_file, *victim;
	struct address_space *mapping;
	struct inode *inode;
	struct filename *pathname;
	int err, found = 0;
	unsigned int old_block_size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	BUG_ON(!current->mm);

	pathname = getname(specialfile);
	if (IS_ERR(pathname))
		return PTR_ERR(pathname);

	victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
	err = PTR_ERR(victim);
	if (IS_ERR(victim))
		goto out;

	/* find the active swap area backed by this file/device */
	mapping = victim->f_mapping;
	spin_lock(&swap_lock);
	plist_for_each_entry(p, &swap_active_head, list) {
		if (p->flags & SWP_WRITEOK) {
			if (p->swap_file->f_mapping == mapping) {
				found = 1;
				break;
			}
		}
	}
	if (!found) {
		err = -EINVAL;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	/* the pages being removed must fit back into RAM + remaining swap */
	if (!security_vm_enough_memory_mm(current->mm, p->pages))
		vm_unacct_memory(p->pages);
	else {
		err = -ENOMEM;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	del_from_avail_list(p);
	spin_lock(&p->lock);
	if (p->prio < 0) {
		/*
		 * Removing an auto-priority area: bump the auto-assigned
		 * priority of every lower-priority area back up by one.
		 */
		struct swap_info_struct *si = p;
		int nid;

		plist_for_each_entry_continue(si, &swap_active_head, list) {
			si->prio++;
			si->list.prio--;
			for_each_node(nid) {
				if (si->avail_lists[nid].prio != 1)
					si->avail_lists[nid].prio--;
			}
		}
		least_priority++;
	}
	plist_del(&p->list, &swap_active_head);
	atomic_long_sub(p->pages, &nr_swap_pages);
	total_swap_pages -= p->pages;
	p->flags &= ~SWP_WRITEOK;
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);

	disable_swap_slots_cache_lock();

	/* mark us as preferred OOM victim while pulling pages back in */
	set_current_oom_origin();
	err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
	clear_current_oom_origin();

	if (err) {
		/* re-insert swap space back into swap_list */
		reinsert_swap_info(p);
		reenable_swap_slots_cache_unlock();
		goto out_dput;
	}

	reenable_swap_slots_cache_unlock();

	flush_work(&p->discard_work);

	destroy_swap_extents(p);
	if (p->flags & SWP_CONTINUED)
		free_swap_count_continuations(p);

	if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
		atomic_dec(&nr_rotate_swap);

	mutex_lock(&swapon_mutex);
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	drain_mmlist();

	/* wait for anyone still in scanning or swapping */
	p->highest_bit = 0;		/* cuts scans short */
	while (p->flags >= SWP_SCANNING) {
		spin_unlock(&p->lock);
		spin_unlock(&swap_lock);
		schedule_timeout_uninterruptible(1);
		spin_lock(&swap_lock);
		spin_lock(&p->lock);
	}

	swap_file = p->swap_file;
	old_block_size = p->old_block_size;
	p->swap_file = NULL;
	p->max = 0;
	swap_map = p->swap_map;
	p->swap_map = NULL;
	cluster_info = p->cluster_info;
	p->cluster_info = NULL;
	frontswap_map = frontswap_map_get(p);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
	frontswap_invalidate_area(p->type);
	frontswap_map_set(p, NULL);
	mutex_unlock(&swapon_mutex);
	free_percpu(p->percpu_cluster);
	p->percpu_cluster = NULL;
	vfree(swap_map);
	kvfree(cluster_info);
	kvfree(frontswap_map);
	/* Destroy swap account information */
	swap_cgroup_swapoff(p->type);
	exit_swap_address_space(p->type);

	inode = mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		struct block_device *bdev = I_BDEV(inode);
		set_blocksize(bdev, old_block_size);
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	} else {
		inode_lock(inode);
		inode->i_flags &= ~S_SWAPFILE;
		inode_unlock(inode);
	}
	filp_close(swap_file, NULL);

	/*
	 * Clear the SWP_USED flag after all resources are freed so that
	 * swapon can reuse this swap_info in alloc_swap_info() safely.  It
	 * is ok to not hold p->lock after we cleared its SWP_WRITEOK flag,
	 * because in swapoff there's no concurrent swap operation.
	 */
	spin_lock(&swap_lock);
	p->flags = 0;
	spin_unlock(&swap_lock);

	err = 0;
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

out_dput:
	filp_close(victim, NULL);
out:
	putname(pathname);
	return err;
}
2582
2583#ifdef CONFIG_PROC_FS
/* poll() on /proc/swaps: report EPOLLERR|EPOLLPRI when areas changed. */
static __poll_t swaps_poll(struct file *file, poll_table *wait)
{
	struct seq_file *seq = file->private_data;

	poll_wait(file, &proc_poll_wait, wait);

	if (seq->poll_event != atomic_read(&proc_poll_event)) {
		seq->poll_event = atomic_read(&proc_poll_event);
		return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
	}

	return EPOLLIN | EPOLLRDNORM;
}
2597
2598
/* seq_file iterator: position *pos within the active swap areas. */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
	struct swap_info_struct *si;
	int type;
	loff_t l = *pos;

	mutex_lock(&swapon_mutex);

	if (!l)
		return SEQ_START_TOKEN;

	for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
		if (!(si->flags & SWP_USED) || !si->swap_map)
			continue;
		if (!--l)
			return si;
	}

	return NULL;
}
2619
/* seq_file iterator: advance to the next in-use swap area. */
static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
	struct swap_info_struct *si = v;
	int type;

	if (v == SEQ_START_TOKEN)
		type = 0;
	else
		type = si->type + 1;

	for (; (si = swap_type_to_swap_info(type)); type++) {
		if (!(si->flags & SWP_USED) || !si->swap_map)
			continue;
		++*pos;
		return si;
	}

	return NULL;
}
2639
/* seq_file iterator teardown: release the mutex taken in swap_start(). */
static void swap_stop(struct seq_file *swap, void *v)
{
	mutex_unlock(&swapon_mutex);
}
2644
/* Emit one /proc/swaps line (or the header for the start token). */
static int swap_show(struct seq_file *swap, void *v)
{
	struct swap_info_struct *si = v;
	struct file *file;
	int len;

	if (si == SEQ_START_TOKEN) {
		seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
		return 0;
	}

	file = si->swap_file;
	len = seq_file_path(swap, file, " \t\n\\");
	/* sizes are reported in KiB, hence the PAGE_SHIFT - 10 */
	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
			len < 40 ? 40 - len : 1, " ",
			S_ISBLK(file_inode(file)->i_mode) ?
				"partition" : "file\t",
			si->pages << (PAGE_SHIFT - 10),
			si->inuse_pages << (PAGE_SHIFT - 10),
			si->prio);
	return 0;
}
2667
/* seq_file operations backing /proc/swaps */
static const struct seq_operations swaps_op = {
	.start =	swap_start,
	.next =		swap_next,
	.stop =		swap_stop,
	.show =		swap_show
};
2674
/* open() for /proc/swaps: set up seq_file and snapshot the poll event. */
static int swaps_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int ret;

	ret = seq_open(file, &swaps_op);
	if (ret)
		return ret;

	seq = file->private_data;
	seq->poll_event = atomic_read(&proc_poll_event);
	return 0;
}
2688
/* file operations for /proc/swaps */
static const struct file_operations proc_swaps_operations = {
	.open		= swaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
	.poll		= swaps_poll,
};
2696
/* Register /proc/swaps at boot. */
static int __init procswaps_init(void)
{
	proc_create("swaps", 0, NULL, &proc_swaps_operations);
	return 0;
}
__initcall(procswaps_init);
2703#endif
2704
2705#ifdef MAX_SWAPFILES_CHECK
/* Arch-provided build/boot-time sanity check on MAX_SWAPFILES. */
static int __init max_swapfiles_check(void)
{
	MAX_SWAPFILES_CHECK();
	return 0;
}
late_initcall(max_swapfiles_check);
2712#endif
2713
/*
 * Allocate (or reuse a previously swapped-off) swap_info_struct and
 * reserve a swap type slot for it.  Returns ERR_PTR on failure.
 */
static struct swap_info_struct *alloc_swap_info(void)
{
	struct swap_info_struct *p;
	unsigned int type;
	int i;

	p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
	if (!p)
		return ERR_PTR(-ENOMEM);

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		if (!(swap_info[type]->flags & SWP_USED))
			break;
	}
	if (type >= MAX_SWAPFILES) {
		spin_unlock(&swap_lock);
		kvfree(p);
		return ERR_PTR(-EPERM);
	}
	if (type >= nr_swapfiles) {
		p->type = type;
		WRITE_ONCE(swap_info[type], p);
		/*
		 * Write swap_info[type] before nr_swapfiles, in case a
		 * racing procfs swap_start() or swap_next() is reading them.
		 * (We never shrink nr_swapfiles, we never free this entry.)
		 */
		smp_wmb();
		WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
	} else {
		kvfree(p);
		p = swap_info[type];
		/*
		 * Do not memset this entry: a racing procfs swap_next()
		 * would be relying on p->type to remain valid.
		 */
	}
	INIT_LIST_HEAD(&p->first_swap_extent.list);
	plist_node_init(&p->list, 0);
	for_each_node(i)
		plist_node_init(&p->avail_lists[i], 0);
	p->flags = SWP_USED;
	spin_unlock(&swap_lock);
	spin_lock_init(&p->lock);
	spin_lock_init(&p->cont_lock);

	return p;
}
2763
/*
 * Claim the backing store for a new swap area: exclusively open a block
 * device and set its block size, or mark a regular file's inode.
 * NOTE: for S_ISREG the inode is returned locked; the caller unlocks it.
 */
static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
{
	int error;

	if (S_ISBLK(inode->i_mode)) {
		p->bdev = bdgrab(I_BDEV(inode));
		error = blkdev_get(p->bdev,
				   FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
		if (error < 0) {
			p->bdev = NULL;
			return error;
		}
		p->old_block_size = block_size(p->bdev);
		error = set_blocksize(p->bdev, PAGE_SIZE);
		if (error < 0)
			return error;
		p->flags |= SWP_BLKDEV;
	} else if (S_ISREG(inode->i_mode)) {
		p->bdev = inode->i_sb->s_bdev;
		inode_lock(inode);
		if (IS_SWAPFILE(inode))
			return -EBUSY;
	} else
		return -EINVAL;

	return 0;
}
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
/*
 * Maximum number of pages a swap device may hold, as limited by the
 * number of offset bits that survive a swp_entry <-> swap pte
 * round-trip on this architecture.
 */
unsigned long generic_max_swapfile_size(void)
{
	return swp_offset(pte_to_swp_entry(
			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
}
2814
2815
/* Can be overridden by an architecture for additional checks. */
__weak unsigned long max_swapfile_size(void)
{
	return generic_max_swapfile_size();
}
2820
/*
 * Validate the on-disk swap header and work out how many pages this
 * swap area may use.
 *
 * Checks the "SWAPSPACE2" signature, byte-swaps a foreign-endian
 * version-1 header in place, clamps the size to max_swapfile_size()
 * and to the file's actual length, and initialises p->lowest_bit,
 * p->cluster_next, p->cluster_nr and p->highest_bit.
 *
 * Returns the number of usable pages (maxpages), or 0 on any
 * validation failure.
 */
static unsigned long read_swap_header(struct swap_info_struct *p,
					union swap_header *swap_header,
					struct inode *inode)
{
	int i;
	unsigned long maxpages;
	unsigned long swapfilepages;
	unsigned long last_page;

	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
		pr_err("Unable to find swap-space signature\n");
		return 0;
	}

	/* swap partition endianness hack: byte-swap a foreign-endian header */
	if (swab32(swap_header->info.version) == 1) {
		swab32s(&swap_header->info.version);
		swab32s(&swap_header->info.last_page);
		swab32s(&swap_header->info.nr_badpages);
		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
			return 0;
		for (i = 0; i < swap_header->info.nr_badpages; i++)
			swab32s(&swap_header->info.badpages[i]);
	}

	/* Check the swap header's sub-version */
	if (swap_header->info.version != 1) {
		pr_warn("Unable to handle swap header version %d\n",
			swap_header->info.version);
		return 0;
	}

	p->lowest_bit  = 1;
	p->cluster_next = 1;
	p->cluster_nr = 0;

	maxpages = max_swapfile_size();
	last_page = swap_header->info.last_page;
	if (!last_page) {
		pr_warn("Empty swap-file\n");
		return 0;
	}
	if (last_page > maxpages) {
		pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
			maxpages << (PAGE_SHIFT - 10),
			last_page << (PAGE_SHIFT - 10));
	}
	if (maxpages > last_page) {
		maxpages = last_page + 1;
		/* p->max is an unsigned int: don't overflow it */
		if ((unsigned int)maxpages == 0)
			maxpages = UINT_MAX;
	}
	p->highest_bit = maxpages - 1;

	if (!maxpages)
		return 0;
	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
	if (swapfilepages && maxpages > swapfilepages) {
		pr_warn("Swap area shorter than signature indicates\n");
		return 0;
	}
	/* bad pages in a regular swapfile would punch holes in extents */
	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
		return 0;
	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
		return 0;

	return maxpages;
}
2889
/*
 * Column counts used by setup_swap_map_and_extents() to interleave the
 * initial free-cluster list, so that concurrently-allocating CPUs are
 * spread across cache lines of cluster_info[] and across swap address
 * spaces (see SWAP_ADDRESS_SPACE_PAGES).
 */
#define SWAP_CLUSTER_INFO_COLS						\
	DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
#define SWAP_CLUSTER_SPACE_COLS						\
	DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
#define SWAP_CLUSTER_COLS						\
	max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
2896
/*
 * Populate the swap_map (marking header page and bad pages SWAP_MAP_BAD),
 * build the on-disk extent list, and seed the free-cluster list.
 *
 * Returns the number of extents on success, or a negative errno
 * (-EINVAL for a corrupt bad-page list or an effectively empty area).
 */
static int setup_swap_map_and_extents(struct swap_info_struct *p,
					union swap_header *swap_header,
					unsigned char *swap_map,
					struct swap_cluster_info *cluster_info,
					unsigned long maxpages,
					sector_t *span)
{
	unsigned int j, k;
	unsigned int nr_good_pages;
	int nr_extents;
	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
	unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
	unsigned long i, idx;

	nr_good_pages = maxpages - 1;	/* omit header page */

	cluster_list_init(&p->free_clusters);
	cluster_list_init(&p->discard_clusters);

	for (i = 0; i < swap_header->info.nr_badpages; i++) {
		unsigned int page_nr = swap_header->info.badpages[i];
		if (page_nr == 0 || page_nr > swap_header->info.last_page)
			return -EINVAL;
		if (page_nr < maxpages) {
			swap_map[page_nr] = SWAP_MAP_BAD;
			nr_good_pages--;
			/*
			 * Haven't marked the cluster free yet, no list
			 * operation involved
			 */
			inc_cluster_info_page(p, cluster_info, page_nr);
		}
	}

	/* Mark the pages in the unusable tail of the last cluster as in use */
	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
		inc_cluster_info_page(p, cluster_info, i);

	if (nr_good_pages) {
		swap_map[0] = SWAP_MAP_BAD;
		/*
		 * Not mark the cluster free yet, no list
		 * operation involved
		 */
		inc_cluster_info_page(p, cluster_info, 0);
		p->max = maxpages;
		p->pages = nr_good_pages;
		nr_extents = setup_swap_extents(p, span);
		if (nr_extents < 0)
			return nr_extents;
		/* setup_swap_extents() may have trimmed p->pages */
		nr_good_pages = p->pages;
	}
	if (!nr_good_pages) {
		pr_warn("Empty swap-file\n");
		return -EINVAL;
	}

	if (!cluster_info)
		return nr_extents;


	/*
	 * Reduce false cache line sharing between cluster_info and
	 * sharing same address space.  Enqueue the free clusters
	 * column-interleaved rather than sequentially.
	 */
	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
		j = (k + col) % SWAP_CLUSTER_COLS;
		for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
			idx = i * SWAP_CLUSTER_COLS + j;
			if (idx >= nr_clusters)
				continue;
			if (cluster_count(&cluster_info[idx]))
				continue;
			cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
			cluster_list_add_tail(&p->free_clusters, cluster_info,
					      idx);
		}
	}
	return nr_extents;
}
2977
2978
2979
2980
2981
2982static bool swap_discardable(struct swap_info_struct *si)
2983{
2984 struct request_queue *q = bdev_get_queue(si->bdev);
2985
2986 if (!q || !blk_queue_discard(q))
2987 return false;
2988
2989 return true;
2990}
2991
/*
 * swapon(2) -- enable swapping on the named file or block device.
 *
 * Allocates a swap_info_struct, claims the backing store, validates the
 * swap header, builds the swap map / cluster info / extents, then
 * activates the area under swapon_mutex.  All failures funnel through
 * the bad_swap label, which tears down everything acquired so far.
 */
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
	struct swap_info_struct *p;
	struct filename *name;
	struct file *swap_file = NULL;
	struct address_space *mapping;
	int prio;
	int error;
	union swap_header *swap_header;
	int nr_extents;
	sector_t span;
	unsigned long maxpages;
	unsigned char *swap_map = NULL;
	struct swap_cluster_info *cluster_info = NULL;
	unsigned long *frontswap_map = NULL;
	struct page *page = NULL;
	struct inode *inode = NULL;
	bool inced_nr_rotate_swap = false;

	if (swap_flags & ~SWAP_FLAGS_VALID)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* swapfile_init() failed at boot: no per-node avail lists */
	if (!swap_avail_heads)
		return -ENOMEM;

	p = alloc_swap_info();
	if (IS_ERR(p))
		return PTR_ERR(p);

	INIT_WORK(&p->discard_work, swap_discard_work);

	name = getname(specialfile);
	if (IS_ERR(name)) {
		error = PTR_ERR(name);
		name = NULL;
		goto bad_swap;
	}
	swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
	if (IS_ERR(swap_file)) {
		error = PTR_ERR(swap_file);
		swap_file = NULL;
		goto bad_swap;
	}

	p->swap_file = swap_file;
	mapping = swap_file->f_mapping;
	inode = mapping->host;

	/* If S_ISREG(inode->i_mode) will do inode_lock(inode); */
	error = claim_swapfile(p, inode);
	if (unlikely(error))
		goto bad_swap;

	/*
	 * Read the swap header via the page cache; the mapping must be
	 * able to read pages for that to work.
	 */
	if (!mapping->a_ops->readpage) {
		error = -EINVAL;
		goto bad_swap;
	}
	page = read_mapping_page(mapping, 0, swap_file);
	if (IS_ERR(page)) {
		error = PTR_ERR(page);
		goto bad_swap;
	}
	swap_header = kmap(page);

	maxpages = read_swap_header(p, swap_header, inode);
	if (unlikely(!maxpages)) {
		error = -EINVAL;
		goto bad_swap;
	}

	/* OK, set up the swap map and apply the bad block list */
	swap_map = vzalloc(maxpages);
	if (!swap_map) {
		error = -ENOMEM;
		goto bad_swap;
	}

	if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
		p->flags |= SWP_STABLE_WRITES;

	if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
		p->flags |= SWP_SYNCHRONOUS_IO;

	if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
		int cpu;
		unsigned long ci, nr_cluster;

		p->flags |= SWP_SOLIDSTATE;
		/*
		 * select a random position to start with to help wear leveling
		 * SSD
		 */
		p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
		nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);

		cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
					GFP_KERNEL);
		if (!cluster_info) {
			error = -ENOMEM;
			goto bad_swap;
		}

		for (ci = 0; ci < nr_cluster; ci++)
			spin_lock_init(&((cluster_info + ci)->lock));

		p->percpu_cluster = alloc_percpu(struct percpu_cluster);
		if (!p->percpu_cluster) {
			error = -ENOMEM;
			goto bad_swap;
		}
		for_each_possible_cpu(cpu) {
			struct percpu_cluster *cluster;
			cluster = per_cpu_ptr(p->percpu_cluster, cpu);
			cluster_set_null(&cluster->index);
		}
	} else {
		/* rotational device: no per-cpu clusters, count it */
		atomic_inc(&nr_rotate_swap);
		inced_nr_rotate_swap = true;
	}

	error = swap_cgroup_swapon(p->type, maxpages);
	if (error)
		goto bad_swap;

	nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
		cluster_info, maxpages, &span);
	if (unlikely(nr_extents < 0)) {
		error = nr_extents;
		goto bad_swap;
	}
	/* frontswap enabled? set up bit-per-page map for frontswap */
	if (IS_ENABLED(CONFIG_FRONTSWAP))
		frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
					 sizeof(long),
					 GFP_KERNEL);

	if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
		/*
		 * When discard is enabled for swap with no particular
		 * policy flagged, we set all swap discard flags here in
		 * order to sustain backward compatibility with older
		 * swapon(8) releases.
		 */
		p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
			     SWP_PAGE_DISCARD);

		/*
		 * By flagging sys_swapon, a sysadmin can tell us to
		 * either do single-time area discards only, or to just
		 * perform discards for released swap page-clusters.
		 * Now it's time to adjust the p->flags accordingly.
		 */
		if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
			p->flags &= ~SWP_PAGE_DISCARD;
		else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
			p->flags &= ~SWP_AREA_DISCARD;

		/* issue a swapon-time discard if it's still required */
		if (p->flags & SWP_AREA_DISCARD) {
			int err = discard_swap(p);
			if (unlikely(err))
				pr_err("swapon: discard_swap(%p): %d\n",
					p, err);
		}
	}

	error = init_swap_address_space(p->type, maxpages);
	if (error)
		goto bad_swap;

	mutex_lock(&swapon_mutex);
	prio = -1;
	if (swap_flags & SWAP_FLAG_PREFER)
		prio =
		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
	enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);

	pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
		p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
		(p->flags & SWP_DISCARDABLE) ? "D" : "",
		(p->flags & SWP_AREA_DISCARD) ? "s" : "",
		(p->flags & SWP_PAGE_DISCARD) ? "c" : "",
		(frontswap_map) ? "FS" : "");

	mutex_unlock(&swapon_mutex);
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

	if (S_ISREG(inode->i_mode))
		inode->i_flags |= S_SWAPFILE;
	error = 0;
	goto out;
bad_swap:
	/* undo everything acquired so far, in reverse order */
	free_percpu(p->percpu_cluster);
	p->percpu_cluster = NULL;
	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
		set_blocksize(p->bdev, p->old_block_size);
		blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	}
	destroy_swap_extents(p);
	swap_cgroup_swapoff(p->type);
	spin_lock(&swap_lock);
	p->swap_file = NULL;
	p->flags = 0;
	spin_unlock(&swap_lock);
	vfree(swap_map);
	kvfree(cluster_info);
	kvfree(frontswap_map);
	if (inced_nr_rotate_swap)
		atomic_dec(&nr_rotate_swap);
	if (swap_file) {
		if (inode && S_ISREG(inode->i_mode)) {
			/* unlock here and NULL inode so out: doesn't unlock twice */
			inode_unlock(inode);
			inode = NULL;
		}
		filp_close(swap_file, NULL);
	}
out:
	if (page && !IS_ERR(page)) {
		kunmap(page);
		put_page(page);
	}
	if (name)
		putname(name);
	/* drop the lock taken by claim_swapfile() on the success path */
	if (inode && S_ISREG(inode->i_mode))
		inode_unlock(inode);
	if (!error)
		enable_swap_slots_cache();
	return error;
}
3230
3231void si_swapinfo(struct sysinfo *val)
3232{
3233 unsigned int type;
3234 unsigned long nr_to_be_unused = 0;
3235
3236 spin_lock(&swap_lock);
3237 for (type = 0; type < nr_swapfiles; type++) {
3238 struct swap_info_struct *si = swap_info[type];
3239
3240 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
3241 nr_to_be_unused += si->inuse_pages;
3242 }
3243 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
3244 val->totalswap = total_swap_pages + nr_to_be_unused;
3245 spin_unlock(&swap_lock);
3246}
3247
/*
 * Verify that a swap entry is valid and increment its swap map count.
 *
 * Returns error code in following case.
 * - success -> 0
 * - swp_entry is invalid -> EINVAL
 * - swap-cache reference is requested but there is already one. -> EEXIST
 * - swap-cache reference is requested but the entry is not used. -> ENOENT
 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
 */
static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	unsigned long offset;
	unsigned char count;
	unsigned char has_cache;
	int err = -EINVAL;

	if (non_swap_entry(entry))
		goto out;

	p = swp_swap_info(entry);
	if (!p)
		goto bad_file;

	offset = swp_offset(entry);
	if (unlikely(offset >= p->max))
		goto out;

	ci = lock_cluster_or_swap_info(p, offset);

	count = p->swap_map[offset];

	/*
	 * swapin_readahead() doesn't check if a swap entry is valid, so the
	 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
	 */
	if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
		err = -ENOENT;
		goto unlock_out;
	}

	/* split the byte into its cache bit and its reference count */
	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;
	err = 0;

	if (usage == SWAP_HAS_CACHE) {

		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
		if (!has_cache && count)
			has_cache = SWAP_HAS_CACHE;
		else if (has_cache)		/* someone else added cache */
			err = -EEXIST;
		else				/* no users remaining */
			err = -ENOENT;

	} else if (count || has_cache) {
		/* bump the map count, chaining into continuations at the max */
		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
			count += usage;
		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
			err = -EINVAL;
		else if (swap_count_continued(p, offset, count))
			count = COUNT_CONTINUED;
		else
			err = -ENOMEM;
	} else
		err = -ENOENT;			/* unused swap entry */

	p->swap_map[offset] = count | has_cache;

unlock_out:
	unlock_cluster_or_swap_info(p, ci);
out:
	return err;

bad_file:
	pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
	goto out;
}
3330
/*
 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
 * (in which case its reference count is never incremented).
 */
void swap_shmem_alloc(swp_entry_t entry)
{
	__swap_duplicate(entry, SWAP_MAP_SHMEM);
}
3339
3340
3341
3342
3343
3344
3345
3346
3347int swap_duplicate(swp_entry_t entry)
3348{
3349 int err = 0;
3350
3351 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
3352 err = add_swap_count_continuation(entry, GFP_ATOMIC);
3353 return err;
3354}
3355
/*
 * @entry: swap entry for which we allocate swap cache.
 *
 * Called when allocating swap cache for existing swap entry,
 * This can return error codes. Returns 0 at success.
 * -EEXIST means there is a swap cache.
 * Note: return code is different from swap_duplicate().
 */
int swapcache_prepare(swp_entry_t entry)
{
	return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
3368
/* Look up the swap_info_struct backing a swap entry's type (may be NULL). */
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
	return swap_type_to_swap_info(swp_type(entry));
}
3373
/*
 * Return the swap_info_struct for a page in the swap cache; the swap
 * entry is stored in the page's private field.
 */
struct swap_info_struct *page_swap_info(struct page *page)
{
	swp_entry_t entry = { .val = page_private(page) };
	return swp_swap_info(entry);
}
3379
/*
 * out-of-line __page_file_ helpers to avoid include hell.
 */
struct address_space *__page_file_mapping(struct page *page)
{
	return page_swap_info(page)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(__page_file_mapping);
3388
/* Return the swap offset of a swap-cache page (its index in the swap file). */
pgoff_t __page_file_index(struct page *page)
{
	swp_entry_t swap = { .val = page_private(page) };
	return swp_offset(swap);
}
EXPORT_SYMBOL_GPL(__page_file_index);
3395
/*
 * add_swap_count_continuation - called when a swap count is duplicated
 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
 * page of the original vmalloc'ed swap_map, to hold the continuation count
 * (for that entry and for its neighbouring PAGE_SIZE swap entries).
 *
 * These continuation pages are seldom referenced: the common paths all work
 * on the original swap_map, only referring to a continuation page when the
 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
 *
 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
 * can be called after dropping locks.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	struct page *head;
	struct page *page;
	struct page *list_page;
	pgoff_t offset;
	unsigned char count;

	/*
	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
	 * to hold the lock for as little as possible: allocate the page
	 * outside the locks, and only check the result once they're held.
	 */
	page = alloc_page(gfp_mask | __GFP_HIGHMEM);

	si = swap_info_get(entry);
	if (!si) {
		/*
		 * An acceptable race has occurred since the failing
		 * __swap_duplicate(): the swap device may be swapoff'ed.
		 * Nothing to do but free the page we allocated.
		 */
		goto outer;
	}

	offset = swp_offset(entry);

	ci = lock_cluster(si, offset);

	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;

	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
		/*
		 * The higher the swap count, the more likely it is that tasks
		 * will race to add swap count continuation: we need to avoid
		 * over-provisioning.  Someone else has already handled it.
		 */
		goto out;
	}

	if (!page) {
		unlock_cluster(ci);
		spin_unlock(&si->lock);
		return -ENOMEM;
	}

	/*
	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
	 * no architecture is using highmem pages for kernel page tables: so it
	 * won't corrupt the GFP_ATOMIC caller's atomic page table kmaps.
	 */
	head = vmalloc_to_page(si->swap_map + offset);
	offset &= ~PAGE_MASK;

	spin_lock(&si->cont_lock);
	/*
	 * Page allocation does not initialize the page's lru field,
	 * but it does always reset its private field.
	 */
	if (!page_private(head)) {
		BUG_ON(count & COUNT_CONTINUED);
		INIT_LIST_HEAD(&head->lru);
		set_page_private(head, SWP_CONTINUED);
		si->flags |= SWP_CONTINUED;
	}

	list_for_each_entry(list_page, &head->lru, lru) {
		unsigned char *map;

		/*
		 * If the previous map said no continuation, but we've found
		 * a continuation page, free our allocation and use this one.
		 */
		if (!(count & COUNT_CONTINUED))
			goto out_unlock_cont;

		map = kmap_atomic(list_page) + offset;
		count = *map;
		kunmap_atomic(map);

		/*
		 * If this continuation count now has some space in it,
		 * free our allocation and use this one.
		 */
		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
			goto out_unlock_cont;
	}

	/* all existing continuations full: append our new page */
	list_add_tail(&page->lru, &head->lru);
	page = NULL;			/* now on list, don't free it */
out_unlock_cont:
	spin_unlock(&si->cont_lock);
out:
	unlock_cluster(ci);
	spin_unlock(&si->lock);
outer:
	if (page)
		__free_page(page);
	return 0;
}
3512
/*
 * swap_count_continued - when the original swap_map count is incremented
 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
 * into, carry if so, or else fail until a new continuation page is allocated;
 * when the original swap_map count is decremented from 0 with continuation,
 * borrow from the continuation and report whether it still holds more.
 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
 * lock.
 */
static bool swap_count_continued(struct swap_info_struct *si,
				 pgoff_t offset, unsigned char count)
{
	struct page *head;
	struct page *page;
	unsigned char *map;
	bool ret;

	head = vmalloc_to_page(si->swap_map + offset);
	if (page_private(head) != SWP_CONTINUED) {
		BUG_ON(count & COUNT_CONTINUED);
		return false;		/* need to add count continuation */
	}

	spin_lock(&si->cont_lock);
	offset &= ~PAGE_MASK;
	page = list_entry(head->lru.next, struct page, lru);
	map = kmap_atomic(page) + offset;

	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
		goto init_map;		/* jump over SWAP_CONT_MAX checks */

	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
		/*
		 * Think of how you add 1 to 999: the carry ripples through
		 * the full "digits" until one has room.
		 */
		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		if (*map == SWAP_CONT_MAX) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			if (page == head) {
				ret = false;	/* add count continuation */
				goto out;
			}
			map = kmap_atomic(page) + offset;
init_map:		*map = 0;		/* we didn't zero the page */
		}
		*map += 1;
		kunmap_atomic(map);
		page = list_entry(page->lru.prev, struct page, lru);
		/* mark all lower "digits" as carried through */
		while (page != head) {
			map = kmap_atomic(page) + offset;
			*map = COUNT_CONTINUED;
			kunmap_atomic(map);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		ret = true;			/* incremented */

	} else {				/* decrementing */
		/*
		 * Think of how you subtract 1 from 1000: the borrow ripples
		 * through the COUNT_CONTINUED "digits" to the first nonzero.
		 */
		BUG_ON(count != COUNT_CONTINUED);
		while (*map == COUNT_CONTINUED) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		BUG_ON(*map == 0);
		*map -= 1;
		if (*map == 0)
			count = 0;
		kunmap_atomic(map);
		page = list_entry(page->lru.prev, struct page, lru);
		/* refill the lower "digits" after the borrow */
		while (page != head) {
			map = kmap_atomic(page) + offset;
			*map = SWAP_CONT_MAX | count;
			count = COUNT_CONTINUED;
			kunmap_atomic(map);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		ret = count == COUNT_CONTINUED;
	}
out:
	spin_unlock(&si->cont_lock);
	return ret;
}
3605
/*
 * free_swap_count_continuations - swapoff free all the continuation pages
 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
 */
static void free_swap_count_continuations(struct swap_info_struct *si)
{
	pgoff_t offset;

	/* one continuation list per PAGE_SIZE chunk of the swap_map */
	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
		struct page *head;
		head = vmalloc_to_page(si->swap_map + offset);
		if (page_private(head)) {
			struct page *page, *next;

			list_for_each_entry_safe(page, next, &head->lru, lru) {
				list_del(&page->lru);
				__free_page(page);
			}
		}
	}
}
3627
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
/*
 * Throttle a memcg task that is swapping while its blk cgroup is
 * congested, by scheduling it against the queue of the first available
 * swap device on this node.  No-op for non-IO allocations, tasks
 * outside a memcg, uncongested cgroups, or tasks already throttled.
 */
void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
				  gfp_t gfp_mask)
{
	struct swap_info_struct *si, *next;
	if (!(gfp_mask & __GFP_IO) || !memcg)
		return;

	if (!blk_cgroup_congested())
		return;

	/*
	 * We've already scheduled a throttle, avoid taking the global swap
	 * lock.
	 */
	if (current->throttle_queue)
		return;

	spin_lock(&swap_avail_lock);
	plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
				  avail_lists[node]) {
		if (si->bdev) {
			blkcg_schedule_throttle(bdev_get_queue(si->bdev),
						true);
			break;
		}
	}
	spin_unlock(&swap_avail_lock);
}
#endif
3658
3659static int __init swapfile_init(void)
3660{
3661 int nid;
3662
3663 swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
3664 GFP_KERNEL);
3665 if (!swap_avail_heads) {
3666 pr_emerg("Not enough memory for swap heads, swap is disabled\n");
3667 return -ENOMEM;
3668 }
3669
3670 for_each_node(nid)
3671 plist_head_init(&swap_avail_heads[nid]);
3672
3673 return 0;
3674}
3675subsys_initcall(swapfile_init);
3676