1
2
3
4
5
6
7
8
9#include <linux/blkdev.h>
10#include <linux/mm.h>
11#include <linux/sched/mm.h>
12#include <linux/sched/task.h>
13#include <linux/hugetlb.h>
14#include <linux/mman.h>
15#include <linux/slab.h>
16#include <linux/kernel_stat.h>
17#include <linux/swap.h>
18#include <linux/vmalloc.h>
19#include <linux/pagemap.h>
20#include <linux/namei.h>
21#include <linux/shmem_fs.h>
22#include <linux/blk-cgroup.h>
23#include <linux/random.h>
24#include <linux/writeback.h>
25#include <linux/proc_fs.h>
26#include <linux/seq_file.h>
27#include <linux/init.h>
28#include <linux/ksm.h>
29#include <linux/rmap.h>
30#include <linux/security.h>
31#include <linux/backing-dev.h>
32#include <linux/mutex.h>
33#include <linux/capability.h>
34#include <linux/syscalls.h>
35#include <linux/memcontrol.h>
36#include <linux/poll.h>
37#include <linux/oom.h>
38#include <linux/swapfile.h>
39#include <linux/export.h>
40#include <linux/sort.h>
41#include <linux/completion.h>
42#include <linux/suspend.h>
43#include <linux/zswap.h>
44#include <linux/plist.h>
45
46#include <asm/tlbflush.h>
47#include <linux/swapops.h>
48#include <linux/swap_cgroup.h>
49#include "internal.h"
50#include "swap.h"
51
/* Forward declarations for helpers defined later in this file. */
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static void swap_entries_free(struct swap_info_struct *si,
			      struct swap_cluster_info *ci,
			      swp_entry_t entry, unsigned int nr_pages);
static void swap_range_alloc(struct swap_info_struct *si,
			     unsigned int nr_entries);
static bool folio_swapcache_freeable(struct folio *folio);
static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
					      unsigned long offset);
static inline void unlock_cluster(struct swap_cluster_info *ci);
64
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
 * Some modules use swappable objects and may try to swap them out under
 * memory pressure (via the shrinker). Before doing so, they may wish to
 * check whether any swap space is available.
 */
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority = -1;
unsigned long swapfile_maximum_size;
#ifdef CONFIG_MIGRATION
bool swap_migration_ad_supported;
#endif

static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/*
 * all active swap_info_structs
 * protected with swap_lock, and ordered by priority.
 */
static PLIST_HEAD(swap_active_head);

/*
 * Per-node plists of swap devices that still have free space, ordered
 * by priority. A device is taken off these lists when it fills up or
 * is swapoff'ed (see del_from_avail_list()) and re-added when space is
 * freed again (see add_to_avail_list()). Protected by swap_avail_lock.
 */
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);

static struct swap_info_struct *swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);

/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);

atomic_t nr_rotate_swap = ATOMIC_INIT(0);

/*
 * Per-CPU fast-path cache: the device and next scan offset of the most
 * recent successful allocation, one slot per allocation order.
 */
struct percpu_swap_cluster {
	struct swap_info_struct *si[SWAP_NR_ORDERS];
	unsigned long offset[SWAP_NR_ORDERS];
	local_lock_t lock;
};

static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
	.si = { NULL },
	.offset = { SWAP_ENTRY_INVALID },
	.lock = INIT_LOCAL_LOCK(),
};
129
130static struct swap_info_struct *swap_type_to_swap_info(int type)
131{
132 if (type >= MAX_SWAPFILES)
133 return NULL;
134
135 return READ_ONCE(swap_info[type]);
136}
137
138static inline unsigned char swap_count(unsigned char ent)
139{
140 return ent & ~SWAP_HAS_CACHE;
141}
142
/*
 * swap_info_struct::inuse_pages packs one extra flag bit into the usage
 * counter: SWAP_USAGE_OFFLIST_BIT is set while the device is off the
 * per-node available lists (full, or being swapoff'ed — see
 * del_from_avail_list()/add_to_avail_list()).  Keeping the flag and the
 * counter in a single atomic lets swap_usage_add()/swap_usage_sub()
 * make on-list/off-list decisions without taking a lock.
 */
#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)
/* Current number of in-use pages, with the off-list flag masked off. */
static long swap_usage_in_pages(struct swap_info_struct *si)
{
	return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
}
162
/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY		0x1
/*
 * Reclaim the swap entry if there are no more mappings of the
 * corresponding folio
 */
#define TTRS_UNMAPPED		0x2
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL		0x4
172
173static bool swap_only_has_cache(struct swap_info_struct *si,
174 unsigned long offset, int nr_pages)
175{
176 unsigned char *map = si->swap_map + offset;
177 unsigned char *map_end = map + nr_pages;
178
179 do {
180 VM_BUG_ON(!(*map & SWAP_HAS_CACHE));
181 if (*map != SWAP_HAS_CACHE)
182 return false;
183 } while (++map < map_end);
184
185 return true;
186}
187
188static bool swap_is_last_map(struct swap_info_struct *si,
189 unsigned long offset, int nr_pages, bool *has_cache)
190{
191 unsigned char *map = si->swap_map + offset;
192 unsigned char *map_end = map + nr_pages;
193 unsigned char count = *map;
194
195 if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM)
196 return false;
197
198 while (++map < map_end) {
199 if (*map != count)
200 return false;
201 }
202
203 *has_cache = !!(count & SWAP_HAS_CACHE);
204 return true;
205}
206
/*
 * Try to reclaim the swap-cache folio backing @offset, subject to the
 * TTRS_* policy in @flags.  Returns the folio size in pages (positive)
 * if it was reclaimed, the negated folio size if a folio exists but
 * could not be reclaimed, and 0 if no folio is in the swap cache.
 */
static int __try_to_reclaim_swap(struct swap_info_struct *si,
				 unsigned long offset, unsigned long flags)
{
	swp_entry_t entry = swp_entry(si->type, offset);
	struct address_space *address_space = swap_address_space(entry);
	struct swap_cluster_info *ci;
	struct folio *folio;
	int ret, nr_pages;
	bool need_reclaim;

again:
	folio = filemap_get_folio(address_space, swap_cache_index(entry));
	if (IS_ERR(folio))
		return 0;

	nr_pages = folio_nr_pages(folio);
	ret = -nr_pages;

	/*
	 * Only trylock here: this can be called from contexts that
	 * already hold other locks, so blocking on the folio lock
	 * is not acceptable.
	 */
	if (!folio_trylock(folio))
		goto out;

	/*
	 * The folio's swap entry may have changed before the lock was
	 * taken (e.g. the folio was replaced); if @offset no longer
	 * falls inside it, redo the lookup.
	 */
	entry = folio->swap;
	if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) {
		folio_unlock(folio);
		folio_put(folio);
		goto again;
	}
	offset = swp_offset(entry);

	need_reclaim = ((flags & TTRS_ANYWAY) ||
			((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
			((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)));
	if (!need_reclaim || !folio_swapcache_freeable(folio))
		goto out_unlock;

	/*
	 * It's safe to delete the folio from the swap cache only when
	 * every slot it covers is SWAP_HAS_CACHE only — checked under
	 * the cluster lock so no slot can be handed out concurrently.
	 */
	ci = lock_cluster(si, offset);
	need_reclaim = swap_only_has_cache(si, offset, nr_pages);
	unlock_cluster(ci);
	if (!need_reclaim)
		goto out_unlock;

	delete_from_swap_cache(folio);
	/* Mark dirty so the contents are rewritten if swapped out again. */
	folio_set_dirty(folio);
	ret = nr_pages;
out_unlock:
	folio_unlock(folio);
out:
	folio_put(folio);
	return ret;
}
278
279static inline struct swap_extent *first_se(struct swap_info_struct *sis)
280{
281 struct rb_node *rb = rb_first(&sis->swap_extent_root);
282 return rb_entry(rb, struct swap_extent, rb_node);
283}
284
285static inline struct swap_extent *next_se(struct swap_extent *se)
286{
287 struct rb_node *rb = rb_next(&se->rb_node);
288 return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
289}
290
/*
 * swapon tell device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static int discard_swap(struct swap_info_struct *si)
{
	struct swap_extent *se;
	sector_t start_block;
	sector_t nr_blocks;
	int err = 0;

	/* Do not discard the swap header page! */
	se = first_se(si);
	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
	if (nr_blocks) {
		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL);
		if (err)
			return err;
		cond_resched();
	}

	for (se = next_se(se); se; se = next_se(se)) {
		start_block = se->start_block << (PAGE_SHIFT - 9);
		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL);
		if (err)
			break;

		cond_resched();
	}
	return err;		/* That will often be -EOPNOTSUPP */
}
327
328static struct swap_extent *
329offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
330{
331 struct swap_extent *se;
332 struct rb_node *rb;
333
334 rb = sis->swap_extent_root.rb_node;
335 while (rb) {
336 se = rb_entry(rb, struct swap_extent, rb_node);
337 if (offset < se->start_page)
338 rb = rb->rb_left;
339 else if (offset >= se->start_page + se->nr_pages)
340 rb = rb->rb_right;
341 else
342 return se;
343 }
344
345 BUG();
346}
347
348sector_t swap_folio_sector(struct folio *folio)
349{
350 struct swap_info_struct *sis = swp_swap_info(folio->swap);
351 struct swap_extent *se;
352 sector_t sector;
353 pgoff_t offset;
354
355 offset = swp_offset(folio->swap);
356 se = offset_to_swap_extent(sis, offset);
357 sector = se->start_block + (offset - se->start_page);
358 return sector << (PAGE_SHIFT - 9);
359}
360
/*
 * swap allocation tell device that a cluster of swap can now be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static void discard_swap_cluster(struct swap_info_struct *si,
				 pgoff_t start_page, pgoff_t nr_pages)
{
	struct swap_extent *se = offset_to_swap_extent(si, start_page);

	while (nr_pages) {
		/* Clamp each discard to the extent it starts in. */
		pgoff_t offset = start_page - se->start_page;
		sector_t start_block = se->start_block + offset;
		sector_t nr_blocks = se->nr_pages - offset;

		if (nr_blocks > nr_pages)
			nr_blocks = nr_pages;
		start_page += nr_blocks;
		nr_pages -= nr_blocks;

		/* Page units -> 512-byte sectors. */
		start_block <<= PAGE_SHIFT - 9;
		nr_blocks <<= PAGE_SHIFT - 9;
		if (blkdev_issue_discard(si->bdev, start_block,
					nr_blocks, GFP_NOIO))
			break;

		se = next_se(se);
	}
}
389
#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER	HPAGE_PMD_NR

#define swap_entry_order(order)	(order)
#else
#define SWAPFILE_CLUSTER	256

/*
 * Define swap_entry_order() as constant to let compiler to optimize
 * out some code if !CONFIG_THP_SWAP
 */
#define swap_entry_order(order)	0
#endif
/* Reschedule point granularity for long scans. */
#define LATENCY_LIMIT		256
404
405static inline bool cluster_is_empty(struct swap_cluster_info *info)
406{
407 return info->count == 0;
408}
409
410static inline bool cluster_is_discard(struct swap_cluster_info *info)
411{
412 return info->flags == CLUSTER_FLAG_DISCARD;
413}
414
415static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
416{
417 if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
418 return false;
419 if (!order)
420 return true;
421 return cluster_is_empty(ci) || order == ci->order;
422}
423
424static inline unsigned int cluster_index(struct swap_info_struct *si,
425 struct swap_cluster_info *ci)
426{
427 return ci - si->cluster_info;
428}
429
430static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si,
431 unsigned long offset)
432{
433 return &si->cluster_info[offset / SWAPFILE_CLUSTER];
434}
435
436static inline unsigned int cluster_offset(struct swap_info_struct *si,
437 struct swap_cluster_info *ci)
438{
439 return cluster_index(si, ci) * SWAPFILE_CLUSTER;
440}
441
442static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
443 unsigned long offset)
444{
445 struct swap_cluster_info *ci;
446
447 ci = offset_to_cluster(si, offset);
448 spin_lock(&ci->lock);
449
450 return ci;
451}
452
453static inline void unlock_cluster(struct swap_cluster_info *ci)
454{
455 spin_unlock(&ci->lock);
456}
457
/*
 * Move a locked cluster onto @list and record @new_flags.  Clusters
 * with CLUSTER_FLAG_NONE are isolated (off-list) and are added rather
 * than moved.  The frag-cluster counter is kept in sync with FRAG
 * list membership.  Caller holds ci->lock; si->lock protects the lists.
 */
static void move_cluster(struct swap_info_struct *si,
			 struct swap_cluster_info *ci, struct list_head *list,
			 enum swap_cluster_flags new_flags)
{
	VM_WARN_ON(ci->flags == new_flags);

	/* The flags field must be wide enough for every flag value. */
	BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX);
	lockdep_assert_held(&ci->lock);

	spin_lock(&si->lock);
	if (ci->flags == CLUSTER_FLAG_NONE)
		list_add_tail(&ci->list, list);
	else
		list_move_tail(&ci->list, list);
	spin_unlock(&si->lock);

	if (ci->flags == CLUSTER_FLAG_FRAG)
		atomic_long_dec(&si->frag_cluster_nr[ci->order]);
	else if (new_flags == CLUSTER_FLAG_FRAG)
		atomic_long_inc(&si->frag_cluster_nr[ci->order]);
	ci->flags = new_flags;
}
480
/* Add a cluster to the discard list and schedule it to do discard. */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
		struct swap_cluster_info *ci)
{
	VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
	move_cluster(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD);
	schedule_work(&si->discard_work);
}
489
/* Return a fully-empty cluster to the free list and reset its order. */
static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
	lockdep_assert_held(&ci->lock);
	move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
	ci->order = 0;
}
496
/*
 * Isolate and lock the first cluster that is not contended on a list,
 * clearing its flag before it is taken off-list.  The cluster flag
 * must stay in sync with its list status, so cluster updaters can
 * always learn the list status without touching si->lock.
 *
 * Note it's possible that every cluster on a list is contended, so
 * this may return NULL for a non-empty list.
 */
static struct swap_cluster_info *isolate_lock_cluster(
		struct swap_info_struct *si, struct list_head *list)
{
	struct swap_cluster_info *ci, *ret = NULL;

	spin_lock(&si->lock);

	/* No isolation while the device is being disabled. */
	if (unlikely(!(si->flags & SWP_WRITEOK)))
		goto out;

	list_for_each_entry(ci, list, list) {
		if (!spin_trylock(&ci->lock))
			continue;

		/* An on-list cluster must carry a matching on-list flag. */
		VM_BUG_ON(!ci->flags);
		VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE &&
			  ci->flags != CLUSTER_FLAG_FULL);

		list_del(&ci->list);
		ci->flags = CLUSTER_FLAG_NONE;
		ret = ci;
		break;
	}
out:
	spin_unlock(&si->lock);

	return ret;
}
535
/*
 * Issue the block discards queued on si->discard_clusters, then return
 * each discarded cluster to the free list.  Returns true if at least
 * one cluster was discarded.
 */
static bool swap_do_scheduled_discard(struct swap_info_struct *si)
{
	struct swap_cluster_info *ci;
	bool ret = false;
	unsigned int idx;

	spin_lock(&si->lock);
	while (!list_empty(&si->discard_clusters)) {
		ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
		/*
		 * Delete the cluster from list but keep its DISCARD flag
		 * set: allocators must not touch it while the discard is
		 * in flight (the actual I/O is done without any lock).
		 */
		list_del(&ci->list);
		idx = cluster_index(si, ci);
		spin_unlock(&si->lock);
		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
				SWAPFILE_CLUSTER);

		spin_lock(&ci->lock);
		/*
		 * Discard is done; clear the flag (it is off-list now)
		 * and hand the cluster back to the free list.
		 */
		ci->flags = CLUSTER_FLAG_NONE;
		__free_cluster(si, ci);
		spin_unlock(&ci->lock);
		ret = true;
		spin_lock(&si->lock);
	}
	spin_unlock(&si->lock);
	return ret;
}
576
577static void swap_discard_work(struct work_struct *work)
578{
579 struct swap_info_struct *si;
580
581 si = container_of(work, struct swap_info_struct, discard_work);
582
583 swap_do_scheduled_discard(si);
584}
585
586static void swap_users_ref_free(struct percpu_ref *ref)
587{
588 struct swap_info_struct *si;
589
590 si = container_of(ref, struct swap_info_struct, users);
591 complete(&si->comp);
592}
593
/*
 * Free a cluster whose count has dropped to zero.  If the device does
 * per-cluster discard, queue the discard first and free the cluster
 * once it completes; otherwise free it immediately.
 */
static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
	VM_BUG_ON(ci->count != 0);
	VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
	lockdep_assert_held(&ci->lock);

	/*
	 * If the swap is discardable, prepare discard the cluster
	 * instead of free it immediately. The cluster will be freed
	 * after discard.
	 */
	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
		swap_cluster_schedule_discard(si, ci);
		return;
	}

	__free_cluster(si, ci);
}
617
/*
 * The cluster is partially freed (neither empty nor full): make sure
 * it sits on the nonfull list for its order.
 */
static void partial_free_cluster(struct swap_info_struct *si,
				 struct swap_cluster_info *ci)
{
	VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER);
	lockdep_assert_held(&ci->lock);

	if (ci->flags != CLUSTER_FLAG_NONFULL)
		move_cluster(si, ci, &si->nonfull_clusters[ci->order],
			     CLUSTER_FLAG_NONFULL);
}
632
/*
 * Put a locked cluster on the list matching its current usage:
 * free, frag (partially used), or full.  Clusters queued for discard
 * are left alone — swap_do_scheduled_discard() handles them.
 */
static void relocate_cluster(struct swap_info_struct *si,
			     struct swap_cluster_info *ci)
{
	lockdep_assert_held(&ci->lock);

	/* Discard-pending clusters must stay on the discard list. */
	if (cluster_is_discard(ci))
		return;

	if (!ci->count) {
		if (ci->flags != CLUSTER_FLAG_FREE)
			free_cluster(si, ci);
	} else if (ci->count != SWAPFILE_CLUSTER) {
		if (ci->flags != CLUSTER_FLAG_FRAG)
			move_cluster(si, ci, &si->frag_clusters[ci->order],
				     CLUSTER_FLAG_FRAG);
	} else {
		if (ci->flags != CLUSTER_FLAG_FULL)
			move_cluster(si, ci, &si->full_clusters,
				     CLUSTER_FLAG_FULL);
	}
}
660
661
662
663
664
665
666static void inc_cluster_info_page(struct swap_info_struct *si,
667 struct swap_cluster_info *cluster_info, unsigned long page_nr)
668{
669 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
670 struct swap_cluster_info *ci;
671
672 ci = cluster_info + idx;
673 ci->count++;
674
675 VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);
676 VM_BUG_ON(ci->flags);
677}
678
/*
 * Try to empty [start, end) by reclaiming any swap-cache-only slots.
 * Drops and retakes ci->lock around the reclaim calls.  Returns true
 * only if the whole range is observed free after relocking.
 */
static bool cluster_reclaim_range(struct swap_info_struct *si,
				  struct swap_cluster_info *ci,
				  unsigned long start, unsigned long end)
{
	unsigned char *map = si->swap_map;
	unsigned long offset = start;
	int nr_reclaim;

	spin_unlock(&ci->lock);
	do {
		switch (READ_ONCE(map[offset])) {
		case 0:
			offset++;
			break;
		case SWAP_HAS_CACHE:
			nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
			if (nr_reclaim > 0)
				offset += nr_reclaim;
			else
				goto out;
			break;
		default:
			goto out;
		}
	} while (offset < end);
out:
	spin_lock(&ci->lock);
	/*
	 * Recheck the whole range under the lock: slots may have been
	 * allocated or freed while the lock was dropped above.
	 */
	for (offset = start; offset < end; offset++)
		if (READ_ONCE(map[offset]))
			return false;

	return true;
}
716
717static bool cluster_scan_range(struct swap_info_struct *si,
718 struct swap_cluster_info *ci,
719 unsigned long start, unsigned int nr_pages,
720 bool *need_reclaim)
721{
722 unsigned long offset, end = start + nr_pages;
723 unsigned char *map = si->swap_map;
724
725 if (cluster_is_empty(ci))
726 return true;
727
728 for (offset = start; offset < end; offset++) {
729 switch (READ_ONCE(map[offset])) {
730 case 0:
731 continue;
732 case SWAP_HAS_CACHE:
733 if (!vm_swap_full())
734 return false;
735 *need_reclaim = true;
736 continue;
737 default:
738 return false;
739 }
740 }
741
742 return true;
743}
744
/*
 * Commit an order-@order allocation at @start inside the locked
 * cluster: stamp the swap_map with @usage, charge the device usage
 * counters, and bump the cluster count.  Fails only if the device is
 * no longer writable (swapoff in progress).
 */
static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
				unsigned int start, unsigned char usage,
				unsigned int order)
{
	unsigned int nr_pages = 1 << order;

	lockdep_assert_held(&ci->lock);

	if (!(si->flags & SWP_WRITEOK))
		return false;

	/*
	 * The first allocation in a cluster makes the cluster exclusive
	 * to this order until it becomes empty again.
	 */
	if (cluster_is_empty(ci))
		ci->order = order;

	memset(si->swap_map + start, usage, nr_pages);
	swap_range_alloc(si, nr_pages);
	ci->count += nr_pages;

	return true;
}
769
/*
 * Scan a locked, isolated cluster for an aligned run of 1 << @order
 * free slots starting at or after @offset, reclaiming cached-only
 * slots when needed.  On return the cluster is relocated to the proper
 * list and unlocked, and the next scan position is cached (per-CPU for
 * SSDs, per-device otherwise).  Returns the allocated offset, or
 * SWAP_ENTRY_INVALID on failure.
 */
static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
					    struct swap_cluster_info *ci,
					    unsigned long offset,
					    unsigned int order,
					    unsigned char usage)
{
	unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
	unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER);
	unsigned long end = min(start + SWAPFILE_CLUSTER, si->max);
	unsigned int nr_pages = 1 << order;
	bool need_reclaim, ret;

	lockdep_assert_held(&ci->lock);

	/* Bail if the cluster cannot possibly hold nr_pages more slots. */
	if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER)
		goto out;

	for (end -= nr_pages; offset <= end; offset += nr_pages) {
		need_reclaim = false;
		if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim))
			continue;
		if (need_reclaim) {
			ret = cluster_reclaim_range(si, ci, offset, offset + nr_pages);
			/*
			 * Reclaim dropped ci->lock, so the cluster state
			 * may have changed: re-check usability, and if the
			 * cluster emptied meanwhile restart from its base.
			 */
			if (!cluster_is_usable(ci, order))
				goto out;
			if (cluster_is_empty(ci))
				offset = start;
			/* Reclaim failed but the cluster is still usable, try next */
			if (!ret)
				continue;
		}
		if (!cluster_alloc_range(si, ci, offset, usage, order))
			break;
		found = offset;
		offset += nr_pages;
		if (ci->count < SWAPFILE_CLUSTER && offset <= end)
			next = offset;
		break;
	}
out:
	relocate_cluster(si, ci);
	unlock_cluster(ci);
	if (si->flags & SWP_SOLIDSTATE) {
		this_cpu_write(percpu_swap_cluster.offset[order], next);
		this_cpu_write(percpu_swap_cluster.si[order], si);
	} else {
		si->global_cluster->next[order] = next;
	}
	return found;
}
827
/*
 * Walk the full-clusters list and reclaim cached-only slots so the
 * clusters can be reused.  With @force, scan proportionally to current
 * usage; otherwise scan just one cluster.
 */
static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
{
	long to_scan = 1;
	unsigned long offset, end;
	struct swap_cluster_info *ci;
	unsigned char *map = si->swap_map;
	int nr_reclaim;

	if (force)
		to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;

	while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
		offset = cluster_offset(si, ci);
		end = min(si->max, offset + SWAPFILE_CLUSTER);
		to_scan--;

		while (offset < end) {
			if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) {
				/* Reclaim sleeps/relocks, so drop ci->lock. */
				spin_unlock(&ci->lock);
				nr_reclaim = __try_to_reclaim_swap(si, offset,
								   TTRS_ANYWAY);
				spin_lock(&ci->lock);
				if (nr_reclaim) {
					/* Negative means "skip this folio". */
					offset += abs(nr_reclaim);
					continue;
				}
			}
			offset++;
		}

		/* In case no swap cache was reclaimed, put it back on a list. */
		if (ci->flags == CLUSTER_FLAG_NONE)
			relocate_cluster(si, ci);

		unlock_cluster(ci);
		if (to_scan <= 0)
			break;
	}
}
867
868static void swap_reclaim_work(struct work_struct *work)
869{
870 struct swap_info_struct *si;
871
872 si = container_of(work, struct swap_info_struct, reclaim_work);
873
874 swap_reclaim_full_clusters(si, true);
875}
876
/*
 * Allocate 1 << @order contiguous swap slots on @si, trying allocation
 * sources in order of preference: the device's cached scan position
 * (HDD only), free clusters, nonfull then frag clusters of the same
 * order, pending discards, and finally (order 0 only) stealing from
 * higher-order clusters.  Returns the allocated offset or 0 on failure.
 */
static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
					      unsigned char usage)
{
	struct swap_cluster_info *ci;
	unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;

	/*
	 * Swapfile is not a block device: unable to allocate large
	 * order entries.
	 */
	if (order && !(si->flags & SWP_BLKDEV))
		return 0;

	if (!(si->flags & SWP_SOLIDSTATE)) {
		/* Serialize HDD SWAP allocation for each device. */
		spin_lock(&si->global_cluster_lock);
		offset = si->global_cluster->next[order];
		if (offset == SWAP_ENTRY_INVALID)
			goto new_cluster;

		ci = lock_cluster(si, offset);
		/* Cluster could have been used by another order. */
		if (cluster_is_usable(ci, order)) {
			if (cluster_is_empty(ci))
				offset = cluster_offset(si, ci);
			found = alloc_swap_scan_cluster(si, ci, offset,
							order, usage);
		} else {
			unlock_cluster(ci);
		}
		if (found)
			goto done;
	}

new_cluster:
	ci = isolate_lock_cluster(si, &si->free_clusters);
	if (ci) {
		found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
						order, usage);
		if (found)
			goto done;
	}

	/* Try reclaiming full clusters if free clusters are drained. */
	if (vm_swap_full())
		swap_reclaim_full_clusters(si, false);

	if (order < PMD_ORDER) {
		unsigned int frags = 0, frags_existing;

		while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) {
			found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
							order, usage);
			if (found)
				goto done;
			/* Clusters failed to allocate are moved to frag_clusters */
			frags++;
		}

		frags_existing = atomic_long_read(&si->frag_cluster_nr[order]);
		while (frags < frags_existing &&
		       (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) {
			atomic_long_dec(&si->frag_cluster_nr[order]);
			/*
			 * Rotate the frag list to iterate; they were all
			 * failing high-order allocation or moved here due
			 * to per-CPU usage, but they could contain newly
			 * released usable space.
			 */
			found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
							order, usage);
			if (found)
				goto done;
			frags++;
		}
	}

	/*
	 * We don't have free cluster but have some clusters in
	 * discarding; do discard now and reclaim them, then
	 * reread cluster_next_cpu since we dropped si->lock
	 */
	if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si))
		goto new_cluster;

	if (order)
		goto done;

	/* Order 0 stealing from higher order clusters. */
	for (int o = 1; o < SWAP_NR_ORDERS; o++) {
		/*
		 * Clusters here have at least one usable slot and can't
		 * fail order 0 allocation; the per-cluster scan below
		 * still re-checks everything under the cluster lock.
		 */
		while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) {
			atomic_long_dec(&si->frag_cluster_nr[o]);
			found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
							0, usage);
			if (found)
				goto done;
		}

		while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) {
			found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
							0, usage);
			if (found)
				goto done;
		}
	}
done:
	if (!(si->flags & SWP_SOLIDSTATE))
		spin_unlock(&si->global_cluster_lock);
	return found;
}
994
/* Take the device off the per-node available lists. */
static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
{
	int nid;
	unsigned long pages;

	spin_lock(&swap_avail_lock);

	if (swapoff) {
		/*
		 * Forcefully remove it. Clearing the SWP_WRITEOK flag
		 * prevents anyone from re-adding the device while we are
		 * removing it, and setting the off-list bit keeps
		 * swap_usage_sub() from putting it back.
		 */
		lockdep_assert_held(&si->lock);
		si->flags &= ~SWP_WRITEOK;
		atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
	} else {
		/*
		 * The device was just full: remove it only if the usage
		 * counter still equals si->pages, setting the off-list
		 * bit in the same cmpxchg.  If the counter has changed
		 * meanwhile (some pages were freed), skip the removal —
		 * the device can still serve allocations.
		 */
		pages = si->pages;
		if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
					     pages | SWAP_USAGE_OFFLIST_BIT))
			goto skip;
	}

	for_each_node(nid)
		plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);

skip:
	spin_unlock(&swap_avail_lock);
}
1033
/* Add the device back onto the per-node available lists. */
static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
{
	int nid;
	long val;
	unsigned long pages;

	spin_lock(&swap_avail_lock);

	/* Corresponding to SWP_WRITEOK clearing in del_from_avail_list() */
	if (swapon) {
		lockdep_assert_held(&si->lock);
		si->flags |= SWP_WRITEOK;
	} else {
		/* A device being swapoff'ed must never be re-added. */
		if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
			goto skip;
	}

	/* Already on the lists: nothing to do. */
	if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
		goto skip;

	val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);

	/*
	 * When the device is full (val == si->pages) clearing the bit was
	 * premature; try to set it again.  If that cmpxchg succeeds the
	 * device stays off-list; if it fails, the usage changed under us
	 * (some pages were freed) so adding it back is correct.
	 */
	pages = si->pages;
	if (val == pages) {
		if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
					    pages | SWAP_USAGE_OFFLIST_BIT))
			goto skip;
	}

	for_each_node(nid)
		plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);

skip:
	spin_unlock(&swap_avail_lock);
}
1076
/*
 * Charge @nr_entries against the device's usage counter.  Returns true
 * if this allocation made the device full (and hence removed it from
 * the available lists).
 */
static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
{
	long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);

	/*
	 * If the counter just hit si->pages exactly, the device is full:
	 * take it off the plists.  del_from_avail_list() re-verifies the
	 * counter with cmpxchg, so a racing free is handled there.
	 */
	if (unlikely(val == si->pages)) {
		del_from_avail_list(si, false);
		return true;
	}

	return false;
}
1097
/*
 * Uncharge @nr_entries from the device's usage counter, and lazily put
 * the device back on the available lists if it was taken off.
 */
static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
{
	long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);

	/*
	 * The off-list bit means the device was full (or swapoff'ed);
	 * now that space was freed, try to make it available again —
	 * add_to_avail_list() filters out the swapoff case itself.
	 */
	if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
		add_to_avail_list(si, false);
}
1109
1110static void swap_range_alloc(struct swap_info_struct *si,
1111 unsigned int nr_entries)
1112{
1113 if (swap_usage_add(si, nr_entries)) {
1114 if (vm_swap_full())
1115 schedule_work(&si->reclaim_work);
1116 }
1117 atomic_long_sub(nr_entries, &nr_swap_pages);
1118}
1119
/*
 * Release a range of swap slots back to the device: clear zeromap bits,
 * invalidate zswap and arch state, notify the block driver, drop swap
 * cache shadows, and finally uncharge the usage counters.
 */
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
			    unsigned int nr_entries)
{
	unsigned long begin = offset;
	unsigned long end = offset + nr_entries - 1;
	void (*swap_slot_free_notify)(struct block_device *, unsigned long);
	unsigned int i;

	/*
	 * Use atomic clear_bit operations only on zeromap instead of
	 * non-atomic bitmap_clear to prevent adjacent bits corruption
	 * due to simultaneous writes.
	 */
	for (i = 0; i < nr_entries; i++) {
		clear_bit(offset + i, si->zeromap);
		zswap_invalidate(swp_entry(si->type, offset + i));
	}

	if (si->flags & SWP_BLKDEV)
		swap_slot_free_notify =
			si->bdev->bd_disk->fops->swap_slot_free_notify;
	else
		swap_slot_free_notify = NULL;
	while (offset <= end) {
		arch_swap_invalidate_page(si->type, offset);
		if (swap_slot_free_notify)
			swap_slot_free_notify(si->bdev, offset);
		offset++;
	}
	clear_shadow_from_swap_cache(si->type, begin, end);

	/*
	 * Make sure that try_to_unuse() observes si->inuse_pages reaching 0
	 * only after the above cleanups are done.
	 */
	smp_wmb();
	atomic_long_add(nr_entries, &nr_swap_pages);
	swap_usage_sub(si, nr_entries);
}
1158
/*
 * Pin the swap device against concurrent swapoff by taking a percpu
 * reference.  Returns false if the device is already going away.
 */
static bool get_swap_device_info(struct swap_info_struct *si)
{
	if (!percpu_ref_tryget_live(&si->users))
		return false;
	/*
	 * Guarantee the si->users check happens before any later access
	 * to other swap_info_struct fields — paired with the barrier on
	 * the device-setup side (presumably the swapon path; that code
	 * is outside this view).
	 */
	smp_rmb();
	return true;
}
1174
/*
 * Fast path: try to allocate from the current CPU's cached cluster
 * position for this order.  Returns true and fills *entry on success.
 */
static bool swap_alloc_fast(swp_entry_t *entry,
			    int order)
{
	struct swap_cluster_info *ci;
	struct swap_info_struct *si;
	unsigned int offset, found = SWAP_ENTRY_INVALID;

	/*
	 * The cached si/offset pair may be stale; get_swap_device_info()
	 * pins the device, and the cluster checks below reject anything
	 * no longer usable.
	 */
	si = this_cpu_read(percpu_swap_cluster.si[order]);
	offset = this_cpu_read(percpu_swap_cluster.offset[order]);
	if (!si || !offset || !get_swap_device_info(si))
		return false;

	ci = lock_cluster(si, offset);
	if (cluster_is_usable(ci, order)) {
		if (cluster_is_empty(ci))
			offset = cluster_offset(si, ci);
		found = alloc_swap_scan_cluster(si, ci, offset, order, SWAP_HAS_CACHE);
		if (found)
			*entry = swp_entry(si->type, found);
	} else {
		unlock_cluster(ci);
	}

	put_swap_device(si);
	return !!found;
}
1209
/* Slow path: rotate through this node's available devices by priority. */
static bool swap_alloc_slow(swp_entry_t *entry,
			    int order)
{
	int node;
	unsigned long offset;
	struct swap_info_struct *si, *next;

	node = numa_node_id();
	spin_lock(&swap_avail_lock);
start_over:
	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
		/* Rotate the device to the tail of its priority tier. */
		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
		spin_unlock(&swap_avail_lock);
		if (get_swap_device_info(si)) {
			offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
			put_swap_device(si);
			if (offset) {
				*entry = swp_entry(si->type, offset);
				return true;
			}
			/* Large folio allocation does not retry on other devices. */
			if (order)
				return false;
		}

		spin_lock(&swap_avail_lock);
		/*
		 * The avail list may have changed while swap_avail_lock
		 * was dropped: the device may have filled up and been
		 * removed.  If @next was taken off the list, the safe
		 * iteration cursor is invalid, so start over; otherwise
		 * continue from @next.
		 */
		if (plist_node_empty(&next->avail_lists[node]))
			goto start_over;
	}
	spin_unlock(&swap_avail_lock);
	return false;
}
1254
/**
 * folio_alloc_swap - allocate swap space for a folio
 * @folio: folio we want to move to swap
 * @gfp: gfp mask for swap cache (shadow node) allocation
 *
 * Allocate swap space for the folio and add the folio to the
 * swap cache.
 *
 * Context: Caller needs to hold the folio lock.
 * Return: 0 on success, a negative errno on failure.
 */
int folio_alloc_swap(struct folio *folio, gfp_t gfp)
{
	unsigned int order = folio_order(folio);
	unsigned int size = 1 << order;
	swp_entry_t entry = {};

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);

	if (order) {
		/*
		 * Reject large allocation when THP_SWAP is disabled;
		 * the caller is expected to split the folio and retry.
		 */
		if (!IS_ENABLED(CONFIG_THP_SWAP))
			return -EAGAIN;

		/*
		 * Allocation size must never exceed the cluster size
		 * (HPAGE_PMD_NR pages).
		 */
		if (size > SWAPFILE_CLUSTER) {
			VM_WARN_ON_ONCE(1);
			return -EINVAL;
		}
	}

	local_lock(&percpu_swap_cluster.lock);
	if (!swap_alloc_fast(&entry, order))
		swap_alloc_slow(&entry, order);
	local_unlock(&percpu_swap_cluster.lock);

	/* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL */
	if (mem_cgroup_try_charge_swap(folio, entry))
		goto out_free;

	if (!entry.val)
		return -ENOMEM;

	/*
	 * XArray node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated to them.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL))
		goto out_free;

	return 0;

out_free:
	put_swap_folio(folio, entry);
	return -ENOMEM;
}
1322
/*
 * Look up the swap device for @entry with sanity checks on the device
 * flags, the offset, and the map count.  Logs and returns NULL on any
 * bad entry.  Does NOT pin the device against swapoff.
 */
static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *si;
	unsigned long offset;

	if (!entry.val)
		goto out;
	si = swp_swap_info(entry);
	if (!si)
		goto bad_nofile;
	if (data_race(!(si->flags & SWP_USED)))
		goto bad_device;
	offset = swp_offset(entry);
	if (offset >= si->max)
		goto bad_offset;
	if (data_race(!si->swap_map[swp_offset(entry)]))
		goto bad_free;
	return si;

bad_free:
	pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
	goto out;
bad_offset:
	pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
	goto out;
bad_device:
	pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val);
	goto out;
bad_nofile:
	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
out:
	return NULL;
}
1356
/*
 * Drop one reference of kind @usage from a single entry, with the
 * cluster lock held.  When the combined count reaches zero the entry is
 * freed.  Returns the remaining swap_map value.
 */
static unsigned char swap_entry_put_locked(struct swap_info_struct *si,
					   struct swap_cluster_info *ci,
					   swp_entry_t entry,
					   unsigned char usage)
{
	unsigned long offset = swp_offset(entry);
	unsigned char count;
	unsigned char has_cache;

	count = si->swap_map[offset];

	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;

	if (usage == SWAP_HAS_CACHE) {
		VM_BUG_ON(!has_cache);
		has_cache = 0;
	} else if (count == SWAP_MAP_SHMEM) {
		/*
		 * A shmem-owned entry drops straight to zero — shmem
		 * holds a single collective reference.
		 */
		count = 0;
	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
		if (count == COUNT_CONTINUED) {
			/* Borrow from the continuation pages, if any. */
			if (swap_count_continued(si, offset, count))
				count = SWAP_MAP_MAX | COUNT_CONTINUED;
			else
				count = SWAP_MAP_MAX;
		} else
			count--;
	}

	usage = count | has_cache;
	if (usage)
		WRITE_ONCE(si->swap_map[offset], usage);
	else
		swap_entries_free(si, ci, entry, 1);

	return usage;
}
1398
/*
 * Check whether swap entry is valid in the swap device.  If so,
 * return pointer to swap_info_struct, and keep the swap entry valid
 * via preventing the swap device from being swapoff, until
 * put_swap_device() is called.  Otherwise return NULL.
 *
 * Notice that swapoff or swapoff+swapon can still happen before the
 * percpu_ref_tryget_live() in get_swap_device() or after the
 * percpu_ref_put() in put_swap_device() if there isn't any other way
 * to prevent swapoff.  The caller must be prepared for that: for
 * example, re-verify the entry (e.g. a PTE) under an appropriate lock
 * after the folio has been read, to detect that the device was
 * swapoff'ed or swapoff+swapon'ed meanwhile.
 */
struct swap_info_struct *get_swap_device(swp_entry_t entry)
{
	struct swap_info_struct *si;
	unsigned long offset;

	if (!entry.val)
		goto out;
	si = swp_swap_info(entry);
	if (!si)
		goto bad_nofile;
	if (!get_swap_device_info(si))
		goto out;
	offset = swp_offset(entry);
	if (offset >= si->max)
		goto put_out;

	return si;
bad_nofile:
	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
out:
	return NULL;
put_out:
	pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
	/* Drop the reference taken by get_swap_device_info(). */
	percpu_ref_put(&si->users);
	return NULL;
}
1464
1465static void swap_entries_put_cache(struct swap_info_struct *si,
1466 swp_entry_t entry, int nr)
1467{
1468 unsigned long offset = swp_offset(entry);
1469 struct swap_cluster_info *ci;
1470
1471 ci = lock_cluster(si, offset);
1472 if (swap_only_has_cache(si, offset, nr))
1473 swap_entries_free(si, ci, entry, nr);
1474 else {
1475 for (int i = 0; i < nr; i++, entry.val++)
1476 swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
1477 }
1478 unlock_cluster(ci);
1479}
1480
/*
 * Drop one map reference on each of @nr consecutive entries starting at
 * @entry (all within one cluster).  Returns true if any entry is still
 * held by the swapcache afterwards.
 *
 * Fast path: when every entry holds exactly its last map reference, the
 * whole batch can be freed (or reduced to SWAP_HAS_CACHE) in one pass;
 * otherwise fall back to per-entry swap_entry_put_locked().
 */
static bool swap_entries_put_map(struct swap_info_struct *si,
				 swp_entry_t entry, int nr)
{
	unsigned long offset = swp_offset(entry);
	struct swap_cluster_info *ci;
	bool has_cache = false;
	unsigned char count;
	int i;

	if (nr <= 1)
		goto fallback;
	/* Lockless peek; rechecked under the cluster lock below. */
	count = swap_count(data_race(si->swap_map[offset]));
	if (count != 1 && count != SWAP_MAP_SHMEM)
		goto fallback;

	ci = lock_cluster(si, offset);
	if (!swap_is_last_map(si, offset, nr, &has_cache)) {
		goto locked_fallback;
	}
	if (!has_cache)
		swap_entries_free(si, ci, entry, nr);
	else
		/* Keep slots alive for the swapcache alone. */
		for (i = 0; i < nr; i++)
			WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
	unlock_cluster(ci);

	return has_cache;

fallback:
	ci = lock_cluster(si, offset);
locked_fallback:
	/* Slow path: drop one map reference per entry individually. */
	for (i = 0; i < nr; i++, entry.val++) {
		count = swap_entry_put_locked(si, ci, entry, 1);
		if (count == SWAP_HAS_CACHE)
			has_cache = true;
	}
	unlock_cluster(ci);
	return has_cache;

}
1521
1522
1523
1524
1525
1526
1527static bool swap_entries_put_map_nr(struct swap_info_struct *si,
1528 swp_entry_t entry, int nr)
1529{
1530 int cluster_nr, cluster_rest;
1531 unsigned long offset = swp_offset(entry);
1532 bool has_cache = false;
1533
1534 cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER;
1535 while (nr) {
1536 cluster_nr = min(nr, cluster_rest);
1537 has_cache |= swap_entries_put_map(si, entry, cluster_nr);
1538 cluster_rest = SWAPFILE_CLUSTER;
1539 nr -= cluster_nr;
1540 entry.val += cluster_nr;
1541 }
1542
1543 return has_cache;
1544}
1545
1546
1547
1548
1549
1550static inline bool __maybe_unused swap_is_last_ref(unsigned char count)
1551{
1552 return (count == SWAP_HAS_CACHE) || (count == 1) ||
1553 (count == SWAP_MAP_SHMEM);
1554}
1555
1556
1557
1558
1559
/*
 * Free @nr_pages consecutive swap entries starting at @entry; the
 * cluster @ci containing them must be locked.  Every entry must be down
 * to its last reference (enforced by the swap_is_last_ref() check).
 */
static void swap_entries_free(struct swap_info_struct *si,
			      struct swap_cluster_info *ci,
			      swp_entry_t entry, unsigned int nr_pages)
{
	unsigned long offset = swp_offset(entry);
	unsigned char *map = si->swap_map + offset;
	unsigned char *map_end = map + nr_pages;

	/* The whole range must lie inside the locked cluster. */
	VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1));
	VM_BUG_ON(cluster_is_empty(ci));
	VM_BUG_ON(ci->count < nr_pages);

	ci->count -= nr_pages;
	do {
		VM_BUG_ON(!swap_is_last_ref(*map));
		*map = 0;
	} while (++map < map_end);

	mem_cgroup_uncharge_swap(entry, nr_pages);
	swap_range_free(si, offset, nr_pages);

	/* Return the cluster to the free or partial list as appropriate. */
	if (!ci->count)
		free_cluster(si, ci);
	else
		partial_free_cluster(si, ci);
}
1587
1588
1589
1590
1591
1592void swap_free_nr(swp_entry_t entry, int nr_pages)
1593{
1594 int nr;
1595 struct swap_info_struct *sis;
1596 unsigned long offset = swp_offset(entry);
1597
1598 sis = _swap_info_get(entry);
1599 if (!sis)
1600 return;
1601
1602 while (nr_pages) {
1603 nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
1604 swap_entries_put_map(sis, swp_entry(sis->type, offset), nr);
1605 offset += nr;
1606 nr_pages -= nr;
1607 }
1608}
1609
1610
1611
1612
1613void put_swap_folio(struct folio *folio, swp_entry_t entry)
1614{
1615 struct swap_info_struct *si;
1616 int size = 1 << swap_entry_order(folio_order(folio));
1617
1618 si = _swap_info_get(entry);
1619 if (!si)
1620 return;
1621
1622 swap_entries_put_cache(si, entry, size);
1623}
1624
1625int __swap_count(swp_entry_t entry)
1626{
1627 struct swap_info_struct *si = swp_swap_info(entry);
1628 pgoff_t offset = swp_offset(entry);
1629
1630 return swap_count(si->swap_map[offset]);
1631}
1632
1633
1634
1635
1636
1637
1638bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
1639{
1640 pgoff_t offset = swp_offset(entry);
1641 struct swap_cluster_info *ci;
1642 int count;
1643
1644 ci = lock_cluster(si, offset);
1645 count = swap_count(si->swap_map[offset]);
1646 unlock_cluster(ci);
1647 return !!count;
1648}
1649
1650
1651
1652
1653
/*
 * Return the full reference count of @entry, including any count stored
 * in continuation pages when the swap_map byte has overflowed.
 * Returns 0 when the entry's device cannot be resolved.
 */
int swp_swapcount(swp_entry_t entry)
{
	int count, tmp_count, n;
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	struct page *page;
	pgoff_t offset;
	unsigned char *map;

	si = _swap_info_get(entry);
	if (!si)
		return 0;

	offset = swp_offset(entry);

	ci = lock_cluster(si, offset);

	count = swap_count(si->swap_map[offset]);
	if (!(count & COUNT_CONTINUED))
		goto out;

	/* The base byte holds SWAP_MAP_MAX worth; walk the continuations. */
	count &= ~COUNT_CONTINUED;
	n = SWAP_MAP_MAX + 1;

	/* Locate the swap_map page, then index within it. */
	page = vmalloc_to_page(si->swap_map + offset);
	offset &= ~PAGE_MASK;
	VM_BUG_ON(page_private(page) != SWP_CONTINUED);

	do {
		/* Each continuation page adds one more base-(SWAP_CONT_MAX+1) digit. */
		page = list_next_entry(page, lru);
		map = kmap_local_page(page);
		tmp_count = map[offset];
		kunmap_local(map);

		count += (tmp_count & ~COUNT_CONTINUED) * n;
		n *= (SWAP_CONT_MAX + 1);
	} while (tmp_count & COUNT_CONTINUED);
out:
	unlock_cluster(ci);
	return count;
}
1695
1696static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
1697 swp_entry_t entry, int order)
1698{
1699 struct swap_cluster_info *ci;
1700 unsigned char *map = si->swap_map;
1701 unsigned int nr_pages = 1 << order;
1702 unsigned long roffset = swp_offset(entry);
1703 unsigned long offset = round_down(roffset, nr_pages);
1704 int i;
1705 bool ret = false;
1706
1707 ci = lock_cluster(si, offset);
1708 if (nr_pages == 1) {
1709 if (swap_count(map[roffset]))
1710 ret = true;
1711 goto unlock_out;
1712 }
1713 for (i = 0; i < nr_pages; i++) {
1714 if (swap_count(map[offset + i])) {
1715 ret = true;
1716 break;
1717 }
1718 }
1719unlock_out:
1720 unlock_cluster(ci);
1721 return ret;
1722}
1723
1724static bool folio_swapped(struct folio *folio)
1725{
1726 swp_entry_t entry = folio->swap;
1727 struct swap_info_struct *si = _swap_info_get(entry);
1728
1729 if (!si)
1730 return false;
1731
1732 if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
1733 return swap_entry_swapped(si, entry);
1734
1735 return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
1736}
1737
/*
 * Can @folio's swapcache entry in principle be released?  The folio must
 * be locked, in the swapcache, and not under writeback; it must also not
 * be freed while storage is suspended (pm_suspended_storage()), which
 * presumably guards the hibernation image — NOTE(review): confirm the
 * exact hibernation rationale against upstream history.
 */
static bool folio_swapcache_freeable(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	if (!folio_test_swapcache(folio))
		return false;
	if (folio_test_writeback(folio))
		return false;

	/*
	 * Do not free the swapcache slot while block I/O is suspended:
	 * the on-disk copy must stay reachable until storage resumes.
	 */
	if (pm_suspended_storage())
		return false;

	return true;
}
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777bool folio_free_swap(struct folio *folio)
1778{
1779 if (!folio_swapcache_freeable(folio))
1780 return false;
1781 if (folio_swapped(folio))
1782 return false;
1783
1784 delete_from_swap_cache(folio);
1785 folio_set_dirty(folio);
1786 return true;
1787}
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
/*
 * Drop one map reference on @nr consecutive entries starting at @entry,
 * then opportunistically reclaim any slots whose only remaining owner
 * is the swapcache.
 */
void free_swap_and_cache_nr(swp_entry_t entry, int nr)
{
	const unsigned long start_offset = swp_offset(entry);
	const unsigned long end_offset = start_offset + nr;
	struct swap_info_struct *si;
	bool any_only_cache = false;
	unsigned long offset;

	si = get_swap_device(entry);
	if (!si)
		return;

	if (WARN_ON(end_offset > si->max))
		goto out;

	/*
	 * Drop the map references; remember whether any entry was left
	 * with only its swapcache reference.
	 */
	any_only_cache = swap_entries_put_map_nr(si, entry, nr);

	/* Nothing left cache-only: no reclaim pass needed. */
	if (!any_only_cache)
		goto out;

	/*
	 * Second pass: try to reclaim cache-only slots.  The reclaim
	 * helper reports how many entries it covered so the loop can
	 * stride past them.
	 */
	for (offset = start_offset; offset < end_offset; offset += nr) {
		nr = 1;
		if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
			/*
			 * __try_to_reclaim_swap() returns the number of
			 * entries handled (negative on failure); convert
			 * that into a forward stride aligned to the batch.
			 */
			nr = __try_to_reclaim_swap(si, offset,
						   TTRS_UNMAPPED | TTRS_FULL);
			if (nr == 0)
				nr = 1;
			else if (nr < 0)
				nr = -nr;
			nr = ALIGN(offset + 1, nr) - offset;
		}
	}

out:
	put_swap_device(si);
}
1853
1854#ifdef CONFIG_HIBERNATION
1855
1856swp_entry_t get_swap_page_of_type(int type)
1857{
1858 struct swap_info_struct *si = swap_type_to_swap_info(type);
1859 unsigned long offset;
1860 swp_entry_t entry = {0};
1861
1862 if (!si)
1863 goto fail;
1864
1865
1866 if (get_swap_device_info(si)) {
1867 if (si->flags & SWP_WRITEOK) {
1868 offset = cluster_alloc_swap_entry(si, 0, 1);
1869 if (offset) {
1870 entry = swp_entry(si->type, offset);
1871 atomic_long_dec(&nr_swap_pages);
1872 }
1873 }
1874 put_swap_device(si);
1875 }
1876fail:
1877 return entry;
1878}
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888int swap_type_of(dev_t device, sector_t offset)
1889{
1890 int type;
1891
1892 if (!device)
1893 return -1;
1894
1895 spin_lock(&swap_lock);
1896 for (type = 0; type < nr_swapfiles; type++) {
1897 struct swap_info_struct *sis = swap_info[type];
1898
1899 if (!(sis->flags & SWP_WRITEOK))
1900 continue;
1901
1902 if (device == sis->bdev->bd_dev) {
1903 struct swap_extent *se = first_se(sis);
1904
1905 if (se->start_block == offset) {
1906 spin_unlock(&swap_lock);
1907 return type;
1908 }
1909 }
1910 }
1911 spin_unlock(&swap_lock);
1912 return -ENODEV;
1913}
1914
1915int find_first_swap(dev_t *device)
1916{
1917 int type;
1918
1919 spin_lock(&swap_lock);
1920 for (type = 0; type < nr_swapfiles; type++) {
1921 struct swap_info_struct *sis = swap_info[type];
1922
1923 if (!(sis->flags & SWP_WRITEOK))
1924 continue;
1925 *device = sis->bdev->bd_dev;
1926 spin_unlock(&swap_lock);
1927 return type;
1928 }
1929 spin_unlock(&swap_lock);
1930 return -ENODEV;
1931}
1932
1933
1934
1935
1936
1937sector_t swapdev_block(int type, pgoff_t offset)
1938{
1939 struct swap_info_struct *si = swap_type_to_swap_info(type);
1940 struct swap_extent *se;
1941
1942 if (!si || !(si->flags & SWP_WRITEOK))
1943 return 0;
1944 se = offset_to_swap_extent(si, offset);
1945 return se->start_block + (offset - se->start_page);
1946}
1947
1948
1949
1950
1951
1952
1953
/*
 * Return the total (or, when @free is set, the free) number of pages on
 * the swap device of the given @type, or 0 if the device is not
 * writable or the type is out of range.
 */
unsigned int count_swap_pages(int type, int free)
{
	unsigned int n = 0;

	spin_lock(&swap_lock);
	/* Unsigned compare also rejects negative @type values. */
	if ((unsigned int)type < nr_swapfiles) {
		struct swap_info_struct *sis = swap_info[type];

		/* Per-device lock nests inside swap_lock. */
		spin_lock(&sis->lock);
		if (sis->flags & SWP_WRITEOK) {
			n = sis->pages;
			if (free)
				n -= swap_usage_in_pages(sis);
		}
		spin_unlock(&sis->lock);
	}
	spin_unlock(&swap_lock);
	return n;
}
1973#endif
1974
1975static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
1976{
1977 return pte_same(pte_swp_clear_flags(pte), swp_pte);
1978}
1979
1980
1981
1982
1983
1984
/*
 * Restore one swapped-out PTE: replace the swap entry at @addr with a
 * present mapping of @folio, handling KSM copies and hwpoisoned pages.
 *
 * Returns 1 on success, 0 when the PTE no longer matches @entry (or the
 * page is poisoned and was turned into a poison marker), or -ENOMEM if
 * a required KSM copy could not be allocated.
 */
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, swp_entry_t entry, struct folio *folio)
{
	struct page *page;
	struct folio *swapcache;
	spinlock_t *ptl;
	pte_t *pte, new_pte, old_pte;
	bool hwpoisoned = false;
	int ret = 1;

	/* KSM may require a private copy of the folio for this VMA. */
	swapcache = folio;
	folio = ksm_might_need_to_copy(folio, vma, addr);
	if (unlikely(!folio))
		return -ENOMEM;
	else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
		hwpoisoned = true;
		folio = swapcache;
	}

	page = folio_file_page(folio, swp_offset(entry));
	if (PageHWPoison(page))
		hwpoisoned = true;

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	/* Someone else may have faulted this PTE in meanwhile: bail out. */
	if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
						swp_entry_to_pte(entry)))) {
		ret = 0;
		goto out;
	}

	old_pte = ptep_get(pte);

	if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) {
		swp_entry_t swp_entry;

		/*
		 * Unusable page: install a poison marker instead of a
		 * present mapping so a later access faults cleanly.
		 */
		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
		if (hwpoisoned) {
			swp_entry = make_hwpoison_entry(page);
		} else {
			swp_entry = make_poisoned_swp_entry();
		}
		new_pte = swp_entry_to_pte(swp_entry);
		ret = 0;
		goto setpte;
	}

	/*
	 * Give the architecture a chance to restore metadata (e.g. tags)
	 * before the page becomes mapped.
	 */
	arch_swap_restore(folio_swap(entry, folio), folio);

	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
	folio_get(folio);
	if (folio == swapcache) {
		rmap_t rmap_flags = RMAP_NONE;

		/*
		 * Mapping the original swapcache folio back in: carry the
		 * exclusive bit over from the swap PTE.
		 */
		VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
		if (pte_swp_exclusive(old_pte))
			rmap_flags |= RMAP_EXCLUSIVE;

		/*
		 * A folio not yet anon gets a fresh anon rmap; otherwise
		 * add a regular PTE-level anon mapping.
		 */
		if (!folio_test_anon(folio)) {
			VM_WARN_ON_ONCE(folio_test_large(folio));
			VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
			folio_add_new_anon_rmap(folio, vma, addr, rmap_flags);
		} else {
			folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags);
		}
	} else {
		/* Fresh KSM copy: it is exclusively ours. */
		folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
		folio_add_lru_vma(folio, vma);
	}
	new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
	/* Preserve soft-dirty and uffd-wp markers from the swap PTE. */
	if (pte_swp_soft_dirty(old_pte))
		new_pte = pte_mksoft_dirty(new_pte);
	if (pte_swp_uffd_wp(old_pte))
		new_pte = pte_mkuffd_wp(new_pte);
setpte:
	set_pte_at(vma->vm_mm, addr, pte, new_pte);
	swap_free(entry);
out:
	if (pte)
		pte_unmap_unlock(pte, ptl);
	if (folio != swapcache) {
		folio_unlock(folio);
		folio_put(folio);
	}
	return ret;
}
2085
/*
 * Walk the PTEs of one PMD range and swap back in every entry belonging
 * to swap device @type.  Returns 0 on success or a negative error.
 */
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned int type)
{
	pte_t *pte = NULL;
	struct swap_info_struct *si;

	si = swap_info[type];
	do {
		struct folio *folio;
		unsigned long offset;
		unsigned char swp_count;
		swp_entry_t entry;
		int ret;
		pte_t ptent;

		/* (Re)map the PTE page after any earlier unmap below. */
		if (!pte++) {
			pte = pte_offset_map(pmd, addr);
			if (!pte)
				break;
		}

		ptent = ptep_get_lockless(pte);

		if (!is_swap_pte(ptent))
			continue;

		entry = pte_to_swp_entry(ptent);
		if (swp_type(entry) != type)
			continue;

		/*
		 * Drop the PTE mapping before the (possibly sleeping)
		 * swap-in below; unuse_pte() re-takes it under lock.
		 */
		offset = swp_offset(entry);
		pte_unmap(pte);
		pte = NULL;

		folio = swap_cache_get_folio(entry, vma, addr);
		if (!folio) {
			struct vm_fault vmf = {
				.vma = vma,
				.address = addr,
				.real_address = addr,
				.pmd = pmd,
			};

			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
						&vmf);
		}
		if (!folio) {
			/*
			 * Allocation failed — but if the entry vanished or
			 * is a bad slot meanwhile, just move on.
			 */
			swp_count = READ_ONCE(si->swap_map[offset]);
			if (swp_count == 0 || swp_count == SWAP_MAP_BAD)
				continue;
			return -ENOMEM;
		}

		folio_lock(folio);
		folio_wait_writeback(folio);
		ret = unuse_pte(vma, pmd, addr, entry, folio);
		if (ret < 0) {
			folio_unlock(folio);
			folio_put(folio);
			return ret;
		}

		/* Try to release the swap slot now that it is mapped back. */
		folio_free_swap(folio);
		folio_unlock(folio);
		folio_put(folio);
	} while (addr += PAGE_SIZE, addr != end);

	if (pte)
		pte_unmap(pte);
	return 0;
}
2158
2159static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
2160 unsigned long addr, unsigned long end,
2161 unsigned int type)
2162{
2163 pmd_t *pmd;
2164 unsigned long next;
2165 int ret;
2166
2167 pmd = pmd_offset(pud, addr);
2168 do {
2169 cond_resched();
2170 next = pmd_addr_end(addr, end);
2171 ret = unuse_pte_range(vma, pmd, addr, next, type);
2172 if (ret)
2173 return ret;
2174 } while (pmd++, addr = next, addr != end);
2175 return 0;
2176}
2177
2178static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
2179 unsigned long addr, unsigned long end,
2180 unsigned int type)
2181{
2182 pud_t *pud;
2183 unsigned long next;
2184 int ret;
2185
2186 pud = pud_offset(p4d, addr);
2187 do {
2188 next = pud_addr_end(addr, end);
2189 if (pud_none_or_clear_bad(pud))
2190 continue;
2191 ret = unuse_pmd_range(vma, pud, addr, next, type);
2192 if (ret)
2193 return ret;
2194 } while (pud++, addr = next, addr != end);
2195 return 0;
2196}
2197
2198static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
2199 unsigned long addr, unsigned long end,
2200 unsigned int type)
2201{
2202 p4d_t *p4d;
2203 unsigned long next;
2204 int ret;
2205
2206 p4d = p4d_offset(pgd, addr);
2207 do {
2208 next = p4d_addr_end(addr, end);
2209 if (p4d_none_or_clear_bad(p4d))
2210 continue;
2211 ret = unuse_pud_range(vma, p4d, addr, next, type);
2212 if (ret)
2213 return ret;
2214 } while (p4d++, addr = next, addr != end);
2215 return 0;
2216}
2217
2218static int unuse_vma(struct vm_area_struct *vma, unsigned int type)
2219{
2220 pgd_t *pgd;
2221 unsigned long addr, end, next;
2222 int ret;
2223
2224 addr = vma->vm_start;
2225 end = vma->vm_end;
2226
2227 pgd = pgd_offset(vma->vm_mm, addr);
2228 do {
2229 next = pgd_addr_end(addr, end);
2230 if (pgd_none_or_clear_bad(pgd))
2231 continue;
2232 ret = unuse_p4d_range(vma, pgd, addr, next, type);
2233 if (ret)
2234 return ret;
2235 } while (pgd++, addr = next, addr != end);
2236 return 0;
2237}
2238
2239static int unuse_mm(struct mm_struct *mm, unsigned int type)
2240{
2241 struct vm_area_struct *vma;
2242 int ret = 0;
2243 VMA_ITERATOR(vmi, mm, 0);
2244
2245 mmap_read_lock(mm);
2246 for_each_vma(vmi, vma) {
2247 if (vma->anon_vma && !is_vm_hugetlb_page(vma)) {
2248 ret = unuse_vma(vma, type);
2249 if (ret)
2250 break;
2251 }
2252
2253 cond_resched();
2254 }
2255 mmap_read_unlock(mm);
2256 return ret;
2257}
2258
2259
2260
2261
2262
2263
/*
 * Scan si->swap_map for the next in-use entry after @prev, skipping bad
 * slots.  Returns 0 when the end of the map is reached (callers treat 0
 * as "done" — slot 0 is never a regular entry).
 */
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
					unsigned int prev)
{
	unsigned int i;
	unsigned char count;

	/*
	 * Lockless scan: READ_ONCE tolerates concurrent updates; callers
	 * revalidate the entry before acting on it.
	 */
	for (i = prev + 1; i < si->max; i++) {
		count = READ_ONCE(si->swap_map[i]);
		if (count && swap_count(count) != SWAP_MAP_BAD)
			break;
		/* Long maps: yield periodically. */
		if ((i % LATENCY_LIMIT) == 0)
			cond_resched();
	}

	if (i == si->max)
		i = 0;

	return i;
}
2289
/*
 * Evacuate every user of swap device @type: shmem, then all mm's on the
 * mmlist, then any folios left in the swap cache.  Retries until the
 * device is empty, a signal arrives (-EINTR), or a hard error occurs.
 * Called by swapoff with the device already removed from allocation.
 */
static int try_to_unuse(unsigned int type)
{
	struct mm_struct *prev_mm;
	struct mm_struct *mm;
	struct list_head *p;
	int retval = 0;
	struct swap_info_struct *si = swap_info[type];
	struct folio *folio;
	swp_entry_t entry;
	unsigned int i;

	if (!swap_usage_in_pages(si))
		goto success;

retry:
	/* Shmem keeps its own swap entries; handle those first. */
	retval = shmem_unuse(type);
	if (retval)
		return retval;

	/*
	 * Walk every mm on init_mm.mmlist.  A reference on the previous
	 * mm is held across the list step so the list stays stable while
	 * mmlist_lock is dropped.
	 */
	prev_mm = &init_mm;
	mmget(prev_mm);

	spin_lock(&mmlist_lock);
	p = &init_mm.mmlist;
	while (swap_usage_in_pages(si) &&
	       !signal_pending(current) &&
	       (p = p->next) != &init_mm.mmlist) {

		mm = list_entry(p, struct mm_struct, mmlist);
		if (!mmget_not_zero(mm))
			continue;
		spin_unlock(&mmlist_lock);
		mmput(prev_mm);
		prev_mm = mm;
		retval = unuse_mm(mm, type);
		if (retval) {
			mmput(prev_mm);
			return retval;
		}

		/*
		 * Reacquire mmlist_lock to continue the walk; the held
		 * reference keeps @p valid across the unlocked section.
		 */
		cond_resched();
		spin_lock(&mmlist_lock);
	}
	spin_unlock(&mmlist_lock);

	mmput(prev_mm);

	/*
	 * Sweep the swap cache for folios whose entries are no longer
	 * mapped anywhere, releasing their slots.
	 */
	i = 0;
	while (swap_usage_in_pages(si) &&
	       !signal_pending(current) &&
	       (i = find_next_to_unuse(si, i)) != 0) {

		entry = swp_entry(type, i);
		folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
		if (IS_ERR(folio))
			continue;

		/*
		 * Lock, wait out writeback, then try to free the swap
		 * slot behind this cache folio.
		 */
		folio_lock(folio);
		folio_wait_writeback(folio);
		folio_free_swap(folio);
		folio_unlock(folio);
		folio_put(folio);
	}

	/*
	 * Usage may still be non-zero if entries were faulted back in
	 * concurrently; start over unless a signal is pending.
	 */
	if (swap_usage_in_pages(si)) {
		if (!signal_pending(current))
			goto retry;
		return -EINTR;
	}

success:
	/*
	 * Make sure all updates above are visible before swapoff
	 * proceeds to tear the device down.
	 */
	smp_mb();
	return 0;
}
2391
2392
2393
2394
2395
2396
2397
/*
 * Empty init_mm.mmlist, but only when no swap device has any pages in
 * use.  The usage check is done without mmlist_lock — NOTE(review):
 * presumably safe because callers hold swap_lock; confirm upstream
 * rationale.
 */
static void drain_mmlist(void)
{
	struct list_head *p, *next;
	unsigned int type;

	/* Any in-use device means the list entries may still be needed. */
	for (type = 0; type < nr_swapfiles; type++)
		if (swap_usage_in_pages(swap_info[type]))
			return;
	spin_lock(&mmlist_lock);
	list_for_each_safe(p, next, &init_mm.mmlist)
		list_del_init(p);
	spin_unlock(&mmlist_lock);
}
2411
2412
2413
2414
2415static void destroy_swap_extents(struct swap_info_struct *sis)
2416{
2417 while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
2418 struct rb_node *rb = sis->swap_extent_root.rb_node;
2419 struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
2420
2421 rb_erase(rb, &sis->swap_extent_root);
2422 kfree(se);
2423 }
2424
2425 if (sis->flags & SWP_ACTIVATED) {
2426 struct file *swap_file = sis->swap_file;
2427 struct address_space *mapping = swap_file->f_mapping;
2428
2429 sis->flags &= ~SWP_ACTIVATED;
2430 if (mapping->a_ops->swap_deactivate)
2431 mapping->a_ops->swap_deactivate(swap_file);
2432 }
2433}
2434
2435
2436
2437
2438
2439
2440
/*
 * Append an extent mapping swap pages [@start_page, +@nr_pages) to disk
 * blocks starting at @start_block.  Extents are added in page order, so
 * the new one always goes at the rightmost position of the rbtree, or
 * is merged into the current last extent when physically contiguous.
 *
 * Returns 0 when merged, 1 when a new node was inserted, -ENOMEM on
 * allocation failure.
 */
int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
		unsigned long nr_pages, sector_t start_block)
{
	struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
	struct swap_extent *se;
	struct swap_extent *new_se;

	/*
	 * Walk to the rightmost node: extents arrive sorted by
	 * start_page, so that is always the insertion point.
	 */
	while (*link) {
		parent = *link;
		link = &parent->rb_right;
	}

	if (parent) {
		se = rb_entry(parent, struct swap_extent, rb_node);
		/* Page ranges must be contiguous between calls. */
		BUG_ON(se->start_page + se->nr_pages != start_page);
		if (se->start_block + se->nr_pages == start_block) {
			/* Physically contiguous too: just extend. */
			se->nr_pages += nr_pages;
			return 0;
		}
	}

	/* Not mergeable: allocate and insert a fresh extent node. */
	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
	if (new_se == NULL)
		return -ENOMEM;
	new_se->start_page = start_page;
	new_se->nr_pages = nr_pages;
	new_se->start_block = start_block;

	rb_link_node(&new_se->rb_node, parent, link);
	rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
	return 1;
}
2480EXPORT_SYMBOL_GPL(add_swap_extent);
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
/*
 * Build the extent map for @sis.  Block devices get a single identity
 * extent; filesystems either provide a swap_activate hook or fall back
 * to the generic bmap-based activation.  *@span receives the device
 * span in pages.
 */
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
	struct file *swap_file = sis->swap_file;
	struct address_space *mapping = swap_file->f_mapping;
	struct inode *inode = mapping->host;
	int ret;

	if (S_ISBLK(inode->i_mode)) {
		/* Whole block device: one flat extent covers everything. */
		ret = add_swap_extent(sis, 0, sis->max, 0);
		*span = sis->pages;
		return ret;
	}

	if (mapping->a_ops->swap_activate) {
		ret = mapping->a_ops->swap_activate(sis, swap_file, span);
		if (ret < 0)
			return ret;
		sis->flags |= SWP_ACTIVATED;
		/* Filesystem-backed I/O path needs the sio pool. */
		if ((sis->flags & SWP_FS_OPS) &&
		    sio_pool_init() != 0) {
			destroy_swap_extents(sis);
			return -ENOMEM;
		}
		return ret;
	}

	return generic_swapfile_activate(sis, swap_file, span);
}
2537
2538static int swap_node(struct swap_info_struct *si)
2539{
2540 struct block_device *bdev;
2541
2542 if (si->bdev)
2543 bdev = si->bdev;
2544 else
2545 bdev = si->swap_file->f_inode->i_sb->s_bdev;
2546
2547 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
2548}
2549
/*
 * Install priority, swap_map, cluster_info and zeromap on @si.  A
 * negative @prio requests auto-assignment below all existing devices.
 */
static void setup_swap_info(struct swap_info_struct *si, int prio,
			    unsigned char *swap_map,
			    struct swap_cluster_info *cluster_info,
			    unsigned long *zeromap)
{
	int i;

	if (prio >= 0)
		si->prio = prio;
	else
		si->prio = --least_priority;

	/*
	 * plist sorts ascending, so store the negated priority: higher
	 * swap priority means earlier list position.
	 */
	si->list.prio = -si->prio;
	for (i = 0; i < nr_node_ids; i++) {
		if (si->prio >= 0)
			si->avail_lists[i].prio = -si->prio;
		else {
			/*
			 * Auto-priority devices get preferential (prio 1)
			 * placement on their own NUMA node's list.
			 */
			if (swap_node(si) == i)
				si->avail_lists[i].prio = 1;
			else
				si->avail_lists[i].prio = -si->prio;
		}
	}
	si->swap_map = swap_map;
	si->cluster_info = cluster_info;
	si->zeromap = zeromap;
}
2580
/*
 * Publish @si as usable: add its pages to the global counters and link
 * it onto the active and per-node available lists.  Caller holds
 * swap_lock (asserted) and has fully initialized @si.
 */
static void _enable_swap_info(struct swap_info_struct *si)
{
	atomic_long_add(si->pages, &nr_swap_pages);
	total_swap_pages += si->pages;

	assert_spin_locked(&swap_lock);
	/*
	 * Once on these lists the device is visible to allocation, so
	 * this must be the last step of enabling.
	 */
	plist_add(&si->list, &swap_active_head);

	/* Mark it allocatable (swap_usage climbs from here). */
	add_to_avail_list(si, true);
}
2602
/*
 * Two-phase enable for a device being swapped on: first install its
 * tables under the locks, then resurrect the percpu users ref (so
 * get_swap_device() succeeds) before making it visible to allocation.
 */
static void enable_swap_info(struct swap_info_struct *si, int prio,
				unsigned char *swap_map,
				struct swap_cluster_info *cluster_info,
				unsigned long *zeromap)
{
	spin_lock(&swap_lock);
	spin_lock(&si->lock);
	setup_swap_info(si, prio, swap_map, cluster_info, zeromap);
	spin_unlock(&si->lock);
	spin_unlock(&swap_lock);
	/*
	 * Resurrect outside the spinlocks, then retake them to publish
	 * the device on the active/available lists.
	 */
	percpu_ref_resurrect(&si->users);
	spin_lock(&swap_lock);
	spin_lock(&si->lock);
	_enable_swap_info(si);
	spin_unlock(&si->lock);
	spin_unlock(&swap_lock);
}
2623
/*
 * Put a half-disabled device back on the lists after a failed swapoff,
 * reusing its existing priority and tables.
 */
static void reinsert_swap_info(struct swap_info_struct *si)
{
	spin_lock(&swap_lock);
	spin_lock(&si->lock);
	setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap);
	_enable_swap_info(si);
	spin_unlock(&si->lock);
	spin_unlock(&swap_lock);
}
2633
2634
2635
2636
2637
/*
 * Wait until no allocator still holds any cluster lock of @si: taking
 * and dropping each cluster lock in turn acts as a barrier against
 * in-flight allocations.  Only valid once SWP_WRITEOK is cleared, so no
 * new allocations can start (BUG_ON below).
 */
static void wait_for_allocation(struct swap_info_struct *si)
{
	unsigned long offset;
	unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER);
	struct swap_cluster_info *ci;

	BUG_ON(si->flags & SWP_WRITEOK);

	for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
		ci = lock_cluster(si, offset);
		unlock_cluster(ci);
	}
}
2651
2652
2653
2654
2655
/*
 * Remove @si from every CPU's cached percpu_swap_cluster slots so no
 * future allocation finds the dying device there.
 */
static void flush_percpu_swap_cluster(struct swap_info_struct *si)
{
	int cpu, i;
	struct swap_info_struct **pcp_si;

	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
		pcp_si = per_cpu_ptr(percpu_swap_cluster.si, cpu);
		/*
		 * cmpxchg only clears slots still pointing at @si, so a
		 * concurrent update to another device is left untouched.
		 */
		for (i = 0; i < SWAP_NR_ORDERS; i++)
			cmpxchg(&pcp_si[i], si, NULL);
	}
}
2672
2673
/*
 * sys_swapoff: disable the swap area named by @specialfile.
 *
 * Sequence: locate the active device by mapping, account its pages back
 * to the VM, unlink it from the priority lists, evacuate all users via
 * try_to_unuse(), quiesce references (percpu_ref kill + completion),
 * then tear down extents, maps and per-type state.  Ordering of the
 * teardown steps matters throughout.
 */
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
	struct swap_info_struct *p = NULL;
	unsigned char *swap_map;
	unsigned long *zeromap;
	struct swap_cluster_info *cluster_info;
	struct file *swap_file, *victim;
	struct address_space *mapping;
	struct inode *inode;
	struct filename *pathname;
	int err, found = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	BUG_ON(!current->mm);

	pathname = getname(specialfile);
	if (IS_ERR(pathname))
		return PTR_ERR(pathname);

	victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
	err = PTR_ERR(victim);
	if (IS_ERR(victim))
		goto out;

	/* Match the target by its file mapping among active devices. */
	mapping = victim->f_mapping;
	spin_lock(&swap_lock);
	plist_for_each_entry(p, &swap_active_head, list) {
		if (p->flags & SWP_WRITEOK) {
			if (p->swap_file->f_mapping == mapping) {
				found = 1;
				break;
			}
		}
	}
	if (!found) {
		err = -EINVAL;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	/* The evacuated pages must fit in the remaining memory+swap. */
	if (!security_vm_enough_memory_mm(current->mm, p->pages))
		vm_unacct_memory(p->pages);
	else {
		err = -ENOMEM;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	spin_lock(&p->lock);
	del_from_avail_list(p, true);
	if (p->prio < 0) {
		/*
		 * Auto-priority device: shift every lower device up one
		 * step to close the gap it leaves.
		 */
		struct swap_info_struct *si = p;
		int nid;

		plist_for_each_entry_continue(si, &swap_active_head, list) {
			si->prio++;
			si->list.prio--;
			for_each_node(nid) {
				if (si->avail_lists[nid].prio != 1)
					si->avail_lists[nid].prio--;
			}
		}
		least_priority++;
	}
	plist_del(&p->list, &swap_active_head);
	atomic_long_sub(p->pages, &nr_swap_pages);
	total_swap_pages -= p->pages;
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);

	/* Let in-flight allocations drain before evacuating. */
	wait_for_allocation(p);

	/* Mark us as the OOM origin while pulling everything back in. */
	set_current_oom_origin();
	err = try_to_unuse(p->type);
	clear_current_oom_origin();

	if (err) {
		/* Evacuation failed: re-insert the device and bail out. */
		reinsert_swap_info(p);
		goto out_dput;
	}

	/*
	 * Kill the percpu users reference and wait (via the completion)
	 * until every get_swap_device() holder has dropped out, so the
	 * structures below can be freed safely.
	 */
	percpu_ref_kill(&p->users);
	synchronize_rcu();
	wait_for_completion(&p->comp);

	flush_work(&p->discard_work);
	flush_work(&p->reclaim_work);
	flush_percpu_swap_cluster(p);

	destroy_swap_extents(p);
	if (p->flags & SWP_CONTINUED)
		free_swap_count_continuations(p);

	if (!p->bdev || !bdev_nonrot(p->bdev))
		atomic_dec(&nr_rotate_swap);

	mutex_lock(&swapon_mutex);
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	drain_mmlist();

	/* Detach the tables under the locks; free them after unlocking. */
	swap_file = p->swap_file;
	p->swap_file = NULL;
	p->max = 0;
	swap_map = p->swap_map;
	p->swap_map = NULL;
	zeromap = p->zeromap;
	p->zeromap = NULL;
	cluster_info = p->cluster_info;
	p->cluster_info = NULL;
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
	arch_swap_invalidate_area(p->type);
	zswap_swapoff(p->type);
	mutex_unlock(&swapon_mutex);
	kfree(p->global_cluster);
	p->global_cluster = NULL;
	vfree(swap_map);
	kvfree(zeromap);
	kvfree(cluster_info);

	swap_cgroup_swapoff(p->type);
	exit_swap_address_space(p->type);

	inode = mapping->host;

	/* The file may be reused as a regular file again. */
	inode_lock(inode);
	inode->i_flags &= ~S_SWAPFILE;
	inode_unlock(inode);
	filp_close(swap_file, NULL);

	/*
	 * Clearing flags last frees the slot for reuse by a subsequent
	 * swapon (alloc_swap_info() checks SWP_USED).
	 */
	spin_lock(&swap_lock);
	p->flags = 0;
	spin_unlock(&swap_lock);

	err = 0;
	/* Notify /proc/swaps pollers. */
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

out_dput:
	filp_close(victim, NULL);
out:
	putname(pathname);
	return err;
}
2833
2834#ifdef CONFIG_PROC_FS
/*
 * poll() handler for /proc/swaps: reports EPOLLERR|EPOLLPRI in addition
 * to readability whenever the swap configuration changed since the last
 * poll (tracked via proc_poll_event).
 */
static __poll_t swaps_poll(struct file *file, poll_table *wait)
{
	struct seq_file *seq = file->private_data;

	poll_wait(file, &proc_poll_wait, wait);

	if (seq->poll_event != atomic_read(&proc_poll_event)) {
		seq->poll_event = atomic_read(&proc_poll_event);
		return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
	}

	return EPOLLIN | EPOLLRDNORM;
}
2848
2849
2850static void *swap_start(struct seq_file *swap, loff_t *pos)
2851{
2852 struct swap_info_struct *si;
2853 int type;
2854 loff_t l = *pos;
2855
2856 mutex_lock(&swapon_mutex);
2857
2858 if (!l)
2859 return SEQ_START_TOKEN;
2860
2861 for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
2862 if (!(si->flags & SWP_USED) || !si->swap_map)
2863 continue;
2864 if (!--l)
2865 return si;
2866 }
2867
2868 return NULL;
2869}
2870
2871static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
2872{
2873 struct swap_info_struct *si = v;
2874 int type;
2875
2876 if (v == SEQ_START_TOKEN)
2877 type = 0;
2878 else
2879 type = si->type + 1;
2880
2881 ++(*pos);
2882 for (; (si = swap_type_to_swap_info(type)); type++) {
2883 if (!(si->flags & SWP_USED) || !si->swap_map)
2884 continue;
2885 return si;
2886 }
2887
2888 return NULL;
2889}
2890
/* seq_file stop: release the mutex taken in swap_start(). */
static void swap_stop(struct seq_file *swap, void *v)
{
	mutex_unlock(&swapon_mutex);
}
2895
/*
 * seq_file show: print the /proc/swaps header for the start token, or
 * one formatted line (path, type, size, used, priority) per device.
 */
static int swap_show(struct seq_file *swap, void *v)
{
	struct swap_info_struct *si = v;
	struct file *file;
	int len;
	unsigned long bytes, inuse;

	if (si == SEQ_START_TOKEN) {
		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
		return 0;
	}

	/* Sizes are reported in KiB. */
	bytes = K(si->pages);
	inuse = K(swap_usage_in_pages(si));

	file = si->swap_file;
	len = seq_file_path(swap, file, " \t\n\\");
	/* Extra tabs keep columns aligned for values below 10000000. */
	seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
			len < 40 ? 40 - len : 1, " ",
			S_ISBLK(file_inode(file)->i_mode) ?
				"partition" : "file\t",
			bytes, bytes < 10000000 ? "\t" : "",
			inuse, inuse < 10000000 ? "\t" : "",
			si->prio);
	return 0;
}
2922
/* seq_file iterator for /proc/swaps. */
static const struct seq_operations swaps_op = {
	.start =	swap_start,
	.next =		swap_next,
	.stop =		swap_stop,
	.show =		swap_show
};
2929
2930static int swaps_open(struct inode *inode, struct file *file)
2931{
2932 struct seq_file *seq;
2933 int ret;
2934
2935 ret = seq_open(file, &swaps_op);
2936 if (ret)
2937 return ret;
2938
2939 seq = file->private_data;
2940 seq->poll_event = atomic_read(&proc_poll_event);
2941 return 0;
2942}
2943
/* proc_fs operations for /proc/swaps (permanent entry, poll-capable). */
static const struct proc_ops swaps_proc_ops = {
	.proc_flags	= PROC_ENTRY_PERMANENT,
	.proc_open	= swaps_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release,
	.proc_poll	= swaps_poll,
};
2952
/* Create the /proc/swaps entry at boot. */
static int __init procswaps_init(void)
{
	proc_create("swaps", 0, NULL, &swaps_proc_ops);
	return 0;
}
__initcall(procswaps_init);
2959#endif
2960
2961#ifdef MAX_SWAPFILES_CHECK
2962static int __init max_swapfiles_check(void)
2963{
2964 MAX_SWAPFILES_CHECK();
2965 return 0;
2966}
2967late_initcall(max_swapfiles_check);
2968#endif
2969
/*
 * Allocate (or reuse) a swap_info_struct and claim a free slot in the
 * swap_info[] array under swap_lock.  Returns the entry with SWP_USED
 * set, or an ERR_PTR on failure.
 */
static struct swap_info_struct *alloc_swap_info(void)
{
	struct swap_info_struct *p;
	struct swap_info_struct *defer = NULL;
	unsigned int type;
	int i;

	p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
	if (!p)
		return ERR_PTR(-ENOMEM);

	/* Start with the ref dead; it is revived when the device goes live. */
	if (percpu_ref_init(&p->users, swap_users_ref_free,
			    PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
		kvfree(p);
		return ERR_PTR(-ENOMEM);
	}

	spin_lock(&swap_lock);
	/* Find the first free (not SWP_USED) slot, if any. */
	for (type = 0; type < nr_swapfiles; type++) {
		if (!(swap_info[type]->flags & SWP_USED))
			break;
	}
	if (type >= MAX_SWAPFILES) {
		spin_unlock(&swap_lock);
		percpu_ref_exit(&p->users);
		kvfree(p);
		return ERR_PTR(-EPERM);
	}
	if (type >= nr_swapfiles) {
		p->type = type;
		/*
		 * Publish the entry only after p->type is initialized:
		 * the release barrier orders the stores for lockless
		 * readers going through swap_type_to_swap_info()
		 * (presumably paired with an acquire there — confirm
		 * against that helper).
		 */
		smp_store_release(&swap_info[type], p);
		nr_swapfiles++;
	} else {
		/* Reuse an existing slot; free our fresh allocation later. */
		defer = p;
		p = swap_info[type];
		/*
		 * NOTE(review): the reused entry is deliberately not
		 * memset — only the fields below are re-initialized,
		 * which suggests concurrent readers (e.g. /proc/swaps)
		 * may still be looking at it.
		 */
	}
	p->swap_extent_root = RB_ROOT;
	plist_node_init(&p->list, 0);
	for_each_node(i)
		plist_node_init(&p->avail_lists[i], 0);
	p->flags = SWP_USED;
	spin_unlock(&swap_lock);
	if (defer) {
		percpu_ref_exit(&defer->users);
		kvfree(defer);
	}
	spin_lock_init(&p->lock);
	spin_lock_init(&p->cont_lock);
	atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT);
	init_completion(&p->comp);

	return p;
}
3031
/*
 * Record the backing block device for the swap file/partition and set
 * SWP_BLKDEV for raw block devices.  Returns 0 or -EINVAL.
 */
static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
{
	if (S_ISBLK(inode->i_mode)) {
		si->bdev = I_BDEV(inode);

		/*
		 * Zoned block devices have sequential-write constraints,
		 * so they are rejected as swap devices here.
		 */
		if (bdev_is_zoned(si->bdev))
			return -EINVAL;
		si->flags |= SWP_BLKDEV;
	} else if (S_ISREG(inode->i_mode)) {
		si->bdev = inode->i_sb->s_bdev;
	}

	return 0;
}
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
/*
 * Find out how many pages are allowed for a single swap device.  The
 * limit comes from the number of swap-offset bits representable in both
 * the swp_entry_t type and the architecture's swap pte: round-trip an
 * entry with offset ~0UL through the pte encoding, and whatever offset
 * survives (plus one) is the maximum.
 */
unsigned long generic_max_swapfile_size(void)
{
	return swp_offset(pte_to_swp_entry(
			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
}
3073
3074
3075__weak unsigned long arch_max_swapfile_size(void)
3076{
3077 return generic_max_swapfile_size();
3078}
3079
/*
 * Validate the on-disk swap header and compute the usable number of
 * pages.  Returns the page count (maxpages) on success, 0 on any error.
 */
static unsigned long read_swap_header(struct swap_info_struct *si,
					union swap_header *swap_header,
					struct inode *inode)
{
	int i;
	unsigned long maxpages;
	unsigned long swapfilepages;
	unsigned long last_page;

	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
		pr_err("Unable to find swap-space signature\n");
		return 0;
	}

	/*
	 * Endianness hack: if the header was written on a machine of the
	 * opposite byte order, byte-swap its fields in place.
	 */
	if (swab32(swap_header->info.version) == 1) {
		swab32s(&swap_header->info.version);
		swab32s(&swap_header->info.last_page);
		swab32s(&swap_header->info.nr_badpages);
		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
			return 0;
		for (i = 0; i < swap_header->info.nr_badpages; i++)
			swab32s(&swap_header->info.badpages[i]);
	}

	/* Only version-1 headers are supported. */
	if (swap_header->info.version != 1) {
		pr_warn("Unable to handle swap header version %d\n",
			swap_header->info.version);
		return 0;
	}

	maxpages = swapfile_maximum_size;
	last_page = swap_header->info.last_page;
	if (!last_page) {
		pr_warn("Empty swap-file\n");
		return 0;
	}
	if (last_page > maxpages) {
		pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
			K(maxpages), K(last_page));
	}
	if (maxpages > last_page) {
		maxpages = last_page + 1;
		/* si->max is an unsigned int: don't let it wrap to 0. */
		if ((unsigned int)maxpages == 0)
			maxpages = UINT_MAX;
	}

	if (!maxpages)
		return 0;
	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
	if (swapfilepages && maxpages > swapfilepages) {
		pr_warn("Swap area shorter than signature indicates\n");
		return 0;
	}
	/* Regular swap files may not contain bad-page entries. */
	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
		return 0;
	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
		return 0;

	return maxpages;
}
3142
3143static int setup_swap_map(struct swap_info_struct *si,
3144 union swap_header *swap_header,
3145 unsigned char *swap_map,
3146 unsigned long maxpages)
3147{
3148 unsigned long i;
3149
3150 swap_map[0] = SWAP_MAP_BAD;
3151 for (i = 0; i < swap_header->info.nr_badpages; i++) {
3152 unsigned int page_nr = swap_header->info.badpages[i];
3153 if (page_nr == 0 || page_nr > swap_header->info.last_page)
3154 return -EINVAL;
3155 if (page_nr < maxpages) {
3156 swap_map[page_nr] = SWAP_MAP_BAD;
3157 si->pages--;
3158 }
3159 }
3160
3161 if (!si->pages) {
3162 pr_warn("Empty swap-file\n");
3163 return -EINVAL;
3164 }
3165
3166 return 0;
3167}
3168
/*
 * Column counts used by setup_clusters() to interleave the initial
 * cluster lists — presumably so neighbouring list entries don't share a
 * cache line or a swap-cache address space (see the list-building loop
 * in setup_clusters()).
 */
#define SWAP_CLUSTER_INFO_COLS						\
	DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
#define SWAP_CLUSTER_SPACE_COLS						\
	DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
#define SWAP_CLUSTER_COLS						\
	max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
3175
/*
 * Allocate and initialize the per-cluster metadata for a new swap
 * device: locks, the rotational-device global cluster, initial usage
 * counts for unusable pages, and the free/nonfull cluster lists.
 * Returns the cluster_info array or an ERR_PTR.
 */
static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
						union swap_header *swap_header,
						unsigned long maxpages)
{
	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
	struct swap_cluster_info *cluster_info;
	unsigned long i, j, idx;
	int err = -ENOMEM;

	cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
	if (!cluster_info)
		goto err;

	for (i = 0; i < nr_clusters; i++)
		spin_lock_init(&cluster_info[i].lock);

	/* Rotational devices share one global allocation cursor per order. */
	if (!(si->flags & SWP_SOLIDSTATE)) {
		si->global_cluster = kmalloc(sizeof(*si->global_cluster),
				     GFP_KERNEL);
		if (!si->global_cluster)
			goto err_free;
		for (i = 0; i < SWAP_NR_ORDERS; i++)
			si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
		spin_lock_init(&si->global_cluster_lock);
	}

	/*
	 * Mark unusable pages as already in use: the header page (slot 0),
	 * every bad page within range, and the tail slots beyond maxpages
	 * up to the cluster boundary.  The clusters aren't on any list
	 * yet, so no list manipulation is involved here.
	 */
	inc_cluster_info_page(si, cluster_info, 0);
	for (i = 0; i < swap_header->info.nr_badpages; i++) {
		unsigned int page_nr = swap_header->info.badpages[i];

		if (page_nr >= maxpages)
			continue;
		inc_cluster_info_page(si, cluster_info, page_nr);
	}
	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
		inc_cluster_info_page(si, cluster_info, i);

	INIT_LIST_HEAD(&si->free_clusters);
	INIT_LIST_HEAD(&si->full_clusters);
	INIT_LIST_HEAD(&si->discard_clusters);

	for (i = 0; i < SWAP_NR_ORDERS; i++) {
		INIT_LIST_HEAD(&si->nonfull_clusters[i]);
		INIT_LIST_HEAD(&si->frag_clusters[i]);
		atomic_long_set(&si->frag_cluster_nr[i], 0);
	}

	/*
	 * Build the initial lists column-interleaved (stride of
	 * SWAP_CLUSTER_COLS) rather than sequentially, so consecutive
	 * list entries are spread across cache lines / address spaces.
	 */
	for (j = 0; j < SWAP_CLUSTER_COLS; j++) {
		for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
			struct swap_cluster_info *ci;
			idx = i * SWAP_CLUSTER_COLS + j;
			ci = cluster_info + idx;
			if (idx >= nr_clusters)
				continue;
			/* Clusters holding unusable pages start as nonfull. */
			if (ci->count) {
				ci->flags = CLUSTER_FLAG_NONFULL;
				list_add_tail(&ci->list, &si->nonfull_clusters[0]);
				continue;
			}
			ci->flags = CLUSTER_FLAG_FREE;
			list_add_tail(&ci->list, &si->free_clusters);
		}
	}

	return cluster_info;

err_free:
	kvfree(cluster_info);
err:
	return ERR_PTR(err);
}
3258
/*
 * sys_swapon: activate a swap file or block device.
 *
 * Opens @specialfile, validates its swap header, builds the swap_map,
 * zeromap and cluster metadata, configures discard policy, and finally
 * enables the device under swapon_mutex.  All failures unwind through
 * the labelled cleanup chain at the bottom.
 */
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
	struct swap_info_struct *si;
	struct filename *name;
	struct file *swap_file = NULL;
	struct address_space *mapping;
	struct dentry *dentry;
	int prio;
	int error;
	union swap_header *swap_header;
	int nr_extents;
	sector_t span;
	unsigned long maxpages;
	unsigned char *swap_map = NULL;
	unsigned long *zeromap = NULL;
	struct swap_cluster_info *cluster_info = NULL;
	struct folio *folio = NULL;
	struct inode *inode = NULL;
	bool inced_nr_rotate_swap = false;

	if (swap_flags & ~SWAP_FLAGS_VALID)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* swapfile_init() failed to allocate the per-node avail lists. */
	if (!swap_avail_heads)
		return -ENOMEM;

	si = alloc_swap_info();
	if (IS_ERR(si))
		return PTR_ERR(si);

	INIT_WORK(&si->discard_work, swap_discard_work);
	INIT_WORK(&si->reclaim_work, swap_reclaim_work);

	name = getname(specialfile);
	if (IS_ERR(name)) {
		error = PTR_ERR(name);
		name = NULL;
		goto bad_swap;
	}
	swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0);
	if (IS_ERR(swap_file)) {
		error = PTR_ERR(swap_file);
		swap_file = NULL;
		goto bad_swap;
	}

	si->swap_file = swap_file;
	mapping = swap_file->f_mapping;
	dentry = swap_file->f_path.dentry;
	inode = mapping->host;

	error = claim_swapfile(si, inode);
	if (unlikely(error))
		goto bad_swap;

	inode_lock(inode);
	if (d_unlinked(dentry) || cant_mount(dentry)) {
		error = -ENOENT;
		goto bad_swap_unlock_inode;
	}
	if (IS_SWAPFILE(inode)) {
		error = -EBUSY;
		goto bad_swap_unlock_inode;
	}

	/*
	 * Mappings with a minimum folio order above zero are not
	 * supported by the swap code; reject them.
	 */
	if (mapping_min_folio_order(mapping) > 0) {
		error = -EINVAL;
		goto bad_swap_unlock_inode;
	}

	/*
	 * Read the swap header (page 0 of the file).
	 */
	if (!mapping->a_ops->read_folio) {
		error = -EINVAL;
		goto bad_swap_unlock_inode;
	}
	folio = read_mapping_folio(mapping, 0, swap_file);
	if (IS_ERR(folio)) {
		error = PTR_ERR(folio);
		goto bad_swap_unlock_inode;
	}
	swap_header = kmap_local_folio(folio, 0);

	maxpages = read_swap_header(si, swap_header, inode);
	if (unlikely(!maxpages)) {
		error = -EINVAL;
		goto bad_swap_unlock_inode;
	}

	/* One page is reserved for the header itself. */
	si->max = maxpages;
	si->pages = maxpages - 1;
	nr_extents = setup_swap_extents(si, &span);
	if (nr_extents < 0) {
		error = nr_extents;
		goto bad_swap_unlock_inode;
	}
	/* setup_swap_extents() must not have changed the accounting. */
	if (si->pages != si->max - 1) {
		pr_err("swap:%u != (max:%u - 1)\n", si->pages, si->max);
		error = -EINVAL;
		goto bad_swap_unlock_inode;
	}

	maxpages = si->max;

	/* OK, set up the swap map and apply the bad block list */
	swap_map = vzalloc(maxpages);
	if (!swap_map) {
		error = -ENOMEM;
		goto bad_swap_unlock_inode;
	}

	error = swap_cgroup_swapon(si->type, maxpages);
	if (error)
		goto bad_swap_unlock_inode;

	error = setup_swap_map(si, swap_header, swap_map, maxpages);
	if (error)
		goto bad_swap_unlock_inode;

	/*
	 * Use kvmalloc_array for the zeromap bitmap: the allocation
	 * order could exceed what the page allocator can satisfy for a
	 * very large swap file.
	 */
	zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long),
				    GFP_KERNEL | __GFP_ZERO);
	if (!zeromap) {
		error = -ENOMEM;
		goto bad_swap_unlock_inode;
	}

	if (si->bdev && bdev_stable_writes(si->bdev))
		si->flags |= SWP_STABLE_WRITES;

	if (si->bdev && bdev_synchronous(si->bdev))
		si->flags |= SWP_SYNCHRONOUS_IO;

	if (si->bdev && bdev_nonrot(si->bdev)) {
		si->flags |= SWP_SOLIDSTATE;
	} else {
		atomic_inc(&nr_rotate_swap);
		inced_nr_rotate_swap = true;
	}

	cluster_info = setup_clusters(si, swap_header, maxpages);
	if (IS_ERR(cluster_info)) {
		error = PTR_ERR(cluster_info);
		cluster_info = NULL;
		goto bad_swap_unlock_inode;
	}

	if ((swap_flags & SWAP_FLAG_DISCARD) &&
	    si->bdev && bdev_max_discard_sectors(si->bdev)) {
		/*
		 * When discard is requested with no particular policy,
		 * enable both area and per-page discard for backward
		 * compatibility with older swapon(8) releases.
		 */
		si->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
			     SWP_PAGE_DISCARD);

		/*
		 * The DISCARD_ONCE / DISCARD_PAGES flags let the admin
		 * narrow that down to a single swapon-time discard, or
		 * to discards of released page clusters only; adjust
		 * si->flags accordingly.
		 */
		if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
			si->flags &= ~SWP_PAGE_DISCARD;
		else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
			si->flags &= ~SWP_AREA_DISCARD;

		/* issue a swapon-time discard if it's still required */
		if (si->flags & SWP_AREA_DISCARD) {
			int err = discard_swap(si);
			if (unlikely(err))
				pr_err("swapon: discard_swap(%p): %d\n",
					si, err);
		}
	}

	error = init_swap_address_space(si->type, maxpages);
	if (error)
		goto bad_swap_unlock_inode;

	error = zswap_swapon(si->type, maxpages);
	if (error)
		goto free_swap_address_space;

	/*
	 * Flush any pending IO and dirty mappings before we start using
	 * this swap device.
	 */
	inode->i_flags |= S_SWAPFILE;
	error = inode_drain_writes(inode);
	if (error) {
		inode->i_flags &= ~S_SWAPFILE;
		goto free_swap_zswap;
	}

	mutex_lock(&swapon_mutex);
	prio = -1;
	if (swap_flags & SWAP_FLAG_PREFER)
		prio = swap_flags & SWAP_FLAG_PRIO_MASK;
	enable_swap_info(si, prio, swap_map, cluster_info, zeromap);

	pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s\n",
		K(si->pages), name->name, si->prio, nr_extents,
		K((unsigned long long)span),
		(si->flags & SWP_SOLIDSTATE) ? "SS" : "",
		(si->flags & SWP_DISCARDABLE) ? "D" : "",
		(si->flags & SWP_AREA_DISCARD) ? "s" : "",
		(si->flags & SWP_PAGE_DISCARD) ? "c" : "");

	mutex_unlock(&swapon_mutex);
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

	error = 0;
	goto out;
free_swap_zswap:
	zswap_swapoff(si->type);
free_swap_address_space:
	exit_swap_address_space(si->type);
bad_swap_unlock_inode:
	inode_unlock(inode);
bad_swap:
	kfree(si->global_cluster);
	si->global_cluster = NULL;
	/* inode was unlocked above; don't unlock it again at "out". */
	inode = NULL;
	destroy_swap_extents(si);
	swap_cgroup_swapoff(si->type);
	spin_lock(&swap_lock);
	si->swap_file = NULL;
	si->flags = 0;
	spin_unlock(&swap_lock);
	vfree(swap_map);
	kvfree(zeromap);
	kvfree(cluster_info);
	if (inced_nr_rotate_swap)
		atomic_dec(&nr_rotate_swap);
	if (swap_file)
		filp_close(swap_file, NULL);
out:
	if (!IS_ERR_OR_NULL(folio))
		folio_release_kmap(folio, swap_header);
	if (name)
		putname(name);
	if (inode)
		inode_unlock(inode);
	return error;
}
3519
3520void si_swapinfo(struct sysinfo *val)
3521{
3522 unsigned int type;
3523 unsigned long nr_to_be_unused = 0;
3524
3525 spin_lock(&swap_lock);
3526 for (type = 0; type < nr_swapfiles; type++) {
3527 struct swap_info_struct *si = swap_info[type];
3528
3529 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
3530 nr_to_be_unused += swap_usage_in_pages(si);
3531 }
3532 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
3533 val->totalswap = total_swap_pages + nr_to_be_unused;
3534 spin_unlock(&swap_lock);
3535}
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
/*
 * Verify that nr swap entries are valid and increment their swap map
 * counts (or set SWAP_HAS_CACHE when @usage == SWAP_HAS_CACHE).
 *
 * Returns:
 * - 0 on success
 * - -EINVAL if the entry is invalid or the count is already at max
 * - -EEXIST if a swap-cache reference is requested but one exists
 * - -ENOENT if the entry is unused or SWAP_MAP_BAD
 * - -ENOMEM if a count continuation page is needed but missing
 *
 * Runs in two passes under the cluster lock: validate all nr entries
 * first, then commit, so a failure never leaves a partial update.
 */
static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
{
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	unsigned long offset;
	unsigned char count;
	unsigned char has_cache;
	int err, i;

	si = swp_swap_info(entry);
	if (WARN_ON_ONCE(!si)) {
		pr_err("%s%08lx\n", Bad_file, entry.val);
		return -EINVAL;
	}

	offset = swp_offset(entry);
	/* The whole range must lie within one cluster. */
	VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
	VM_WARN_ON(usage == 1 && nr > 1);
	ci = lock_cluster(si, offset);

	/* Pass 1: validate every entry before touching anything. */
	err = 0;
	for (i = 0; i < nr; i++) {
		count = si->swap_map[offset + i];

		/*
		 * A caller may pass an entry that was never checked for
		 * validity, so it could be SWAP_MAP_BAD; check here with
		 * the cluster lock held.
		 */
		if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
			err = -ENOENT;
			goto unlock_out;
		}

		has_cache = count & SWAP_HAS_CACHE;
		count &= ~SWAP_HAS_CACHE;

		if (!count && !has_cache) {
			err = -ENOENT;
		} else if (usage == SWAP_HAS_CACHE) {
			if (has_cache)
				err = -EEXIST;
		} else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) {
			err = -EINVAL;
		}

		if (err)
			goto unlock_out;
	}

	/* Pass 2: commit the count/cache-bit updates. */
	for (i = 0; i < nr; i++) {
		count = si->swap_map[offset + i];
		has_cache = count & SWAP_HAS_CACHE;
		count &= ~SWAP_HAS_CACHE;

		if (usage == SWAP_HAS_CACHE)
			has_cache = SWAP_HAS_CACHE;
		else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
			count += usage;
		else if (swap_count_continued(si, offset + i, count))
			count = COUNT_CONTINUED;
		else {
			/*
			 * No rollback needed: -ENOMEM can only happen
			 * for usage == 1, and then nr == 1 (asserted
			 * above), so nothing was committed yet.
			 */
			err = -ENOMEM;
			goto unlock_out;
		}

		WRITE_ONCE(si->swap_map[offset + i], count | has_cache);
	}

unlock_out:
	unlock_cluster(ci);
	return err;
}
3623
3624
3625
3626
3627
/*
 * Mark nr swap entries as belonging to shmem/tmpfs with the special
 * SWAP_MAP_SHMEM count.
 */
void swap_shmem_alloc(swp_entry_t entry, int nr)
{
	__swap_duplicate(entry, SWAP_MAP_SHMEM, nr);
}
3632
3633
3634
3635
3636
3637
3638
3639
/*
 * Increase the reference count of a swap entry by 1.
 *
 * Returns 0 for success, or -ENOMEM if a count continuation was needed
 * but could not be allocated.  Other __swap_duplicate() failures
 * (-EINVAL/-ENOENT) are deliberately swallowed and reported as success.
 */
int swap_duplicate(swp_entry_t entry)
{
	int err = 0;

	/* Retry after attaching a continuation page on count overflow. */
	while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM)
		err = add_swap_count_continuation(entry, GFP_ATOMIC);
	return err;
}
3648
3649
3650
3651
3652
3653
3654
3655
3656
/*
 * Set SWAP_HAS_CACHE on nr consecutive swap entries starting at @entry.
 * Returns 0 on success; -EEXIST if a swap cache reference already
 * exists (see __swap_duplicate() for the other error codes).
 */
int swapcache_prepare(swp_entry_t entry, int nr)
{
	return __swap_duplicate(entry, SWAP_HAS_CACHE, nr);
}
3661
3662
3663
3664
3665
/* Drop the swap-cache reference on nr entries starting at @entry. */
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
{
	swap_entries_put_cache(si, entry, nr);
}
3670
/* Look up the swap device a swap entry belongs to (NULL if invalid). */
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
	return swap_type_to_swap_info(swp_type(entry));
}
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
/*
 * add_swap_count_continuation - called when a swap count overflows
 * SWAP_MAP_MAX.  Allocates a page and links it to the page backing the
 * entry's slot of the vmalloc'ed swap_map, to hold continuation counts
 * for that slot and its PAGE_SIZE neighbours.
 *
 * Returns 0 on success (or if a continuation is no longer needed),
 * -ENOMEM if the page allocation failed when one was required.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	struct page *head;
	struct page *page;
	struct page *list_page;
	pgoff_t offset;
	unsigned char count;
	int ret = 0;

	/*
	 * Allocate before taking any locks; the page may turn out to be
	 * unneeded, in which case it is freed at "outer".
	 */
	page = alloc_page(gfp_mask | __GFP_HIGHMEM);

	si = get_swap_device(entry);
	if (!si) {
		/*
		 * The device may have been swapped off between the
		 * failing __swap_duplicate() and now — an acceptable
		 * race; just drop the page.
		 */
		goto outer;
	}

	offset = swp_offset(entry);

	ci = lock_cluster(si, offset);

	count = swap_count(si->swap_map[offset]);

	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
		/*
		 * The count changed under us and no longer needs a
		 * continuation (tasks race to add continuations; avoid
		 * over-provisioning).
		 */
		goto out;
	}

	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	head = vmalloc_to_page(si->swap_map + offset);
	offset &= ~PAGE_MASK;

	spin_lock(&si->cont_lock);
	/*
	 * page_private(head) doubles as the "has continuation chain"
	 * marker: page allocation always resets the private field, so a
	 * zero here means no chain has been started yet.
	 */
	if (!page_private(head)) {
		BUG_ON(count & COUNT_CONTINUED);
		INIT_LIST_HEAD(&head->lru);
		set_page_private(head, SWP_CONTINUED);
		si->flags |= SWP_CONTINUED;
	}

	list_for_each_entry(list_page, &head->lru, lru) {
		unsigned char *map;

		/*
		 * If the previous map said no continuation, but we've
		 * found a continuation page, ours is not needed.
		 */
		if (!(count & COUNT_CONTINUED))
			goto out_unlock_cont;

		map = kmap_local_page(list_page) + offset;
		count = *map;
		kunmap_local(map);

		/*
		 * If this continuation count still has room, our new
		 * page is not needed either.
		 */
		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
			goto out_unlock_cont;
	}

	/* Every existing continuation is full: append our new page. */
	list_add_tail(&page->lru, &head->lru);
	page = NULL;			/* now attached; don't free below */
out_unlock_cont:
	spin_unlock(&si->cont_lock);
out:
	unlock_cluster(ci);
	put_swap_device(si);
outer:
	if (page)
		__free_page(page);
	return ret;
}
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
/*
 * swap_count_continued - when the swap_map count for @offset hits
 * SWAP_MAP_MAX (incrementing) or COUNT_CONTINUED (decrementing), carry
 * the operation into the chain of continuation pages.  Returns true if
 * the carry/borrow succeeded, false if there was no continuation room
 * (increment) or the count dropped back into swap_map range (decrement).
 *
 * Called with the cluster lock held, which stabilizes the count.
 */
static bool swap_count_continued(struct swap_info_struct *si,
				 pgoff_t offset, unsigned char count)
{
	struct page *head;
	struct page *page;
	unsigned char *map;
	bool ret;

	head = vmalloc_to_page(si->swap_map + offset);
	if (page_private(head) != SWP_CONTINUED) {
		BUG_ON(count & COUNT_CONTINUED);
		return false;	/* need to add count continuation */
	}

	spin_lock(&si->cont_lock);
	offset &= ~PAGE_MASK;
	page = list_next_entry(head, lru);
	map = kmap_local_page(page) + offset;

	if (count == SWAP_MAP_MAX)	/* initial increment to continuation */
		goto init_map;

	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) {
		/*
		 * Incrementing: think of how you add 1 to 999 — walk past
		 * every saturated digit, bump the first non-full one
		 * (or start a fresh page), then reset the passed digits.
		 */
		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
			kunmap_local(map);
			page = list_next_entry(page, lru);
			BUG_ON(page == head);
			map = kmap_local_page(page) + offset;
		}
		if (*map == SWAP_CONT_MAX) {
			kunmap_local(map);
			page = list_next_entry(page, lru);
			if (page == head) {
				ret = false;	/* add count continuation */
				goto out;
			}
			map = kmap_local_page(page) + offset;
init_map:		*map = 0;		/* we didn't zero the page */
		}
		*map += 1;
		kunmap_local(map);
		while ((page = list_prev_entry(page, lru)) != head) {
			map = kmap_local_page(page) + offset;
			*map = COUNT_CONTINUED;
			kunmap_local(map);
		}
		ret = true;			/* incremented */

	} else {				/* decrementing */
		/*
		 * Decrementing: think of how you subtract 1 from 1000 —
		 * walk past the zero-looking COUNT_CONTINUED digits,
		 * borrow from the first real one, then refill the
		 * passed digits with SWAP_CONT_MAX.
		 */
		BUG_ON(count != COUNT_CONTINUED);
		while (*map == COUNT_CONTINUED) {
			kunmap_local(map);
			page = list_next_entry(page, lru);
			BUG_ON(page == head);
			map = kmap_local_page(page) + offset;
		}
		BUG_ON(*map == 0);
		*map -= 1;
		if (*map == 0)
			count = 0;
		kunmap_local(map);
		while ((page = list_prev_entry(page, lru)) != head) {
			map = kmap_local_page(page) + offset;
			*map = SWAP_CONT_MAX | count;
			count = COUNT_CONTINUED;
			kunmap_local(map);
		}
		ret = count == COUNT_CONTINUED;
	}
out:
	spin_unlock(&si->cont_lock);
	return ret;
}
3875
3876
3877
3878
3879
/*
 * free_swap_count_continuations - at swapoff, free the continuation
 * pages chained off each swap_map page, if any.
 */
static void free_swap_count_continuations(struct swap_info_struct *si)
{
	pgoff_t offset;

	/* One swap_map page covers PAGE_SIZE entries; step by that. */
	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
		struct page *head;
		head = vmalloc_to_page(si->swap_map + offset);
		if (page_private(head)) {
			struct page *page, *next;

			list_for_each_entry_safe(page, next, &head->lru, lru) {
				list_del(&page->lru);
				__free_page(page);
			}
		}
	}
}
3897
3898#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
/* True if at least one swap device is currently active. */
static bool __has_usable_swap(void)
{
	return !plist_head_empty(&swap_active_head);
}
3903
/*
 * If the blk cgroup is congested, schedule writeback throttling against
 * the first swap device (with a block device) available on this folio's
 * node.  No-op for allocations that cannot do IO or when no swap is
 * active.
 */
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
	struct swap_info_struct *si, *next;
	int nid = folio_nid(folio);

	if (!(gfp & __GFP_IO))
		return;

	if (!__has_usable_swap())
		return;

	if (!blk_cgroup_congested())
		return;

	/*
	 * We've already scheduled a throttle; avoid taking the global
	 * swap lock again.
	 */
	if (current->throttle_disk)
		return;

	spin_lock(&swap_avail_lock);
	plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
				  avail_lists[nid]) {
		if (si->bdev) {
			blkcg_schedule_throttle(si->bdev->bd_disk, true);
			break;
		}
	}
	spin_unlock(&swap_avail_lock);
}
3935#endif
3936
/*
 * Boot-time setup: allocate the per-node available-swap plists and
 * compute the architecture's maximum swap device size.
 */
static int __init swapfile_init(void)
{
	int nid;

	swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
					 GFP_KERNEL);
	if (!swap_avail_heads) {
		/* swapon() checks swap_avail_heads and refuses when NULL. */
		pr_emerg("Not enough memory for swap heads, swap is disabled\n");
		return -ENOMEM;
	}

	for_each_node(nid)
		plist_head_init(&swap_avail_heads[nid]);

	swapfile_maximum_size = arch_max_swapfile_size();

#ifdef CONFIG_MIGRATION
	/* Migration A/D bits need spare offset bits in the swap entry. */
	if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
		swap_migration_ad_supported = true;
#endif

	return 0;
}
subsys_initcall(swapfile_init);
3961