/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
#include <linux/frontswap.h>
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>

static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);

DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
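/*
 * Some modules use swappable objects and may try to swap them out under
 * various scenarios.  nr_swap_pages is exported so such users can check
 * how much swap space remains before committing to an allocation.
 */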
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority = -1;

static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

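/*
 * all active swap_info_structs
 * protected with swap_lock, and ordered by priority.
 */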
PLIST_HEAD(swap_active_head);

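/*
 * all available (active, not full) swap_info_structs
 * protected with swap_avail_lock, ordered by priority.
 * This is used by get_swap_page() instead of swap_active_head
 * because swap_active_head includes all swap_info_structs,
 * but get_swap_page() doesn't need to look at full ones.
 * This uses its own lock instead of swap_lock because when a
 * swap_info_struct changes between not-full/full, it needs to
 * add/remove itself to/from this list, but the swap_info_struct->lock
 * is held and the locking order requires swap_lock to be taken
 * before any swap_info_struct->lock.
 */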
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);

struct swap_info_struct *swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);

atomic_t nr_rotate_swap = ATOMIC_INIT(0);

static inline unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
}

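/* returns 1 if swap entry is freed */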
static int
__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
{
	swp_entry_t entry = swp_entry(si->type, offset);
	struct page *page;
	int ret = 0;

	page = find_get_page(swap_address_space(entry), swp_offset(entry));
	if (!page)
		return 0;
	/*
	 * This function is called from scan_swap_map() and it's called
	 * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
	 * We have to use trylock for avoiding deadlock. This is a special
	 * case and you should use try_to_free_swap() with explicit lock_page()
	 * in usual operations.
	 */
	if (trylock_page(page)) {
		ret = try_to_free_swap(page);
		unlock_page(page);
	}
	put_page(page);
	return ret;
}

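/*
 * swapon tell device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */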
static int discard_swap(struct swap_info_struct *si)
{
	struct swap_extent *se;
	sector_t start_block;
	sector_t nr_blocks;
	int err = 0;

	/* Do not discard the swap header page! */
	se = &si->first_swap_extent;
	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
	if (nr_blocks) {
		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL, 0);
		if (err)
			return err;
		cond_resched();
	}

	list_for_each_entry(se, &si->first_swap_extent.list, list) {
		start_block = se->start_block << (PAGE_SHIFT - 9);
		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL, 0);
		if (err)
			break;

		cond_resched();
	}
	return err;		/* That will often be -EOPNOTSUPP */
}

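/*
 * swap allocation tell device that a cluster of swap can now be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */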
static void discard_swap_cluster(struct swap_info_struct *si,
				 pgoff_t start_page, pgoff_t nr_pages)
{
	struct swap_extent *se = si->curr_swap_extent;
	int found_extent = 0;

	while (nr_pages) {
		if (se->start_page <= start_page &&
		    start_page < se->start_page + se->nr_pages) {
			pgoff_t offset = start_page - se->start_page;
			sector_t start_block = se->start_block + offset;
			sector_t nr_blocks = se->nr_pages - offset;

			if (nr_blocks > nr_pages)
				nr_blocks = nr_pages;
			start_page += nr_blocks;
			nr_pages -= nr_blocks;

			if (!found_extent++)
				si->curr_swap_extent = se;

			start_block <<= PAGE_SHIFT - 9;
			nr_blocks <<= PAGE_SHIFT - 9;
			if (blkdev_issue_discard(si->bdev, start_block,
				    nr_blocks, GFP_NOIO, 0))
				break;
		}

		se = list_next_entry(se, list);
	}
}

#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER	HPAGE_PMD_NR

#define swap_entry_size(size)	(size)
#else
#define SWAPFILE_CLUSTER	256

/*
 * Define swap_entry_size() as constant to let compiler to optimize
 * out some code if !CONFIG_THP_SWAP
 */
#define swap_entry_size(size)	1
#endif
#define LATENCY_LIMIT		256

static inline void cluster_set_flag(struct swap_cluster_info *info,
				    unsigned int flag)
{
	info->flags = flag;
}

static inline unsigned int cluster_count(struct swap_cluster_info *info)
{
	return info->data;
}

static inline void cluster_set_count(struct swap_cluster_info *info,
				     unsigned int c)
{
	info->data = c;
}

static inline void cluster_set_count_flag(struct swap_cluster_info *info,
					  unsigned int c, unsigned int f)
{
	info->flags = f;
	info->data = c;
}

static inline unsigned int cluster_next(struct swap_cluster_info *info)
{
	return info->data;
}

static inline void cluster_set_next(struct swap_cluster_info *info,
				    unsigned int n)
{
	info->data = n;
}

static inline void cluster_set_next_flag(struct swap_cluster_info *info,
					 unsigned int n, unsigned int f)
{
	info->flags = f;
	info->data = n;
}

static inline bool cluster_is_free(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_FREE;
}

static inline bool cluster_is_null(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_NEXT_NULL;
}

static inline void cluster_set_null(struct swap_cluster_info *info)
{
	info->flags = CLUSTER_FLAG_NEXT_NULL;
	info->data = 0;
}

static inline bool cluster_is_huge(struct swap_cluster_info *info)
{
	if (IS_ENABLED(CONFIG_THP_SWAP))
		return info->flags & CLUSTER_FLAG_HUGE;
	return false;
}

static inline void cluster_clear_huge(struct swap_cluster_info *info)
{
	info->flags &= ~CLUSTER_FLAG_HUGE;
}

static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
						     unsigned long offset)
{
	struct swap_cluster_info *ci;

	ci = si->cluster_info;
	if (ci) {
		ci += offset / SWAPFILE_CLUSTER;
		spin_lock(&ci->lock);
	}
	return ci;
}

static inline void unlock_cluster(struct swap_cluster_info *ci)
{
	if (ci)
		spin_unlock(&ci->lock);
}

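/*
 * Determine the locking method in use for this device.  Return
 * swap_cluster_info if SSD-style cluster-based locking is in place.
 */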
static inline struct swap_cluster_info *lock_cluster_or_swap_info(
	struct swap_info_struct *si, unsigned long offset)
{
	struct swap_cluster_info *ci;

	/* Try to use fine-grained SSD-style locking if available: */
	ci = lock_cluster(si, offset);
	/* Otherwise, fall back to traditional, coarse locking: */
	if (!ci)
		spin_lock(&si->lock);

	return ci;
}

static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
					       struct swap_cluster_info *ci)
{
	if (ci)
		unlock_cluster(ci);
	else
		spin_unlock(&si->lock);
}

static inline bool cluster_list_empty(struct swap_cluster_list *list)
{
	return cluster_is_null(&list->head);
}

static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
{
	return cluster_next(&list->head);
}

static void cluster_list_init(struct swap_cluster_list *list)
{
	cluster_set_null(&list->head);
	cluster_set_null(&list->tail);
}

static void cluster_list_add_tail(struct swap_cluster_list *list,
				  struct swap_cluster_info *ci,
				  unsigned int idx)
{
	if (cluster_list_empty(list)) {
		cluster_set_next_flag(&list->head, idx, 0);
		cluster_set_next_flag(&list->tail, idx, 0);
	} else {
		struct swap_cluster_info *ci_tail;
		unsigned int tail = cluster_next(&list->tail);

		/*
		 * Nested cluster lock, but both cluster locks are
		 * only acquired when we held swap_info_struct->lock
		 */
		ci_tail = ci + tail;
		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
		cluster_set_next(ci_tail, idx);
		spin_unlock(&ci_tail->lock);
		cluster_set_next_flag(&list->tail, idx, 0);
	}
}

static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
					   struct swap_cluster_info *ci)
{
	unsigned int idx;

	idx = cluster_next(&list->head);
	if (cluster_next(&list->tail) == idx) {
		cluster_set_null(&list->head);
		cluster_set_null(&list->tail);
	} else
		cluster_set_next_flag(&list->head,
				      cluster_next(&ci[idx]), 0);

	return idx;
}

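/* Add a cluster to discard list and schedule it to do discard */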
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
		unsigned int idx)
{
	/*
	 * If scan_swap_map() can't find a free cluster, it will check
	 * si->swap_map directly. To make sure the discarding cluster isn't
	 * taken by scan_swap_map(), mark the swap entries bad (occupied). It
	 * will be cleared after discard
	 */
	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
			SWAP_MAP_BAD, SWAPFILE_CLUSTER);

	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);

	schedule_work(&si->discard_work);
}

static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info;

	cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
	cluster_list_add_tail(&si->free_clusters, ci, idx);
}

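/*
 * Doing discard actually. After a cluster discard is finished, the cluster
 * will be added to free cluster list. caller should hold si->lock.
 */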
static void swap_do_scheduled_discard(struct swap_info_struct *si)
{
	struct swap_cluster_info *info, *ci;
	unsigned int idx;

	info = si->cluster_info;

	while (!cluster_list_empty(&si->discard_clusters)) {
		idx = cluster_list_del_first(&si->discard_clusters, info);
		spin_unlock(&si->lock);

		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
				SWAPFILE_CLUSTER);

		spin_lock(&si->lock);
		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
		__free_cluster(si, idx);
		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
				0, SWAPFILE_CLUSTER);
		unlock_cluster(ci);
	}
}

static void swap_discard_work(struct work_struct *work)
{
	struct swap_info_struct *si;

	si = container_of(work, struct swap_info_struct, discard_work);

	spin_lock(&si->lock);
	swap_do_scheduled_discard(si);
	spin_unlock(&si->lock);
}

static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info;

	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
	cluster_list_del_first(&si->free_clusters, ci);
	cluster_set_count_flag(ci + idx, 0, 0);
}

static void free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info + idx;

	VM_BUG_ON(cluster_count(ci) != 0);
	/*
	 * If the swap is discardable, prepare discard the cluster
	 * instead of free it immediately. The cluster will be freed
	 * after discard.
	 */
	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
		swap_cluster_schedule_discard(si, idx);
		return;
	}

	__free_cluster(si, idx);
}

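/*
 * The cluster corresponding to page_nr will be used. The cluster will be
 * removed from free cluster list and its usage counter will be increased.
 */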
static void inc_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;

	if (!cluster_info)
		return;
	if (cluster_is_free(&cluster_info[idx]))
		alloc_cluster(p, idx);

	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
	cluster_set_count(&cluster_info[idx],
		cluster_count(&cluster_info[idx]) + 1);
}

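/*
 * The cluster corresponding to page_nr decreases one usage. If the usage
 * counter becomes 0, which means no page in the cluster is in using, we can
 * optionally discard the cluster and add it to free cluster list.
 */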
static void dec_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;

	if (!cluster_info)
		return;

	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
	cluster_set_count(&cluster_info[idx],
		cluster_count(&cluster_info[idx]) - 1);

	if (cluster_count(&cluster_info[idx]) == 0)
		free_cluster(p, idx);
}

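/*
 * It's possible scan_swap_map() uses a free cluster in the middle of free
 * cluster list. Avoiding such abuse to avoid list corruption.
 */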
static bool
scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
	unsigned long offset)
{
	struct percpu_cluster *percpu_cluster;
	bool conflict;

	offset /= SWAPFILE_CLUSTER;
	conflict = !cluster_list_empty(&si->free_clusters) &&
		offset != cluster_list_first(&si->free_clusters) &&
		cluster_is_free(&si->cluster_info[offset]);

	if (!conflict)
		return false;

	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
	cluster_set_null(&percpu_cluster->index);
	return true;
}

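/*
 * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
 * might involve allocating a new cluster for current CPU too.
 */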
static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
	unsigned long *offset, unsigned long *scan_base)
{
	struct percpu_cluster *cluster;
	struct swap_cluster_info *ci;
	bool found_free;
	unsigned long tmp, max;

new_cluster:
	cluster = this_cpu_ptr(si->percpu_cluster);
	if (cluster_is_null(&cluster->index)) {
		if (!cluster_list_empty(&si->free_clusters)) {
			cluster->index = si->free_clusters.head;
			cluster->next = cluster_next(&cluster->index) *
					SWAPFILE_CLUSTER;
		} else if (!cluster_list_empty(&si->discard_clusters)) {
			/*
			 * we don't have free cluster but have some clusters in
			 * discarding, do discard now and reclaim them
			 */
			swap_do_scheduled_discard(si);
			*scan_base = *offset = si->cluster_next;
			goto new_cluster;
		} else
			return false;
	}

	found_free = false;

	/*
	 * Other CPUs can use our cluster if they can't find a free cluster,
	 * check if there is still free entry in the cluster
	 */
	tmp = cluster->next;
	max = min_t(unsigned long, si->max,
		    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
	if (tmp >= max) {
		cluster_set_null(&cluster->index);
		goto new_cluster;
	}
	ci = lock_cluster(si, tmp);
	while (tmp < max) {
		if (!si->swap_map[tmp]) {
			found_free = true;
			break;
		}
		tmp++;
	}
	unlock_cluster(ci);
	if (!found_free) {
		cluster_set_null(&cluster->index);
		goto new_cluster;
	}
	cluster->next = tmp + 1;
	*offset = tmp;
	*scan_base = tmp;
	return found_free;
}

static void __del_from_avail_list(struct swap_info_struct *p)
{
	int nid;

	for_each_node(nid)
		plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
}

static void del_from_avail_list(struct swap_info_struct *p)
{
	spin_lock(&swap_avail_lock);
	__del_from_avail_list(p);
	spin_unlock(&swap_avail_lock);
}

static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
			     unsigned int nr_entries)
{
	unsigned int end = offset + nr_entries - 1;

	if (offset == si->lowest_bit)
		si->lowest_bit += nr_entries;
	if (end == si->highest_bit)
		si->highest_bit -= nr_entries;
	si->inuse_pages += nr_entries;
	if (si->inuse_pages == si->pages) {
		si->lowest_bit = si->max;
		si->highest_bit = 0;
		del_from_avail_list(si);
	}
}

static void add_to_avail_list(struct swap_info_struct *p)
{
	int nid;

	spin_lock(&swap_avail_lock);
	for_each_node(nid) {
		WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
		plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
	}
	spin_unlock(&swap_avail_lock);
}

static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
			    unsigned int nr_entries)
{
	unsigned long end = offset + nr_entries - 1;
	void (*swap_slot_free_notify)(struct block_device *, unsigned long);

	if (offset < si->lowest_bit)
		si->lowest_bit = offset;
	if (end > si->highest_bit) {
		bool was_full = !si->highest_bit;

		si->highest_bit = end;
		if (was_full && (si->flags & SWP_WRITEOK))
			add_to_avail_list(si);
	}
	atomic_long_add(nr_entries, &nr_swap_pages);
	si->inuse_pages -= nr_entries;
	if (si->flags & SWP_BLKDEV)
		swap_slot_free_notify =
			si->bdev->bd_disk->fops->swap_slot_free_notify;
	else
		swap_slot_free_notify = NULL;
	while (offset <= end) {
		frontswap_invalidate_page(si->type, offset);
		if (swap_slot_free_notify)
			swap_slot_free_notify(si->bdev, offset);
		offset++;
	}
}

static int scan_swap_map_slots(struct swap_info_struct *si,
			       unsigned char usage, int nr,
			       swp_entry_t slots[])
{
	struct swap_cluster_info *ci;
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;
	int n_ret = 0;

	if (nr > SWAP_BATCH)
		nr = SWAP_BATCH;

	/*
	 * We try to cluster swap pages by allocating them sequentially
	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
	 * way, however, we resort to first-free allocation, starting
	 * a new cluster.  This prevents us from scattering swap pages
	 * all over the entire swap partition, so that we reduce
	 * overall disk seek times between swap pages.  -- sct
	 * But we do now try to find an empty cluster.  -Andrea
	 * And we let swap pages go all over an SSD partition.  Hugh
	 */

	si->flags += SWP_SCANNING;
	scan_base = offset = si->cluster_next;

	/* SSD algorithm */
	if (si->cluster_info) {
		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
			goto checks;
		else
			goto scan;
	}

	if (unlikely(!si->cluster_nr--)) {
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			goto checks;
		}

		spin_unlock(&si->lock);

		/*
		 * If seek is expensive, start searching for new cluster from
		 * start of partition, to minimize the span of allocated swap.
		 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
		 * case, just handled by scan_swap_map_try_ssd_cluster() above.
		 */
		scan_base = offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Locate the first empty (unaligned) cluster */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&si->lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		offset = scan_base;
		spin_lock(&si->lock);
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
	}

checks:
	if (si->cluster_info) {
		while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
			/* take a break if we already got some slots */
			if (n_ret)
				goto done;
			if (!scan_swap_map_try_ssd_cluster(si, &offset,
							&scan_base))
				goto scan;
		}
	}
	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	if (!si->highest_bit)
		goto no_page;
	if (offset > si->highest_bit)
		scan_base = offset = si->lowest_bit;

	ci = lock_cluster(si, offset);
	/* reuse swap entry of cache-only swap if not busy. */
	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
		int swap_was_freed;
		unlock_cluster(ci);
		spin_unlock(&si->lock);
		swap_was_freed = __try_to_reclaim_swap(si, offset);
		spin_lock(&si->lock);
		/* entry was freed successfully, try to use this again */
		if (swap_was_freed)
			goto checks;
		goto scan; /* check next one */
	}

	if (si->swap_map[offset]) {
		unlock_cluster(ci);
		if (!n_ret)
			goto scan;
		else
			goto done;
	}
	si->swap_map[offset] = usage;
	inc_cluster_info_page(si, si->cluster_info, offset);
	unlock_cluster(ci);

	swap_range_alloc(si, offset, 1);
	si->cluster_next = offset + 1;
	slots[n_ret++] = swp_entry(si->type, offset);

	/* got enough slots or reach max slots? */
	if ((n_ret == nr) || (offset >= si->highest_bit))
		goto done;

	/* search for next available slot */

	/* time to take a break? */
	if (unlikely(--latency_ration < 0)) {
		if (n_ret)
			goto done;
		spin_unlock(&si->lock);
		cond_resched();
		spin_lock(&si->lock);
		latency_ration = LATENCY_LIMIT;
	}

	/* try to get more slots in cluster */
	if (si->cluster_info) {
		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
			goto checks;
		else
			goto done;
	}
	/* non-ssd case */
	++offset;

	/* non-ssd case, still more slots in cluster? */
	if (si->cluster_nr && !si->swap_map[offset]) {
		--si->cluster_nr;
		goto checks;
	}

done:
	si->flags -= SWP_SCANNING;
	return n_ret;

scan:
	spin_unlock(&si->lock);
	while (++offset <= si->highest_bit) {
		if (!si->swap_map[offset]) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	offset = si->lowest_bit;
	while (offset < scan_base) {
		if (!si->swap_map[offset]) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
		offset++;
	}
	spin_lock(&si->lock);

no_page:
	si->flags -= SWP_SCANNING;
	return n_ret;
}

static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
{
	unsigned long idx;
	struct swap_cluster_info *ci;
	unsigned long offset, i;
	unsigned char *map;

	/*
	 * Should not even be attempting cluster allocations when huge
	 * page swap is disabled.  Warn and fail the allocation.
	 */
	if (!IS_ENABLED(CONFIG_THP_SWAP)) {
		VM_WARN_ON_ONCE(1);
		return 0;
	}

	if (cluster_list_empty(&si->free_clusters))
		return 0;

	idx = cluster_list_first(&si->free_clusters);
	offset = idx * SWAPFILE_CLUSTER;
	ci = lock_cluster(si, offset);
	alloc_cluster(si, idx);
	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);

	map = si->swap_map + offset;
	for (i = 0; i < SWAPFILE_CLUSTER; i++)
		map[i] = SWAP_HAS_CACHE;
	unlock_cluster(ci);
	swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
	*slot = swp_entry(si->type, offset);

	return 1;
}

static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	unsigned long offset = idx * SWAPFILE_CLUSTER;
	struct swap_cluster_info *ci;

	ci = lock_cluster(si, offset);
	cluster_set_count_flag(ci, 0, 0);
	free_cluster(si, idx);
	unlock_cluster(ci);
	swap_range_free(si, offset, SWAPFILE_CLUSTER);
}

static unsigned long scan_swap_map(struct swap_info_struct *si,
				   unsigned char usage)
{
	swp_entry_t entry;
	int n_ret;

	n_ret = scan_swap_map_slots(si, usage, 1, &entry);

	if (n_ret)
		return swp_offset(entry);
	else
		return 0;
}

int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
{
	unsigned long size = swap_entry_size(entry_size);
	struct swap_info_struct *si, *next;
	long avail_pgs;
	int n_ret = 0;
	int node;

	/* Only single cluster request supported */
	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);

	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
	if (avail_pgs <= 0)
		goto noswap;

	if (n_goal > SWAP_BATCH)
		n_goal = SWAP_BATCH;

	if (n_goal > avail_pgs)
		n_goal = avail_pgs;

	atomic_long_sub(n_goal * size, &nr_swap_pages);

	spin_lock(&swap_avail_lock);

start_over:
	node = numa_node_id();
	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
		/* requeue si to after same-priority siblings */
		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
		spin_unlock(&swap_avail_lock);
		spin_lock(&si->lock);
		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
			spin_lock(&swap_avail_lock);
			if (plist_node_empty(&si->avail_lists[node])) {
				spin_unlock(&si->lock);
				goto nextsi;
			}
			WARN(!si->highest_bit,
			     "swap_info %d in list but !highest_bit\n",
			     si->type);
			WARN(!(si->flags & SWP_WRITEOK),
			     "swap_info %d in list but !SWP_WRITEOK\n",
			     si->type);
			__del_from_avail_list(si);
			spin_unlock(&si->lock);
			goto nextsi;
		}
		if (size == SWAPFILE_CLUSTER) {
			if (!(si->flags & SWP_FILE))
				n_ret = swap_alloc_cluster(si, swp_entries);
		} else
			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
						    n_goal, swp_entries);
		spin_unlock(&si->lock);
		if (n_ret || size == SWAPFILE_CLUSTER)
			goto check_out;
		pr_debug("scan_swap_map of si %d failed to find offset\n",
			si->type);

		spin_lock(&swap_avail_lock);
nextsi:
		/*
		 * if we got here, it's likely that si was almost full before,
		 * and since scan_swap_map() can drop the si->lock, multiple
		 * callers probably all tried to get a page from the same si
		 * and it filled up before we could get one; or, the si filled
		 * up between us dropping swap_avail_lock and taking si->lock.
		 * Since we dropped the swap_avail_lock, the swap_avail_head
		 * list may have been modified; so if next is still in the
		 * swap_avail_head list then try it, otherwise start over
		 * if we have not gotten any slots.
		 */
		if (plist_node_empty(&next->avail_lists[node]))
			goto start_over;
	}

	spin_unlock(&swap_avail_lock);

check_out:
	if (n_ret < n_goal)
		atomic_long_add((long)(n_goal - n_ret) * size,
				&nr_swap_pages);
noswap:
	return n_ret;
}

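/* The only caller of this function is now suspend routine */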
swp_entry_t get_swap_page_of_type(int type)
{
	struct swap_info_struct *si;
	pgoff_t offset;

	si = swap_info[type];
	if (!si)
		goto fail;

	spin_lock(&si->lock);
	if (si->flags & SWP_WRITEOK) {
		atomic_long_dec(&nr_swap_pages);
		/* This is called for allocating swap entry, not cache */
		offset = scan_swap_map(si, 1);
		if (offset) {
			spin_unlock(&si->lock);
			return swp_entry(type, offset);
		}
		atomic_long_inc(&nr_swap_pages);
	}
	spin_unlock(&si->lock);
fail:
	return (swp_entry_t) {0};
}

static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;
	unsigned long offset, type;

	if (!entry.val)
		goto out;
	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_nofile;
	p = swap_info[type];
	if (!(p->flags & SWP_USED))
		goto bad_device;
	offset = swp_offset(entry);
	if (offset >= p->max)
		goto bad_offset;
	return p;

bad_offset:
	pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
	goto out;
bad_device:
	pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
	goto out;
bad_nofile:
	pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
out:
	return NULL;
}

static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = __swap_info_get(entry);
	if (!p)
		goto out;
	if (!p->swap_map[swp_offset(entry)])
		goto bad_free;
	return p;

bad_free:
	pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
	goto out;
out:
	return NULL;
}

static struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);
	if (p)
		spin_lock(&p->lock);
	return p;
}

static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
					struct swap_info_struct *q)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);

	if (p != q) {
		if (q != NULL)
			spin_unlock(&q->lock);
		if (p != NULL)
			spin_lock(&p->lock);
	}
	return p;
}

static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
					      unsigned long offset,
					      unsigned char usage)
{
	unsigned char count;
	unsigned char has_cache;

	count = p->swap_map[offset];

	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;

	if (usage == SWAP_HAS_CACHE) {
		VM_BUG_ON(!has_cache);
		has_cache = 0;
	} else if (count == SWAP_MAP_SHMEM) {
		/*
		 * Or we could insist on shmem.c using a special
		 * swap_shmem_free() and free_shmem_swap_and_cache()...
		 */
		count = 0;
	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
		if (count == COUNT_CONTINUED) {
			if (swap_count_continued(p, offset, count))
				count = SWAP_MAP_MAX | COUNT_CONTINUED;
			else
				count = SWAP_MAP_MAX;
		} else
			count--;
	}

	usage = count | has_cache;
	p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;

	return usage;
}

static unsigned char __swap_entry_free(struct swap_info_struct *p,
				       swp_entry_t entry, unsigned char usage)
{
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);

	ci = lock_cluster_or_swap_info(p, offset);
	usage = __swap_entry_free_locked(p, offset, usage);
	unlock_cluster_or_swap_info(p, ci);

	return usage;
}

static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
{
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);
	unsigned char count;

	ci = lock_cluster(p, offset);
	count = p->swap_map[offset];
	VM_BUG_ON(count != SWAP_HAS_CACHE);
	p->swap_map[offset] = 0;
	dec_cluster_info_page(p, p->cluster_info, offset);
	unlock_cluster(ci);

	mem_cgroup_uncharge_swap(entry, 1);
	swap_range_free(p, offset, 1);
}

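/*
 * Caller has made sure that the swap device corresponding to entry
 * is still around or has not been recycled.
 */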
void swap_free(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);
	if (p) {
		if (!__swap_entry_free(p, entry, 1))
			free_swap_slot(entry);
	}
}

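/*
 * Called after dropping swapcache to decrease refcnt to swap entries.
 */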
void put_swap_page(struct page *page, swp_entry_t entry)
{
	unsigned long offset = swp_offset(entry);
	unsigned long idx = offset / SWAPFILE_CLUSTER;
	struct swap_cluster_info *ci;
	struct swap_info_struct *si;
	unsigned char *map;
	unsigned int i, free_entries = 0;
	unsigned char val;
	int size = swap_entry_size(hpage_nr_pages(page));

	si = _swap_info_get(entry);
	if (!si)
		return;

	ci = lock_cluster_or_swap_info(si, offset);
	if (size == SWAPFILE_CLUSTER) {
		VM_BUG_ON(!cluster_is_huge(ci));
		map = si->swap_map + offset;
		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
			val = map[i];
			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
			if (val == SWAP_HAS_CACHE)
				free_entries++;
		}
		cluster_clear_huge(ci);
		if (free_entries == SWAPFILE_CLUSTER) {
			unlock_cluster_or_swap_info(si, ci);
			spin_lock(&si->lock);
			ci = lock_cluster(si, offset);
			memset(map, 0, SWAPFILE_CLUSTER);
			unlock_cluster(ci);
			mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
			swap_free_cluster(si, idx);
			spin_unlock(&si->lock);
			return;
		}
	}
	for (i = 0; i < size; i++, entry.val++) {
		if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
			unlock_cluster_or_swap_info(si, ci);
			free_swap_slot(entry);
			if (i == size - 1)
				return;
			lock_cluster_or_swap_info(si, offset);
		}
	}
	unlock_cluster_or_swap_info(si, ci);
}

#ifdef CONFIG_THP_SWAP
int split_swap_cluster(swp_entry_t entry)
{
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);

	si = _swap_info_get(entry);
	if (!si)
		return -EBUSY;
	ci = lock_cluster(si, offset);
	cluster_clear_huge(ci);
	unlock_cluster(ci);
	return 0;
}
#endif

static int swp_entry_cmp(const void *ent1, const void *ent2)
{
	const swp_entry_t *e1 = ent1, *e2 = ent2;

	return (int)swp_type(*e1) - (int)swp_type(*e2);
}

void swapcache_free_entries(swp_entry_t *entries, int n)
{
	struct swap_info_struct *p, *prev;
	int i;

	if (n <= 0)
		return;

	prev = NULL;
	p = NULL;

	/*
	 * Sort swap entries by swap device, so each lock is only taken once.
	 * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
	 * so low that it isn't necessary to optimize further.
	 */
	if (nr_swapfiles > 1)
		sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
	for (i = 0; i < n; ++i) {
		p = swap_info_get_cont(entries[i], prev);
		if (p)
			swap_entry_free(p, entries[i]);
		prev = p;
	}
	if (p)
		spin_unlock(&p->lock);
}

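/*
 * How many references to page are currently swapped out?
 * This does not give an exact answer when swap count is continued,
 * but does include the high COUNT_CONTINUED flag to allow for that.
 */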
int page_swapcount(struct page *page)
{
	int count = 0;
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	swp_entry_t entry;
	unsigned long offset;

	entry.val = page_private(page);
	p = _swap_info_get(entry);
	if (p) {
		offset = swp_offset(entry);
		ci = lock_cluster_or_swap_info(p, offset);
		count = swap_count(p->swap_map[offset]);
		unlock_cluster_or_swap_info(p, ci);
	}
	return count;
}

int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
{
	pgoff_t offset = swp_offset(entry);

	return swap_count(si->swap_map[offset]);
}

static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
{
	int count = 0;
	pgoff_t offset = swp_offset(entry);
	struct swap_cluster_info *ci;

	ci = lock_cluster_or_swap_info(si, offset);
	count = swap_count(si->swap_map[offset]);
	unlock_cluster_or_swap_info(si, ci);
	return count;
}

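/*
 * How many references to @entry are currently swapped out?
 * This does not give an exact answer when swap count is continued,
 * but does include the high COUNT_CONTINUED flag to allow for that.
 */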
int __swp_swapcount(swp_entry_t entry)
{
	int count = 0;
	struct swap_info_struct *si;

	si = __swap_info_get(entry);
	if (si)
		count = swap_swapcount(si, entry);
	return count;
}

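/*
 * How many references to @entry are currently swapped out?
 * This considers COUNT_CONTINUED so it returns exact answer.
 */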
int swp_swapcount(swp_entry_t entry)
{
	int count, tmp_count, n;
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	struct page *page;
	pgoff_t offset;
	unsigned char *map;

	p = _swap_info_get(entry);
	if (!p)
		return 0;

	offset = swp_offset(entry);

	ci = lock_cluster_or_swap_info(p, offset);

	count = swap_count(p->swap_map[offset]);
	if (!(count & COUNT_CONTINUED))
		goto out;

	count &= ~COUNT_CONTINUED;
	n = SWAP_MAP_MAX + 1;

	page = vmalloc_to_page(p->swap_map + offset);
	offset &= ~PAGE_MASK;
	VM_BUG_ON(page_private(page) != SWP_CONTINUED);

	do {
		page = list_next_entry(page, lru);
		map = kmap_atomic(page);
		tmp_count = map[offset];
		kunmap_atomic(map);

		count += (tmp_count & ~COUNT_CONTINUED) * n;
		n *= (SWAP_CONT_MAX + 1);
	} while (tmp_count & COUNT_CONTINUED);
out:
	unlock_cluster_or_swap_info(p, ci);
	return count;
}

static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
					 swp_entry_t entry)
{
	struct swap_cluster_info *ci;
	unsigned char *map = si->swap_map;
	unsigned long roffset = swp_offset(entry);
	unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
	int i;
	bool ret = false;

	ci = lock_cluster_or_swap_info(si, offset);
	if (!ci || !cluster_is_huge(ci)) {
		if (swap_count(map[roffset]))
			ret = true;
		goto unlock_out;
	}
	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
		if (swap_count(map[offset + i])) {
			ret = true;
			break;
		}
	}
unlock_out:
	unlock_cluster_or_swap_info(si, ci);
	return ret;
}

static bool page_swapped(struct page *page)
{
	swp_entry_t entry;
	struct swap_info_struct *si;

	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
		return page_swapcount(page) != 0;

	page = compound_head(page);
	entry.val = page_private(page);
	si = _swap_info_get(entry);
	if (si)
		return swap_page_trans_huge_swapped(si, entry);
	return false;
}

static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
					 int *total_swapcount)
{
	int i, map_swapcount, _total_mapcount, _total_swapcount;
	unsigned long offset = 0;
	struct swap_info_struct *si;
	struct swap_cluster_info *ci = NULL;
	unsigned char *map = NULL;
	int mapcount, swapcount = 0;

	/* hugetlbfs shouldn't call it */
	VM_BUG_ON_PAGE(PageHuge(page), page);

	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
		mapcount = page_trans_huge_mapcount(page, total_mapcount);
		if (PageSwapCache(page))
			swapcount = page_swapcount(page);
		if (total_swapcount)
			*total_swapcount = swapcount;
		return mapcount + swapcount;
	}

	page = compound_head(page);

	_total_mapcount = _total_swapcount = map_swapcount = 0;
	if (PageSwapCache(page)) {
		swp_entry_t entry;

		entry.val = page_private(page);
		si = _swap_info_get(entry);
		if (si) {
			map = si->swap_map;
			offset = swp_offset(entry);
		}
	}
	if (map)
		ci = lock_cluster(si, offset);
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		mapcount = atomic_read(&page[i]._mapcount) + 1;
		_total_mapcount += mapcount;
		if (map) {
			swapcount = swap_count(map[offset + i]);
			_total_swapcount += swapcount;
		}
		map_swapcount = max(map_swapcount, mapcount + swapcount);
	}
	unlock_cluster(ci);
	if (PageDoubleMap(page)) {
		map_swapcount -= 1;
		_total_mapcount -= HPAGE_PMD_NR;
	}
	mapcount = compound_mapcount(page);
	map_swapcount += mapcount;
	_total_mapcount += mapcount;
	if (total_mapcount)
		*total_mapcount = _total_mapcount;
	if (total_swapcount)
		*total_swapcount = _total_swapcount;

	return map_swapcount;
}

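/*
 * We can write to an anon page without COW if there are no other references
 * to it.  And as a side-effect, free up its swap: because the old content
 * on disk will never be read, and seeking back there to write new content
 * later would only waste time away from clustering.
 *
 * NOTE: total_map_swapcount should not be relied upon by the caller if
 * reuse_swap_page() returns false, but it may be always overwritten
 * (see the other implementation for CONFIG_SWAP=n).
 */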
bool reuse_swap_page(struct page *page, int *total_map_swapcount)
{
	int count, total_mapcount, total_swapcount;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	if (unlikely(PageKsm(page)))
		return false;
	count = page_trans_huge_map_swapcount(page, &total_mapcount,
					      &total_swapcount);
	if (total_map_swapcount)
		*total_map_swapcount = total_mapcount + total_swapcount;
	if (count == 1 && PageSwapCache(page) &&
	    (likely(!PageTransCompound(page)) ||
	     /* The remaining swap count will be freed soon */
	     total_swapcount == page_swapcount(page))) {
		if (!PageWriteback(page)) {
			page = compound_head(page);
			delete_from_swap_cache(page);
			SetPageDirty(page);
		} else {
			swp_entry_t entry;
			struct swap_info_struct *p;

			entry.val = page_private(page);
			p = swap_info_get(entry);
			if (p->flags & SWP_STABLE_WRITES) {
				spin_unlock(&p->lock);
				return false;
			}
			spin_unlock(&p->lock);
		}
	}

	return count <= 1;
}

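/*
 * If swap is getting full, or if there are no more mappings of this page,
 * then try_to_free_swap is called to free its swap space.
 */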
int try_to_free_swap(struct page *page)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (!PageSwapCache(page))
		return 0;
	if (PageWriteback(page))
		return 0;
	if (page_swapped(page))
		return 0;

	/*
	 * Once hibernation has begun to create its image of memory,
	 * there's a danger that one of the calls to try_to_free_swap()
	 * - most probably a call from __try_to_reclaim_swap() while
	 * hibernation is allocating its own swap pages for the image,
	 * but conceivably even a call from memory reclaim - will free
	 * the swap from a page which has already been recorded in the
	 * image as a clean swapcache page, and then reuse its swap for
	 * another page of the image.  On waking from hibernation, the
	 * original page might be freed under memory pressure, then
	 * later read back in from swap, now with the wrong data.
	 *
	 * Hibernation suspends storage while it is writing the image
	 * to disk so check that here.
	 */
	if (pm_suspended_storage())
		return 0;

	page = compound_head(page);
	delete_from_swap_cache(page);
	SetPageDirty(page);
	return 1;
}

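/*
 * Free the swap entry like above, but also try to
 * free the page cache entry if it is the last user.
 */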
int free_swap_and_cache(swp_entry_t entry)
{
	struct swap_info_struct *p;
	struct page *page = NULL;
	unsigned char count;

	if (non_swap_entry(entry))
		return 1;

	p = _swap_info_get(entry);
	if (p) {
		count = __swap_entry_free(p, entry, 1);
		if (count == SWAP_HAS_CACHE &&
		    !swap_page_trans_huge_swapped(p, entry)) {
			page = find_get_page(swap_address_space(entry),
					     swp_offset(entry));
			if (page && !trylock_page(page)) {
				put_page(page);
				page = NULL;
			}
		} else if (!count)
			free_swap_slot(entry);
	}
	if (page) {
		/*
		 * Not mapped elsewhere, or swap space full? Free it!
		 * Also recheck PageSwapCache now page is locked (above).
		 */
		if (PageSwapCache(page) && !PageWriteback(page) &&
		    (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
		    !swap_page_trans_huge_swapped(p, entry)) {
			page = compound_head(page);
			delete_from_swap_cache(page);
			SetPageDirty(page);
		}
		unlock_page(page);
		put_page(page);
	}
	return p != NULL;
}

#ifdef CONFIG_HIBERNATION
/*
 * Find the swap type that corresponds to given device (if any).
 *
 * @offset - number of the PAGE_SIZE-sized block of the device, starting
 * from 0, in which the swap header is expected to be located.
 *
 * This is needed for the suspend to disk (aka swsusp).
 */
int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
{
	struct block_device *bdev = NULL;
	int type;

	if (device)
		bdev = bdget(device);

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		struct swap_info_struct *sis = swap_info[type];

		if (!(sis->flags & SWP_WRITEOK))
			continue;

		if (!bdev) {
			if (bdev_p)
				*bdev_p = bdgrab(sis->bdev);

			spin_unlock(&swap_lock);
			return type;
		}
		if (bdev == sis->bdev) {
			struct swap_extent *se = &sis->first_swap_extent;

			if (se->start_block == offset) {
				if (bdev_p)
					*bdev_p = bdgrab(sis->bdev);

				spin_unlock(&swap_lock);
				bdput(bdev);
				return type;
			}
		}
	}
	spin_unlock(&swap_lock);
	if (bdev)
		bdput(bdev);

	return -ENODEV;
}

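/*
 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
 * corresponding to given index in swap_info (swap type).
 */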
sector_t swapdev_block(int type, pgoff_t offset)
{
	struct block_device *bdev;

	if ((unsigned int)type >= nr_swapfiles)
		return 0;
	if (!(swap_info[type]->flags & SWP_WRITEOK))
		return 0;
	return map_swap_entry(swp_entry(type, offset), &bdev);
}

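/*
 * Return either the total number of swap pages of given type, or the number
 * of free pages of that type (depending on @free)
 *
 * This is needed for software suspend
 */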
unsigned int count_swap_pages(int type, int free)
{
	unsigned int n = 0;

	spin_lock(&swap_lock);
	if ((unsigned int)type < nr_swapfiles) {
		struct swap_info_struct *sis = swap_info[type];

		spin_lock(&sis->lock);
		if (sis->flags & SWP_WRITEOK) {
			n = sis->pages;
			if (free)
				n -= sis->inuse_pages;
		}
		spin_unlock(&sis->lock);
	}
	spin_unlock(&swap_lock);
	return n;
}
#endif /* CONFIG_HIBERNATION */

static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
{
	return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
}

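/*
 * No need to decide whether this PTE shares the swap entry with others,
 * just let do_wp_page work it out if a write is requested later - to
 * force COW, vm_page_prot omits write permission from any private vma.
 */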
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, swp_entry_t entry, struct page *page)
{
	struct page *swapcache;
	struct mem_cgroup *memcg;
	spinlock_t *ptl;
	pte_t *pte;
	int ret = 1;

	swapcache = page;
	page = ksm_might_need_to_copy(page, vma, addr);
	if (unlikely(!page))
		return -ENOMEM;

	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
				&memcg, false)) {
		ret = -ENOMEM;
		goto out_nolock;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
		mem_cgroup_cancel_charge(page, memcg, false);
		ret = 0;
		goto out;
	}

	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
	get_page(page);
	set_pte_at(vma->vm_mm, addr, pte,
		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
	if (page == swapcache) {
		page_add_anon_rmap(page, vma, addr, false);
		mem_cgroup_commit_charge(page, memcg, true, false);
	} else { /* ksm created a completely new copy */
		page_add_new_anon_rmap(page, vma, addr, false);
		mem_cgroup_commit_charge(page, memcg, false, false);
		lru_cache_add_active_or_unevictable(page, vma);
	}
	swap_free(entry);
	/*
	 * Move the page to the active list so it is not
	 * immediately swapped out again after swapon.
	 */
	activate_page(page);
out:
	pte_unmap_unlock(pte, ptl);
out_nolock:
	if (page != swapcache) {
		unlock_page(page);
		put_page(page);
	}
	return ret;
}

static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	pte_t swp_pte = swp_entry_to_pte(entry);
	pte_t *pte;
	int ret = 0;

	/*
	 * We don't actually need pte lock while scanning for swp_pte: since
	 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
	 * page table while we're scanning; though it could get zapped, and on
	 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
	 * of unmatched parts which look like swp_pte, so unuse_pte must
	 * recheck under pte lock.  Scanning without pte lock lets it be
	 * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
	 */
	pte = pte_offset_map(pmd, addr);
	do {
		/*
		 * swapoff spends a _lot_ of time in this loop!
		 * Test inline before going to call unuse_pte.
		 */
		if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
			pte_unmap(pte);
			ret = unuse_pte(vma, pmd, addr, entry, page);
			if (ret)
				goto out;
			pte = pte_offset_map(pmd, addr);
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap(pte - 1);
out:
	return ret;
}

static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	pmd_t *pmd;
	unsigned long next;
	int ret;

	pmd = pmd_offset(pud, addr);
	do {
		cond_resched();
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			continue;
		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
		if (ret)
			return ret;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	pud_t *pud;
	unsigned long next;
	int ret;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
		if (ret)
			return ret;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	p4d_t *p4d;
	unsigned long next;
	int ret;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
		if (ret)
			return ret;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

static int unuse_vma(struct vm_area_struct *vma,
				swp_entry_t entry, struct page *page)
{
	pgd_t *pgd;
	unsigned long addr, end, next;
	int ret;

	if (page_anon_vma(page)) {
		addr = page_address_in_vma(page, vma);
		if (addr == -EFAULT)
			return 0;
		else
			end = addr + PAGE_SIZE;
	} else {
		addr = vma->vm_start;
		end = vma->vm_end;
	}

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
		if (ret)
			return ret;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

static int unuse_mm(struct mm_struct *mm,
				swp_entry_t entry, struct page *page)
{
	struct vm_area_struct *vma;
	int ret = 0;

	if (!down_read_trylock(&mm->mmap_sem)) {
		/*
		 * Activate page so shrink_inactive_list is unlikely to unuse
		 * page: used here to kick the remove_exclusive_swap_page race.
		 */
		activate_page(page);
		unlock_page(page);
		down_read(&mm->mmap_sem);
		lock_page(page);
	}
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
			break;
		cond_resched();
	}
	up_read(&mm->mmap_sem);
	return (ret < 0)? ret: 0;
}

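/*
 * Scan swap_map (or frontswap_map if frontswap parameter is true)
 * from current position to next entry still in use.
 * Recycle to start on reaching the end, returning 0 when empty.
 */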
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
					unsigned int prev, bool frontswap)
{
	unsigned int max = si->max;
	unsigned int i = prev;
	unsigned char count;

	/*
	 * No need for swap_lock here: we're just looking
	 * for whether an entry is in use, not modifying it; false
	 * hits are okay, and sys_swapoff() has already prevented new
	 * allocations from this area (while holding swap_lock).
	 */
	for (;;) {
		if (++i >= max) {
			if (!prev) {
				i = 0;
				break;
			}
			/*
			 * No entries in use at top of swap_map,
			 * loop back to start and recheck there.
			 */
			max = prev + 1;
			prev = 0;
			i = 1;
		}
		count = READ_ONCE(si->swap_map[i]);
		if (count && swap_count(count) != SWAP_MAP_BAD)
			if (!frontswap || frontswap_test(si, i))
				break;
		if ((i % LATENCY_LIMIT) == 0)
			cond_resched();
	}
	return i;
}

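/*
 * We completely avoid races by reading each swap page in advance,
 * and then search for the process using it.  All the necessary
 * page table adjustments can then be made atomically.
 *
 * if the boolean frontswap is true, only unuse pages_to_unuse pages;
 * pages_to_unuse==0 means all pages; ignored if frontswap is false
 */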
int try_to_unuse(unsigned int type, bool frontswap,
		 unsigned long pages_to_unuse)
{
	struct swap_info_struct *si = swap_info[type];
	struct mm_struct *start_mm;
	volatile unsigned char *swap_map; /* swap_map is accessed without
					   * locking. Mark it as volatile
					   * to prevent compiler doing
					   * something odd.
					   */
	unsigned char swcount;
	struct page *page;
	swp_entry_t entry;
	unsigned int i = 0;
	int retval = 0;

	/*
	 * When searching mms for an entry, a good strategy is to
	 * start at the first mm we freed the previous entry from
	 * (though actually we don't notice whether we or coincidence
	 * freed the entry).  Initialize this start_mm with a hold.
	 *
	 * A simpler strategy would be to start at the last mm we
	 * freed the previous entry from; but that would take less
	 * advantage of mmlist ordering, which clusters forked mms
	 * together, child after parent.  If we race with dup_mmap(), we
	 * prefer to resolve parent before child, lest we miss entries
	 * duplicated after we scanned child: using last mm would invert
	 * that.
	 */
	start_mm = &init_mm;
	mmget(&init_mm);

	/*
	 * Keep on scanning until all entries have gone.  Usually,
	 * one pass through swap_map is enough, but not necessarily:
	 * there are races when an instance of an entry might be missed.
	 */
	while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}

		/*
		 * Get a page for the entry, using the existing swap
		 * cache page if there is one.  Otherwise, get a clean
		 * page and read the swap into it.
		 */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry,
					GFP_HIGHUSER_MOVABLE, NULL, 0, false);
		if (!page) {
			/*
			 * Either swap_duplicate() failed because entry
			 * has been freed independently, and will not be
			 * reused since sys_swapoff() already disabled
			 * allocation from here, or alloc_page() failed.
			 */
			swcount = *swap_map;
			/*
			 * We don't hold lock here, so the swap entry could be
			 * SWAP_MAP_BAD (when the cluster is discarding).
			 * Instead of fail out, We can just skip the swap
			 * entry because swapoff will wait for discarding
			 * finish anyway.
			 */
			if (!swcount || swcount == SWAP_MAP_BAD)
				continue;
			retval = -ENOMEM;
			break;
		}

		/*
		 * Don't hold on to start_mm if it looks like exiting.
		 */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			mmget(&init_mm);
		}

		/*
		 * Wait for and lock page.  When do_swap_page races with
		 * try_to_unuse, do_swap_page can handle the fault much
		 * faster than try_to_unuse can locate the entry.  This
		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
		 * defer to do_swap_page in such a case - in some tests,
		 * do_swap_page and try_to_unuse repeatedly compete.
		 */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/*
		 * Remove all references to entry.
		 */
		swcount = *swap_map;
		if (swap_count(swcount) == SWAP_MAP_SHMEM) {
			retval = shmem_unuse(entry, page);
			/* page has already been unlocked and released */
			if (retval < 0)
				break;
			continue;
		}
		if (swap_count(swcount) && start_mm != &init_mm)
			retval = unuse_mm(start_mm, entry, page);

		if (swap_count(*swap_map)) {
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *prev_mm = start_mm;
			struct mm_struct *mm;

			mmget(new_start_mm);
			mmget(prev_mm);
			spin_lock(&mmlist_lock);
			while (swap_count(*swap_map) && !retval &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				if (!mmget_not_zero(mm))
					continue;
				spin_unlock(&mmlist_lock);
				mmput(prev_mm);
				prev_mm = mm;

				cond_resched();

				swcount = *swap_map;
				if (!swap_count(swcount)) /* any usage of this swap? */
					;
				else if (mm == &init_mm)
					set_start_mm = 1;
				else
					retval = unuse_mm(mm, entry, page);

				if (set_start_mm && *swap_map < swcount) {
					mmput(new_start_mm);
					mmget(mm);
					new_start_mm = mm;
					set_start_mm = 0;
				}
				spin_lock(&mmlist_lock);
			}
			spin_unlock(&mmlist_lock);
			mmput(prev_mm);
			mmput(start_mm);
			start_mm = new_start_mm;
		}
		if (retval) {
			unlock_page(page);
			put_page(page);
			break;
		}

		/*
		 * If a reference remains (rare), we would like to leave
		 * the page in the swap cache; but try_to_unmap could
		 * then re-duplicate the entry once we drop page lock,
		 * so we might loop indefinitely; also, that page could
		 * not be swapped out to other storage meanwhile.  So:
		 * delete from cache even if there's another reference,
		 * after ensuring that the data has been saved to disk -
		 * since if the reference remains (rarer), it will be
		 * read from disk into another page.  Splitting into two
		 * pages would be incorrect if swap supported "shared
		 * private" pages, but they are handled by tmpfs files.
		 *
		 * Given how unuse_vma() targets one particular offset
		 * in an anon_vma, once the anon_vma has been determined,
		 * this splitting happens to be just what is needed to
		 * handle where KSM pages have been swapped out: re-reading
		 * is unexpectedly slow, but KSM is rare enough, and this
		 * is rare even for KSM, that we should not worry about it.
		 */
		if (swap_count(*swap_map) &&
		     PageDirty(page) && PageSwapCache(page)) {
			struct writeback_control wbc = {
				.sync_mode = WB_SYNC_NONE,
			};

			swap_writepage(compound_head(page), &wbc);
			lock_page(page);
			wait_on_page_writeback(page);
		}

		/*
		 * It is conceivable that a racing task removed this page from
		 * swap cache just before we acquired the page lock at the top,
		 * or while we dropped it in unuse_mm().  The page might even
		 * be back in swap cache on another swap area: that we must not
		 * delete, since it may not have been written out to swap yet.
		 */
		if (PageSwapCache(page) &&
		    likely(page_private(page) == entry.val) &&
		    !page_swapped(page))
			delete_from_swap_cache(compound_head(page));

		/*
		 * So we could skip searching mms once swap count went
		 * to 1, we did not mark any present ptes as dirty: must
		 * mark page dirty so shrink_page_list will preserve it.
		 */
		SetPageDirty(page);
		unlock_page(page);
		put_page(page);

		/*
		 * Make sure that we aren't completely killing
		 * interactive performance.
		 */
		cond_resched();
		if (frontswap && pages_to_unuse > 0) {
			if (!--pages_to_unuse)
				break;
		}
	}

	mmput(start_mm);
	return retval;
}

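/*
 * After a successful try_to_unuse, if no swap is now in use, we know
 * we can empty the mmlist.  swap_lock must be held on entry and exit.
 * Note that mmlist_lock nests inside swap_lock, and an mm must be
 * added to the mmlist just after page_duplicate - before would be racy.
 */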
static void drain_mmlist(void)
{
	struct list_head *p, *next;
	unsigned int type;

	for (type = 0; type < nr_swapfiles; type++)
		if (swap_info[type]->inuse_pages)
			return;
	spin_lock(&mmlist_lock);
	list_for_each_safe(p, next, &init_mm.mmlist)
		list_del_init(p);
	spin_unlock(&mmlist_lock);
}

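/*
 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
 * corresponds to page offset for the specified swap entry.
 * Note that the type of this function is sector_t, but it returns page offset
 * into the bdev, not sector offset.
 */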
static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
{
	struct swap_info_struct *sis;
	struct swap_extent *start_se;
	struct swap_extent *se;
	pgoff_t offset;

	sis = swap_info[swp_type(entry)];
	*bdev = sis->bdev;

	offset = swp_offset(entry);
	start_se = sis->curr_swap_extent;
	se = start_se;

	for ( ; ; ) {
		if (se->start_page <= offset &&
				offset < (se->start_page + se->nr_pages)) {
			return se->start_block + (offset - se->start_page);
		}
		se = list_next_entry(se, list);
		sis->curr_swap_extent = se;
		BUG_ON(se == start_se);		/* It *must* be present */
	}
}

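/*
 * Returns the page offset into bdev for the specified page's swap entry.
 */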
sector_t map_swap_page(struct page *page, struct block_device **bdev)
{
	swp_entry_t entry;
	entry.val = page_private(page);
	return map_swap_entry(entry, bdev);
}

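/*
 * Free all of a swapdev's extent information
 */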
static void destroy_swap_extents(struct swap_info_struct *sis)
{
	while (!list_empty(&sis->first_swap_extent.list)) {
		struct swap_extent *se;

		se = list_first_entry(&sis->first_swap_extent.list,
				struct swap_extent, list);
		list_del(&se->list);
		kfree(se);
	}

	if (sis->flags & SWP_FILE) {
		struct file *swap_file = sis->swap_file;
		struct address_space *mapping = swap_file->f_mapping;

		sis->flags &= ~SWP_FILE;
		mapping->a_ops->swap_deactivate(swap_file);
	}
}

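/*
 * Add a block range (and the corresponding page range) into this swapdev's
 * extent list.  The extent list is kept sorted in page order.
 *
 * This function rather assumes that it is called in ascending page order.
 */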
int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
		unsigned long nr_pages, sector_t start_block)
{
	struct swap_extent *se;
	struct swap_extent *new_se;
	struct list_head *lh;

	if (start_page == 0) {
		se = &sis->first_swap_extent;
		sis->curr_swap_extent = se;
		se->start_page = 0;
		se->nr_pages = nr_pages;
		se->start_block = start_block;
		return 1;
	} else {
		lh = sis->first_swap_extent.list.prev;	/* Highest extent */
		se = list_entry(lh, struct swap_extent, list);
		BUG_ON(se->start_page + se->nr_pages != start_page);
		if (se->start_block + se->nr_pages == start_block) {
			/* Merge it */
			se->nr_pages += nr_pages;
			return 0;
		}
	}

	/*
	 * No merge.  Insert a new extent, preserving ordering.
	 */
	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
	if (new_se == NULL)
		return -ENOMEM;
	new_se->start_page = start_page;
	new_se->nr_pages = nr_pages;
	new_se->start_block = start_block;

	list_add_tail(&new_se->list, &sis->first_swap_extent.list);
	return 1;
}

/*
 * A `swap extent' is a simple thing which maps a contiguous range of pages
 * onto a contiguous range of disk blocks.  An ordered list of swap extents
 * is built at swapon time and is then used at swap_writepage/swap_readpage
 * time for locating where on disk a page belongs.
 *
 * If the swapfile is an S_ISBLK block device, a single extent is installed.
 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
 * swap files identically.
 *
 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
 * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
 * swapfiles are handled *identically* after swapon time.
 *
 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
 * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
 * requirements, they are simply tossed out - we will never use those blocks
 * for swapping.
 *
 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon.  This
 * prevents root from shooting her foot off by ftruncating an in-use swapfile,
 * which will scribble on the fs.
 *
 * The amount of disk space which a single swap extent represents varies.
 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
 * extents in the list.  To avoid much list walking, we cache the previous
 * search location in `curr_swap_extent', and start new searches from there.
 */
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
	struct file *swap_file = sis->swap_file;
	struct address_space *mapping = swap_file->f_mapping;
	struct inode *inode = mapping->host;
	int ret;

	if (S_ISBLK(inode->i_mode)) {
		ret = add_swap_extent(sis, 0, sis->max, 0);
		*span = sis->pages;
		return ret;
	}

	if (mapping->a_ops->swap_activate) {
		ret = mapping->a_ops->swap_activate(sis, swap_file, span);
		if (!ret) {
			sis->flags |= SWP_FILE;
			ret = add_swap_extent(sis, 0, sis->max, 0);
			*span = sis->pages;
		}
		return ret;
	}

	return generic_swapfile_activate(sis, swap_file, span);
}

static int swap_node(struct swap_info_struct *p)
{
	struct block_device *bdev;

	if (p->bdev)
		bdev = p->bdev;
	else
		bdev = p->swap_file->f_inode->i_sb->s_bdev;

	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
}

static void _enable_swap_info(struct swap_info_struct *p, int prio,
				unsigned char *swap_map,
				struct swap_cluster_info *cluster_info)
{
	int i;

	if (prio >= 0)
		p->prio = prio;
	else
		p->prio = --least_priority;
	/*
	 * the plist prio is negated because plist ordering is
	 * low-to-high, while swap ordering is high-to-low
	 */
	p->list.prio = -p->prio;
	for_each_node(i) {
		if (p->prio >= 0)
			p->avail_lists[i].prio = -p->prio;
		else {
			if (swap_node(p) == i)
				p->avail_lists[i].prio = 1;
			else
				p->avail_lists[i].prio = -p->prio;
		}
	}
	p->swap_map = swap_map;
	p->cluster_info = cluster_info;
	p->flags |= SWP_WRITEOK;
	atomic_long_add(p->pages, &nr_swap_pages);
	total_swap_pages += p->pages;

	assert_spin_locked(&swap_lock);
	/*
	 * both lists are plists, and thus priority ordered.
	 * swap_active_head needs to be priority ordered for swapoff(),
	 * which on removal of any swap_info_struct with an auto-assigned
	 * (i.e. negative) priority increments the auto-assigned priority
	 * of any lower-priority swap_info_structs.
	 * swap_avail_head needs to be priority ordered for get_swap_page(),
	 * which allocates swap pages from the highest available priority
	 * swap_info_struct.
	 */
	plist_add(&p->list, &swap_active_head);
	add_to_avail_list(p);
}

static void enable_swap_info(struct swap_info_struct *p, int prio,
				unsigned char *swap_map,
				struct swap_cluster_info *cluster_info,
				unsigned long *frontswap_map)
{
	frontswap_init(p->type, frontswap_map);
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	_enable_swap_info(p, prio, swap_map, cluster_info);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
}

static void reinsert_swap_info(struct swap_info_struct *p)
{
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	_enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
}

bool has_usable_swap(void)
{
	bool ret = true;

	spin_lock(&swap_lock);
	if (plist_head_empty(&swap_active_head))
		ret = false;
	spin_unlock(&swap_lock);
	return ret;
}

2516SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2517{
2518 struct swap_info_struct *p = NULL;
2519 unsigned char *swap_map;
2520 struct swap_cluster_info *cluster_info;
2521 unsigned long *frontswap_map;
2522 struct file *swap_file, *victim;
2523 struct address_space *mapping;
2524 struct inode *inode;
2525 struct filename *pathname;
2526 int err, found = 0;
2527 unsigned int old_block_size;
2528
2529 if (!capable(CAP_SYS_ADMIN))
2530 return -EPERM;
2531
2532 BUG_ON(!current->mm);
2533
2534 pathname = getname(specialfile);
2535 if (IS_ERR(pathname))
2536 return PTR_ERR(pathname);
2537
2538 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
2539 err = PTR_ERR(victim);
2540 if (IS_ERR(victim))
2541 goto out;
2542
2543 mapping = victim->f_mapping;
2544 spin_lock(&swap_lock);
2545 plist_for_each_entry(p, &swap_active_head, list) {
2546 if (p->flags & SWP_WRITEOK) {
2547 if (p->swap_file->f_mapping == mapping) {
2548 found = 1;
2549 break;
2550 }
2551 }
2552 }
2553 if (!found) {
2554 err = -EINVAL;
2555 spin_unlock(&swap_lock);
2556 goto out_dput;
2557 }
2558 if (!security_vm_enough_memory_mm(current->mm, p->pages))
2559 vm_unacct_memory(p->pages);
2560 else {
2561 err = -ENOMEM;
2562 spin_unlock(&swap_lock);
2563 goto out_dput;
2564 }
2565 del_from_avail_list(p);
2566 spin_lock(&p->lock);
2567 if (p->prio < 0) {
2568 struct swap_info_struct *si = p;
2569 int nid;
2570
2571 plist_for_each_entry_continue(si, &swap_active_head, list) {
2572 si->prio++;
2573 si->list.prio--;
2574 for_each_node(nid) {
2575 if (si->avail_lists[nid].prio != 1)
2576 si->avail_lists[nid].prio--;
2577 }
2578 }
2579 least_priority++;
2580 }
2581 plist_del(&p->list, &swap_active_head);
2582 atomic_long_sub(p->pages, &nr_swap_pages);
2583 total_swap_pages -= p->pages;
2584 p->flags &= ~SWP_WRITEOK;
2585 spin_unlock(&p->lock);
2586 spin_unlock(&swap_lock);
2587
2588 disable_swap_slots_cache_lock();
2589
2590 set_current_oom_origin();
2591 err = try_to_unuse(p->type, false, 0);
2592 clear_current_oom_origin();
2593
2594 if (err) {
2595
2596 reinsert_swap_info(p);
2597 reenable_swap_slots_cache_unlock();
2598 goto out_dput;
2599 }
2600
2601 reenable_swap_slots_cache_unlock();
2602
2603 flush_work(&p->discard_work);
2604
2605 destroy_swap_extents(p);
2606 if (p->flags & SWP_CONTINUED)
2607 free_swap_count_continuations(p);
2608
2609 if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
2610 atomic_dec(&nr_rotate_swap);
2611
2612 mutex_lock(&swapon_mutex);
2613 spin_lock(&swap_lock);
2614 spin_lock(&p->lock);
2615 drain_mmlist();
2616
2617
2618 p->highest_bit = 0;
2619 while (p->flags >= SWP_SCANNING) {
2620 spin_unlock(&p->lock);
2621 spin_unlock(&swap_lock);
2622 schedule_timeout_uninterruptible(1);
2623 spin_lock(&swap_lock);
2624 spin_lock(&p->lock);
2625 }
2626
2627 swap_file = p->swap_file;
2628 old_block_size = p->old_block_size;
2629 p->swap_file = NULL;
2630 p->max = 0;
2631 swap_map = p->swap_map;
2632 p->swap_map = NULL;
2633 cluster_info = p->cluster_info;
2634 p->cluster_info = NULL;
2635 frontswap_map = frontswap_map_get(p);
2636 spin_unlock(&p->lock);
2637 spin_unlock(&swap_lock);
2638 frontswap_invalidate_area(p->type);
2639 frontswap_map_set(p, NULL);
2640 mutex_unlock(&swapon_mutex);
2641 free_percpu(p->percpu_cluster);
2642 p->percpu_cluster = NULL;
2643 vfree(swap_map);
2644 kvfree(cluster_info);
2645 kvfree(frontswap_map);
2646
2647 swap_cgroup_swapoff(p->type);
2648 exit_swap_address_space(p->type);
2649
2650 inode = mapping->host;
2651 if (S_ISBLK(inode->i_mode)) {
2652 struct block_device *bdev = I_BDEV(inode);
2653 set_blocksize(bdev, old_block_size);
2654 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2655 } else {
2656 inode_lock(inode);
2657 inode->i_flags &= ~S_SWAPFILE;
2658 inode_unlock(inode);
2659 }
2660 filp_close(swap_file, NULL);
2661
2662
2663
2664
2665
2666
2667 spin_lock(&swap_lock);
2668 p->flags = 0;
2669 spin_unlock(&swap_lock);
2670
2671 err = 0;
2672 atomic_inc(&proc_poll_event);
2673 wake_up_interruptible(&proc_poll_wait);
2674
2675out_dput:
2676 filp_close(victim, NULL);
2677out:
2678 putname(pathname);
2679 return err;
2680}
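
/*
 * From userspace this is reached via swapoff(2), e.g. through the
 * glibc wrapper (a hypothetical minimal caller, for illustration):
 *
 *	#include <sys/swap.h>
 *
 *	if (swapoff("/dev/sda2") != 0)
 *		perror("swapoff");
 *
 * CAP_SYS_ADMIN is required; the call fails with -EINVAL if the path
 * is not an active swap device, and with -ENOMEM if there is too
 * little free memory left to absorb the device's in-use pages.
 */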

#ifdef CONFIG_PROC_FS
/* Report a poll event on /proc/swaps whenever the swap configuration changes. */
static __poll_t swaps_poll(struct file *file, poll_table *wait)
{
	struct seq_file *seq = file->private_data;

	poll_wait(file, &proc_poll_wait, wait);

	if (seq->poll_event != atomic_read(&proc_poll_event)) {
		seq->poll_event = atomic_read(&proc_poll_event);
		return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
	}

	return EPOLLIN | EPOLLRDNORM;
}
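
/*
 * Userspace can watch for swapon/swapoff events by polling /proc/swaps;
 * a sketch (illustrative only, error handling omitted):
 *
 *	struct pollfd pfd = { .fd = open("/proc/swaps", O_RDONLY),
 *			      .events = POLLERR | POLLPRI };
 *	poll(&pfd, 1, -1);	/- wakes when swap configuration changes -/
 *
 * Monitoring daemons such as systemd rely on this to notice swap
 * devices coming and going.
 */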

/* iterator */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
	struct swap_info_struct *si;
	int type;
	loff_t l = *pos;

	mutex_lock(&swapon_mutex);

	if (!l)
		return SEQ_START_TOKEN;

	for (type = 0; type < nr_swapfiles; type++) {
		smp_rmb();	/* read nr_swapfiles before swap_info[type] */
		si = swap_info[type];
		if (!(si->flags & SWP_USED) || !si->swap_map)
			continue;
		if (!--l)
			return si;
	}

	return NULL;
}

static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
	struct swap_info_struct *si = v;
	int type;

	if (v == SEQ_START_TOKEN)
		type = 0;
	else
		type = si->type + 1;

	for (; type < nr_swapfiles; type++) {
		smp_rmb();	/* read nr_swapfiles before swap_info[type] */
		si = swap_info[type];
		if (!(si->flags & SWP_USED) || !si->swap_map)
			continue;
		++*pos;
		return si;
	}

	return NULL;
}

static void swap_stop(struct seq_file *swap, void *v)
{
	mutex_unlock(&swapon_mutex);
}

static int swap_show(struct seq_file *swap, void *v)
{
	struct swap_info_struct *si = v;
	struct file *file;
	int len;

	if (si == SEQ_START_TOKEN) {
		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
		return 0;
	}

	file = si->swap_file;
	len = seq_file_path(swap, file, " \t\n\\");
	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
			len < 40 ? 40 - len : 1, " ",
			S_ISBLK(file_inode(file)->i_mode) ?
				"partition" : "file\t",
			si->pages << (PAGE_SHIFT - 10),
			si->inuse_pages << (PAGE_SHIFT - 10),
			si->prio);
	return 0;
}
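
/*
 * The resulting /proc/swaps looks roughly like this (sizes in KiB):
 *
 *	Filename				Type		Size	Used	Priority
 *	/dev/sda2                               partition	8388604	1024	-2
 *	/swapfile                               file		2097148	0	-3
 */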

static const struct seq_operations swaps_op = {
	.start =	swap_start,
	.next =		swap_next,
	.stop =		swap_stop,
	.show =		swap_show
};

static int swaps_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int ret;

	ret = seq_open(file, &swaps_op);
	if (ret)
		return ret;

	seq = file->private_data;
	seq->poll_event = atomic_read(&proc_poll_event);
	return 0;
}

static const struct file_operations proc_swaps_operations = {
	.open		= swaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
	.poll		= swaps_poll,
};

static int __init procswaps_init(void)
{
	proc_create("swaps", 0, NULL, &proc_swaps_operations);
	return 0;
}
__initcall(procswaps_init);
#endif /* CONFIG_PROC_FS */

#ifdef MAX_SWAPFILES_CHECK
static int __init max_swapfiles_check(void)
{
	MAX_SWAPFILES_CHECK();
	return 0;
}
late_initcall(max_swapfiles_check);
#endif

static struct swap_info_struct *alloc_swap_info(void)
{
	struct swap_info_struct *p;
	unsigned int type;
	int i;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return ERR_PTR(-ENOMEM);

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		if (!(swap_info[type]->flags & SWP_USED))
			break;
	}
	if (type >= MAX_SWAPFILES) {
		spin_unlock(&swap_lock);
		kfree(p);
		return ERR_PTR(-EPERM);
	}
	if (type >= nr_swapfiles) {
		p->type = type;
		swap_info[type] = p;
		/*
		 * Write swap_info[type] before nr_swapfiles, in case a
		 * racing procfs swap_start() or swap_next() is reading them.
		 * (We never shrink nr_swapfiles, we never free this entry.)
		 */
		smp_wmb();
		nr_swapfiles++;
	} else {
		kfree(p);
		p = swap_info[type];
		/*
		 * Do not memset this entry: a racing procfs swap_next()
		 * would be relying on p->type to remain valid.
		 */
	}
	INIT_LIST_HEAD(&p->first_swap_extent.list);
	plist_node_init(&p->list, 0);
	for_each_node(i)
		plist_node_init(&p->avail_lists[i], 0);
	p->flags = SWP_USED;
	spin_unlock(&swap_lock);
	spin_lock_init(&p->lock);
	spin_lock_init(&p->cont_lock);

	return p;
}

static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
{
	int error;

	if (S_ISBLK(inode->i_mode)) {
		p->bdev = bdgrab(I_BDEV(inode));
		error = blkdev_get(p->bdev,
				   FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
		if (error < 0) {
			p->bdev = NULL;
			return error;
		}
		p->old_block_size = block_size(p->bdev);
		error = set_blocksize(p->bdev, PAGE_SIZE);
		if (error < 0)
			return error;
		p->flags |= SWP_BLKDEV;
	} else if (S_ISREG(inode->i_mode)) {
		p->bdev = inode->i_sb->s_bdev;
		inode_lock(inode);
		if (IS_SWAPFILE(inode))
			return -EBUSY;
	} else
		return -EINVAL;

	return 0;
}

/*
 * Find out how many pages are allowed for a single swap device. There
 * are two limiting factors:
 * 1) the number of bits for the swap offset in the swp_entry_t type, and
 * 2) the number of bits in the swap pte, as defined by the different
 *    architectures.
 *
 * In order to find the largest possible bit mask, a swap entry with
 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
 * decoded to a swp_entry_t again, and finally the swap offset is
 * extracted.
 *
 * This will mask all the bits from the initial ~0UL mask that can't
 * be encoded in either the swp_entry_t or the architecture definition
 * of a swap pte.
 */
unsigned long generic_max_swapfile_size(void)
{
	return swp_offset(pte_to_swp_entry(
			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
}
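
/*
 * For example, if an architecture's swap pte leaves 50 bits for the
 * offset, the ~0UL offset above survives the round trip as
 * (1UL << 50) - 1, so the function reports 1UL << 50 pages: with 4 KiB
 * pages, a 4 EiB cap per device.  The exact figure is arch-defined.
 */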

/* Can be overridden by an architecture for additional checks. */
__weak unsigned long max_swapfile_size(void)
{
	return generic_max_swapfile_size();
}

static unsigned long read_swap_header(struct swap_info_struct *p,
					union swap_header *swap_header,
					struct inode *inode)
{
	int i;
	unsigned long maxpages;
	unsigned long swapfilepages;
	unsigned long last_page;

	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
		pr_err("Unable to find swap-space signature\n");
		return 0;
	}

	/* swap partition endianess hack... */
	if (swab32(swap_header->info.version) == 1) {
		swab32s(&swap_header->info.version);
		swab32s(&swap_header->info.last_page);
		swab32s(&swap_header->info.nr_badpages);
		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
			return 0;
		for (i = 0; i < swap_header->info.nr_badpages; i++)
			swab32s(&swap_header->info.badpages[i]);
	}

	/* Check the swap header's sub-version */
	if (swap_header->info.version != 1) {
		pr_warn("Unable to handle swap header version %d\n",
			swap_header->info.version);
		return 0;
	}

	p->lowest_bit = 1;
	p->cluster_next = 1;
	p->cluster_nr = 0;

	maxpages = max_swapfile_size();
	last_page = swap_header->info.last_page;
	if (!last_page) {
		pr_warn("Empty swap-file\n");
		return 0;
	}
	if (last_page > maxpages) {
		pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
			maxpages << (PAGE_SHIFT - 10),
			last_page << (PAGE_SHIFT - 10));
	}
	if (maxpages > last_page) {
		maxpages = last_page + 1;
		/* p->max is an unsigned int: don't overflow it */
		if ((unsigned int)maxpages == 0)
			maxpages = UINT_MAX;
	}
	p->highest_bit = maxpages - 1;

	if (!maxpages)
		return 0;
	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
	if (swapfilepages && maxpages > swapfilepages) {
		pr_warn("Swap area shorter than signature indicates\n");
		return 0;
	}
	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
		return 0;
	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
		return 0;

	return maxpages;
}
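
/*
 * For reference, the on-disk layout this parses (as written by
 * mkswap(8)) puts the info block near the start of the first page and
 * the 10-byte "SWAPSPACE2" magic at the very end of it; an abbreviated
 * sketch of the union declared in include/linux/swap.h:
 *
 *	union swap_header {
 *		struct { char reserved[PAGE_SIZE - 10]; char magic[10]; } magic;
 *		struct { char bootbits[1024]; __u32 version;
 *			 __u32 last_page; __u32 nr_badpages; ... } info;
 *	};
 */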

#define SWAP_CLUSTER_INFO_COLS						\
	DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
#define SWAP_CLUSTER_SPACE_COLS						\
	DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
#define SWAP_CLUSTER_COLS						\
	max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)

static int setup_swap_map_and_extents(struct swap_info_struct *p,
					union swap_header *swap_header,
					unsigned char *swap_map,
					struct swap_cluster_info *cluster_info,
					unsigned long maxpages,
					sector_t *span)
{
	unsigned int j, k;
	unsigned int nr_good_pages;
	int nr_extents;
	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
	unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
	unsigned long i, idx;

	nr_good_pages = maxpages - 1;	/* omit header page */

	cluster_list_init(&p->free_clusters);
	cluster_list_init(&p->discard_clusters);

	for (i = 0; i < swap_header->info.nr_badpages; i++) {
		unsigned int page_nr = swap_header->info.badpages[i];
		if (page_nr == 0 || page_nr > swap_header->info.last_page)
			return -EINVAL;
		if (page_nr < maxpages) {
			swap_map[page_nr] = SWAP_MAP_BAD;
			nr_good_pages--;
			/*
			 * Haven't marked the cluster free yet, no list
			 * operation involved
			 */
			inc_cluster_info_page(p, cluster_info, page_nr);
		}
	}

	/* Haven't marked the cluster free yet, no list operation involved */
	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
		inc_cluster_info_page(p, cluster_info, i);

	if (nr_good_pages) {
		swap_map[0] = SWAP_MAP_BAD;
		/*
		 * Not mark the cluster free yet, no list
		 * operation involved
		 */
		inc_cluster_info_page(p, cluster_info, 0);
		p->max = maxpages;
		p->pages = nr_good_pages;
		nr_extents = setup_swap_extents(p, span);
		if (nr_extents < 0)
			return nr_extents;
		nr_good_pages = p->pages;
	}
	if (!nr_good_pages) {
		pr_warn("Empty swap-file\n");
		return -EINVAL;
	}

	if (!cluster_info)
		return nr_extents;

	/*
	 * Reduce false cache line sharing between cluster_info and
	 * sharing same address space.
	 */
	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
		j = (k + col) % SWAP_CLUSTER_COLS;
		for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
			idx = i * SWAP_CLUSTER_COLS + j;
			if (idx >= nr_clusters)
				continue;
			if (cluster_count(&cluster_info[idx]))
				continue;
			cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
			cluster_list_add_tail(&p->free_clusters, cluster_info,
					      idx);
		}
	}
	return nr_extents;
}
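
/*
 * The column walk above spreads the initial free-cluster list across
 * cache lines.  For example, with SWAP_CLUSTER_COLS == 3, 8 clusters
 * and col == 0, clusters are queued in the order 0, 3, 6, 1, 4, 7,
 * 2, 5 rather than 0..7, so consecutive allocations from the free list
 * touch different cluster_info cache lines.
 */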

/*
 * Swap discard is only worth issuing if the backing device has a
 * request queue that advertises discard support.
 */
static bool swap_discardable(struct swap_info_struct *si)
{
	struct request_queue *q = bdev_get_queue(si->bdev);

	if (!q || !blk_queue_discard(q))
		return false;

	return true;
}

SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
	struct swap_info_struct *p;
	struct filename *name;
	struct file *swap_file = NULL;
	struct address_space *mapping;
	int prio;
	int error;
	union swap_header *swap_header;
	int nr_extents;
	sector_t span;
	unsigned long maxpages;
	unsigned char *swap_map = NULL;
	struct swap_cluster_info *cluster_info = NULL;
	unsigned long *frontswap_map = NULL;
	struct page *page = NULL;
	struct inode *inode = NULL;
	bool inced_nr_rotate_swap = false;

	if (swap_flags & ~SWAP_FLAGS_VALID)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!swap_avail_heads)
		return -ENOMEM;

	p = alloc_swap_info();
	if (IS_ERR(p))
		return PTR_ERR(p);

	INIT_WORK(&p->discard_work, swap_discard_work);

	name = getname(specialfile);
	if (IS_ERR(name)) {
		error = PTR_ERR(name);
		name = NULL;
		goto bad_swap;
	}
	swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
	if (IS_ERR(swap_file)) {
		error = PTR_ERR(swap_file);
		swap_file = NULL;
		goto bad_swap;
	}

	p->swap_file = swap_file;
	mapping = swap_file->f_mapping;
	inode = mapping->host;

	/* If S_ISREG(inode->i_mode) will do inode_lock(inode); */
	error = claim_swapfile(p, inode);
	if (unlikely(error))
		goto bad_swap;

	/*
	 * Read the swap header.
	 */
	if (!mapping->a_ops->readpage) {
		error = -EINVAL;
		goto bad_swap;
	}
	page = read_mapping_page(mapping, 0, swap_file);
	if (IS_ERR(page)) {
		error = PTR_ERR(page);
		goto bad_swap;
	}
	swap_header = kmap(page);

	maxpages = read_swap_header(p, swap_header, inode);
	if (unlikely(!maxpages)) {
		error = -EINVAL;
		goto bad_swap;
	}

	/* OK, set up the swap map and apply the bad block list */
	swap_map = vzalloc(maxpages);
	if (!swap_map) {
		error = -ENOMEM;
		goto bad_swap;
	}

	if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
		p->flags |= SWP_STABLE_WRITES;

	if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
		p->flags |= SWP_SYNCHRONOUS_IO;

	if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
		int cpu;
		unsigned long ci, nr_cluster;

		p->flags |= SWP_SOLIDSTATE;
		/*
		 * select a random position to start with to help wear leveling
		 * SSD
		 */
		p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
		nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);

		cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
					GFP_KERNEL);
		if (!cluster_info) {
			error = -ENOMEM;
			goto bad_swap;
		}

		for (ci = 0; ci < nr_cluster; ci++)
			spin_lock_init(&((cluster_info + ci)->lock));

		p->percpu_cluster = alloc_percpu(struct percpu_cluster);
		if (!p->percpu_cluster) {
			error = -ENOMEM;
			goto bad_swap;
		}
		for_each_possible_cpu(cpu) {
			struct percpu_cluster *cluster;
			cluster = per_cpu_ptr(p->percpu_cluster, cpu);
			cluster_set_null(&cluster->index);
		}
	} else {
		atomic_inc(&nr_rotate_swap);
		inced_nr_rotate_swap = true;
	}

	error = swap_cgroup_swapon(p->type, maxpages);
	if (error)
		goto bad_swap;

	nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
		cluster_info, maxpages, &span);
	if (unlikely(nr_extents < 0)) {
		error = nr_extents;
		goto bad_swap;
	}
	/* frontswap enabled? set up bit-per-page map for frontswap */
	if (IS_ENABLED(CONFIG_FRONTSWAP))
		frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
					 sizeof(long),
					 GFP_KERNEL);

	if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
		/*
		 * When discard is enabled for swap with no particular
		 * policy flagged, we set all swap discard flags here in
		 * order to sustain backward compatibility with older
		 * swapon(8) releases.
		 */
		p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
			     SWP_PAGE_DISCARD);

		/*
		 * By flagging sys_swapon, a sysadmin can tell us to
		 * either do single-time area discards only, or to just
		 * perform discards for released swap page-clusters.
		 * Now it's time to adjust the p->flags accordingly.
		 */
		if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
			p->flags &= ~SWP_PAGE_DISCARD;
		else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
			p->flags &= ~SWP_AREA_DISCARD;

		/* issue a swapon-time discard if it's still required */
		if (p->flags & SWP_AREA_DISCARD) {
			int err = discard_swap(p);
			if (unlikely(err))
				pr_err("swapon: discard_swap(%p): %d\n",
					p, err);
		}
	}

	error = init_swap_address_space(p->type, maxpages);
	if (error)
		goto bad_swap;

	mutex_lock(&swapon_mutex);
	prio = -1;
	if (swap_flags & SWAP_FLAG_PREFER)
		prio =
		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
	enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);

	pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
		p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
		(p->flags & SWP_DISCARDABLE) ? "D" : "",
		(p->flags & SWP_AREA_DISCARD) ? "s" : "",
		(p->flags & SWP_PAGE_DISCARD) ? "c" : "",
		(frontswap_map) ? "FS" : "");

	mutex_unlock(&swapon_mutex);
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

	if (S_ISREG(inode->i_mode))
		inode->i_flags |= S_SWAPFILE;
	error = 0;
	goto out;
bad_swap:
	free_percpu(p->percpu_cluster);
	p->percpu_cluster = NULL;
	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
		set_blocksize(p->bdev, p->old_block_size);
		blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	}
	destroy_swap_extents(p);
	swap_cgroup_swapoff(p->type);
	spin_lock(&swap_lock);
	p->swap_file = NULL;
	p->flags = 0;
	spin_unlock(&swap_lock);
	vfree(swap_map);
	kvfree(cluster_info);
	kvfree(frontswap_map);
	if (inced_nr_rotate_swap)
		atomic_dec(&nr_rotate_swap);
	if (swap_file) {
		if (inode && S_ISREG(inode->i_mode)) {
			inode_unlock(inode);
			inode = NULL;
		}
		filp_close(swap_file, NULL);
	}
out:
	if (page && !IS_ERR(page)) {
		kunmap(page);
		put_page(page);
	}
	if (name)
		putname(name);
	if (inode && S_ISREG(inode->i_mode))
		inode_unlock(inode);
	if (!error)
		enable_swap_slots_cache();
	return error;
}
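
/*
 * From userspace the priority is packed into the flags argument of
 * swapon(2); a hypothetical caller enabling a device at priority 5
 * with per-cluster discards only (using the glibc copies of these
 * flags from <sys/swap.h>; SWAP_FLAG_DISCARD_PAGES may need to be
 * defined locally from the kernel header):
 *
 *	swapon("/dev/sdb1",
 *	       SWAP_FLAG_PREFER | (5 << SWAP_FLAG_PRIO_SHIFT) |
 *	       SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_PAGES);
 *
 * Without SWAP_FLAG_PREFER the device gets the next auto-assigned
 * negative priority (see _enable_swap_info() above).
 */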

void si_swapinfo(struct sysinfo *val)
{
	unsigned int type;
	unsigned long nr_to_be_unused = 0;

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		struct swap_info_struct *si = swap_info[type];

		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
			nr_to_be_unused += si->inuse_pages;
	}
	val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
	val->totalswap = total_swap_pages + nr_to_be_unused;
	spin_unlock(&swap_lock);
}

/*
 * Verify that a swap entry is valid and increment its swap map count.
 *
 * Returns error if swap entry is invalid.
 *
 * Note: return code is different from swap_duplicate():
 * - success -> 0
 * - swp_entry is invalid -> EINVAL
 * - swp_entry is migration entry -> EINVAL
 * - swap-cache reference is requested but there is already one. -> EEXIST
 * - swap-cache reference is requested but the entry is not used. -> ENOENT
 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
 */
static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	unsigned long offset, type;
	unsigned char count;
	unsigned char has_cache;
	int err = -EINVAL;

	if (non_swap_entry(entry))
		goto out;

	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_file;
	p = swap_info[type];
	offset = swp_offset(entry);
	if (unlikely(offset >= p->max))
		goto out;

	ci = lock_cluster_or_swap_info(p, offset);

	count = p->swap_map[offset];

	/*
	 * swapin_readahead() doesn't check if a swap entry is valid, so the
	 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
	 */
	if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
		err = -ENOENT;
		goto unlock_out;
	}

	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;
	err = 0;

	if (usage == SWAP_HAS_CACHE) {

		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
		if (!has_cache && count)
			has_cache = SWAP_HAS_CACHE;
		else if (has_cache)		/* someone else added cache */
			err = -EEXIST;
		else				/* no users remaining */
			err = -ENOENT;

	} else if (count || has_cache) {

		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
			count += usage;
		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
			err = -EINVAL;
		else if (swap_count_continued(p, offset, count))
			count = COUNT_CONTINUED;
		else
			err = -ENOMEM;
	} else
		err = -ENOENT;			/* unused swap entry */

	p->swap_map[offset] = count | has_cache;

unlock_out:
	unlock_cluster_or_swap_info(p, ci);
out:
	return err;

bad_file:
	pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
	goto out;
}
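
/*
 * Each swap_map byte is a reference count plus flags.  For example,
 * with SWAP_HAS_CACHE at 0x40, a byte of 0x42 means the entry has two
 * page-table references and a swap-cache page, while 0x01 is a lone
 * pte reference; counts that would exceed SWAP_MAP_MAX overflow into
 * the continuation pages managed below.
 */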

/*
 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
 * (in which case its reference count is never incremented).
 */
void swap_shmem_alloc(swp_entry_t entry)
{
	__swap_duplicate(entry, SWAP_MAP_SHMEM);
}

/*
 * Increase reference count of swap entry by 1.
 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
 * but could not be atomically allocated.  Returns 0, just as if it succeeded,
 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
 * might occur if a page table entry has got corrupted.
 */
int swap_duplicate(swp_entry_t entry)
{
	int err = 0;

	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
		err = add_swap_count_continuation(entry, GFP_ATOMIC);
	return err;
}

/*
 * @entry: swap entry for which we allocate swap cache.
 *
 * Called when allocating swap cache for an existing swap entry.
 * Returns 0 on success, or an error code:
 * -EEXIST means there is already a swap cache for the entry.
 * Note: return code is different from swap_duplicate().
 */
int swapcache_prepare(swp_entry_t entry)
{
	return __swap_duplicate(entry, SWAP_HAS_CACHE);
}

struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
	return swap_info[swp_type(entry)];
}

struct swap_info_struct *page_swap_info(struct page *page)
{
	swp_entry_t entry = { .val = page_private(page) };
	return swp_swap_info(entry);
}

/*
 * out-of-line __page_file_ methods to avoid include hell.
 */
struct address_space *__page_file_mapping(struct page *page)
{
	return page_swap_info(page)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(__page_file_mapping);

pgoff_t __page_file_index(struct page *page)
{
	swp_entry_t swap = { .val = page_private(page) };
	return swp_offset(swap);
}
EXPORT_SYMBOL_GPL(__page_file_index);

/*
 * add_swap_count_continuation - called when a swap count is duplicated
 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
 * page of the original vmalloc'ed swap_map, to hold the continuation count
 * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
 * again when count is duplicated beyond SWAP_MAP_MAX or SWAP_CONT_MAX, etc.
 *
 * These continuation pages are seldom referenced: the common paths all work
 * on the original swap_map, only referring to a continuation page when the
 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
 *
 * Returns 0 on success (or if the continuation turns out not to be needed),
 * -ENOMEM when the page could not be allocated.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	struct page *head;
	struct page *page;
	struct page *list_page;
	pgoff_t offset;
	unsigned char count;

	/*
	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
	 */
	page = alloc_page(gfp_mask | __GFP_HIGHMEM);

	si = swap_info_get(entry);
	if (!si) {
		/*
		 * An acceptable race has occurred since the failing
		 * __swap_duplicate(): the swap entry has been freed,
		 * perhaps even the whole swap_map cleared for swapoff.
		 */
		goto outer;
	}

	offset = swp_offset(entry);

	ci = lock_cluster(si, offset);

	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;

	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
		/*
		 * The higher the swap count, the more likely it is that tasks
		 * will race to add swap count continuation: we need to avoid
		 * over-provisioning.
		 */
		goto out;
	}

	if (!page) {
		unlock_cluster(ci);
		spin_unlock(&si->lock);
		return -ENOMEM;
	}

	/*
	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
	 * no architecture is using highmem pages for kernel page tables: so it
	 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
	 */
	head = vmalloc_to_page(si->swap_map + offset);
	offset &= ~PAGE_MASK;

	spin_lock(&si->cont_lock);
	/*
	 * Page allocation does not initialize the page's lru field,
	 * but it does always reset its private field.
	 */
	if (!page_private(head)) {
		BUG_ON(count & COUNT_CONTINUED);
		INIT_LIST_HEAD(&head->lru);
		set_page_private(head, SWP_CONTINUED);
		si->flags |= SWP_CONTINUED;
	}

	list_for_each_entry(list_page, &head->lru, lru) {
		unsigned char *map;

		/*
		 * If the previous map said no continuation, but we've found
		 * a continuation page, free our allocation and use this one.
		 */
		if (!(count & COUNT_CONTINUED))
			goto out_unlock_cont;

		map = kmap_atomic(list_page) + offset;
		count = *map;
		kunmap_atomic(map);

		/*
		 * If this continuation count now has some space in it,
		 * free our allocation and use this one.
		 */
		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
			goto out_unlock_cont;
	}

	list_add_tail(&page->lru, &head->lru);
	page = NULL;			/* now it's attached, don't free it */
out_unlock_cont:
	spin_unlock(&si->cont_lock);
out:
	unlock_cluster(ci);
	spin_unlock(&si->lock);
outer:
	if (page)
		__free_page(page);
	return 0;
}
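
/*
 * Layout sketch: si->swap_map is vmalloc'ed, so each PAGE_SIZE chunk
 * of it has its own struct page.  Continuation pages hang off that
 * page's lru list, and byte N of every continuation page extends the
 * count of swap slot N in the corresponding swap_map chunk:
 *
 *	swap_map page:   [c0][c1][c2]...	low "digits"
 *	continuation 1:  [c0][c1][c2]...	next digits
 *	continuation 2:  [c0][c1][c2]...	and so on
 */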

/*
 * swap_count_continued - when the original swap_map count is incremented
 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
 * into, carry if so, or else fail until a new continuation page is allocated;
 * when the original swap_map count is decremented from 0 with continuation,
 * borrow from the continuation and report whether it still holds more.
 *
 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
 * lock.
 */
static bool swap_count_continued(struct swap_info_struct *si,
				 pgoff_t offset, unsigned char count)
{
	struct page *head;
	struct page *page;
	unsigned char *map;
	bool ret;

	head = vmalloc_to_page(si->swap_map + offset);
	if (page_private(head) != SWP_CONTINUED) {
		BUG_ON(count & COUNT_CONTINUED);
		return false;		/* need to add count continuation */
	}

	spin_lock(&si->cont_lock);
	offset &= ~PAGE_MASK;
	page = list_entry(head->lru.next, struct page, lru);
	map = kmap_atomic(page) + offset;

	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
		goto init_map;		/* jump over SWAP_CONT_MAX checks */

	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
		/*
		 * Think of how you add 1 to 999
		 */
		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		if (*map == SWAP_CONT_MAX) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			if (page == head) {
				ret = false;	/* add count continuation */
				goto out;
			}
			map = kmap_atomic(page) + offset;
init_map:		*map = 0;		/* we didn't zero the page */
		}
		*map += 1;
		kunmap_atomic(map);
		page = list_entry(page->lru.prev, struct page, lru);
		while (page != head) {
			map = kmap_atomic(page) + offset;
			*map = COUNT_CONTINUED;
			kunmap_atomic(map);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		ret = true;			/* incremented */

	} else {				/* decrementing */
		/*
		 * Think of how you subtract 1 from 1000
		 */
		BUG_ON(count != COUNT_CONTINUED);
		while (*map == COUNT_CONTINUED) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		BUG_ON(*map == 0);
		*map -= 1;
		if (*map == 0)
			count = 0;
		kunmap_atomic(map);
		page = list_entry(page->lru.prev, struct page, lru);
		while (page != head) {
			map = kmap_atomic(page) + offset;
			*map = SWAP_CONT_MAX | count;
			count = COUNT_CONTINUED;
			kunmap_atomic(map);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		ret = count == COUNT_CONTINUED;
	}
out:
	spin_unlock(&si->cont_lock);
	return ret;
}
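
/*
 * Worked example of the carry logic, treating each continuation page
 * as one more digit (like adding 1 to 999): incrementing a slot whose
 * swap_map byte is at SWAP_MAP_MAX with a first-level continuation
 * byte at SWAP_CONT_MAX zeroes that byte, carries into the next
 * continuation page, and marks the lower digits COUNT_CONTINUED on
 * the way back down; decrementing reverses the borrow.
 */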

/*
 * free_swap_count_continuations - swapoff free all the continuation pages
 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
 */
static void free_swap_count_continuations(struct swap_info_struct *si)
{
	pgoff_t offset;

	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
		struct page *head;
		head = vmalloc_to_page(si->swap_map + offset);
		if (page_private(head)) {
			struct page *page, *next;

			list_for_each_entry_safe(page, next, &head->lru, lru) {
				list_del(&page->lru);
				__free_page(page);
			}
		}
	}
}

#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
				  gfp_t gfp_mask)
{
	struct swap_info_struct *si, *next;
	if (!(gfp_mask & __GFP_IO) || !memcg)
		return;

	if (!blk_cgroup_congested())
		return;

	/*
	 * We've already scheduled a throttle, avoid taking the global swap
	 * lock.
	 */
	if (current->throttle_queue)
		return;

	spin_lock(&swap_avail_lock);
	plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
				  avail_lists[node]) {
		if (si->bdev) {
			blkcg_schedule_throttle(bdev_get_queue(si->bdev),
						true);
			break;
		}
	}
	spin_unlock(&swap_avail_lock);
}
#endif

static int __init swapfile_init(void)
{
	int nid;

	swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
					 GFP_KERNEL);
	if (!swap_avail_heads) {
		pr_emerg("Not enough memory for swap heads, swap is disabled\n");
		return -ENOMEM;
	}

	for_each_node(nid)
		plist_head_init(&swap_avail_heads[nid]);

	return 0;
}
subsys_initcall(swapfile_init);