1
2
3
4
5
6
7
8#include <linux/mm.h>
9#include <linux/hugetlb.h>
10#include <linux/mman.h>
11#include <linux/slab.h>
12#include <linux/kernel_stat.h>
13#include <linux/swap.h>
14#include <linux/vmalloc.h>
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/shmem_fs.h>
18#include <linux/blkdev.h>
19#include <linux/random.h>
20#include <linux/writeback.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/init.h>
24#include <linux/ksm.h>
25#include <linux/rmap.h>
26#include <linux/security.h>
27#include <linux/backing-dev.h>
28#include <linux/mutex.h>
29#include <linux/capability.h>
30#include <linux/syscalls.h>
31#include <linux/memcontrol.h>
32#include <linux/poll.h>
33#include <linux/oom.h>
34#include <linux/frontswap.h>
35#include <linux/swapfile.h>
36#include <linux/export.h>
37
38#include <asm/pgtable.h>
39#include <asm/tlbflush.h>
40#include <linux/swapops.h>
41#include <linux/swap_cgroup.h>
42
43static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
44 unsigned char);
45static void free_swap_count_continuations(struct swap_info_struct *);
46static sector_t map_swap_entry(swp_entry_t, struct block_device**);
47
48DEFINE_SPINLOCK(swap_lock);
49static unsigned int nr_swapfiles;
50atomic_long_t nr_swap_pages;
51
52long total_swap_pages;
53static int least_priority;
54
55static const char Bad_file[] = "Bad swap file entry ";
56static const char Unused_file[] = "Unused swap file entry ";
57static const char Bad_offset[] = "Bad swap offset entry ";
58static const char Unused_offset[] = "Unused swap offset entry ";
59
60
61
62
63
64PLIST_HEAD(swap_active_head);
65
66
67
68
69
70
71
72
73
74
75
76
77
78static PLIST_HEAD(swap_avail_head);
79static DEFINE_SPINLOCK(swap_avail_lock);
80
81struct swap_info_struct *swap_info[MAX_SWAPFILES];
82
83static DEFINE_MUTEX(swapon_mutex);
84
85static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
86
87static atomic_t proc_poll_event = ATOMIC_INIT(0);
88
89static inline unsigned char swap_count(unsigned char ent)
90{
91 return ent & ~SWAP_HAS_CACHE;
92}
93
94
95static int
96__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
97{
98 swp_entry_t entry = swp_entry(si->type, offset);
99 struct page *page;
100 int ret = 0;
101
102 page = find_get_page(swap_address_space(entry), entry.val);
103 if (!page)
104 return 0;
105
106
107
108
109
110
111
112 if (trylock_page(page)) {
113 ret = try_to_free_swap(page);
114 unlock_page(page);
115 }
116 page_cache_release(page);
117 return ret;
118}
119
120
121
122
123
124static int discard_swap(struct swap_info_struct *si)
125{
126 struct swap_extent *se;
127 sector_t start_block;
128 sector_t nr_blocks;
129 int err = 0;
130
131
132 se = &si->first_swap_extent;
133 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
134 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
135 if (nr_blocks) {
136 err = blkdev_issue_discard(si->bdev, start_block,
137 nr_blocks, GFP_KERNEL, 0);
138 if (err)
139 return err;
140 cond_resched();
141 }
142
143 list_for_each_entry(se, &si->first_swap_extent.list, list) {
144 start_block = se->start_block << (PAGE_SHIFT - 9);
145 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
146
147 err = blkdev_issue_discard(si->bdev, start_block,
148 nr_blocks, GFP_KERNEL, 0);
149 if (err)
150 break;
151
152 cond_resched();
153 }
154 return err;
155}
156
157
158
159
160
161static void discard_swap_cluster(struct swap_info_struct *si,
162 pgoff_t start_page, pgoff_t nr_pages)
163{
164 struct swap_extent *se = si->curr_swap_extent;
165 int found_extent = 0;
166
167 while (nr_pages) {
168 struct list_head *lh;
169
170 if (se->start_page <= start_page &&
171 start_page < se->start_page + se->nr_pages) {
172 pgoff_t offset = start_page - se->start_page;
173 sector_t start_block = se->start_block + offset;
174 sector_t nr_blocks = se->nr_pages - offset;
175
176 if (nr_blocks > nr_pages)
177 nr_blocks = nr_pages;
178 start_page += nr_blocks;
179 nr_pages -= nr_blocks;
180
181 if (!found_extent++)
182 si->curr_swap_extent = se;
183
184 start_block <<= PAGE_SHIFT - 9;
185 nr_blocks <<= PAGE_SHIFT - 9;
186 if (blkdev_issue_discard(si->bdev, start_block,
187 nr_blocks, GFP_NOIO, 0))
188 break;
189 }
190
191 lh = se->list.next;
192 se = list_entry(lh, struct swap_extent, list);
193 }
194}
195
/* Number of swap slots per cluster (cluster index = slot / SWAPFILE_CLUSTER). */
#define SWAPFILE_CLUSTER	256
/* Iterations an unlocked scan loop runs before it calls cond_resched(). */
#define LATENCY_LIMIT		256
198
199static inline void cluster_set_flag(struct swap_cluster_info *info,
200 unsigned int flag)
201{
202 info->flags = flag;
203}
204
205static inline unsigned int cluster_count(struct swap_cluster_info *info)
206{
207 return info->data;
208}
209
210static inline void cluster_set_count(struct swap_cluster_info *info,
211 unsigned int c)
212{
213 info->data = c;
214}
215
/* Set both fields of @info: flags to @f first, then the usage count to @c. */
static inline void cluster_set_count_flag(struct swap_cluster_info *info,
					  unsigned int c, unsigned int f)
{
	cluster_set_flag(info, f);
	cluster_set_count(info, c);
}
222
223static inline unsigned int cluster_next(struct swap_cluster_info *info)
224{
225 return info->data;
226}
227
228static inline void cluster_set_next(struct swap_cluster_info *info,
229 unsigned int n)
230{
231 info->data = n;
232}
233
/* Set both fields of @info: flags to @f first, then the next index to @n. */
static inline void cluster_set_next_flag(struct swap_cluster_info *info,
					 unsigned int n, unsigned int f)
{
	cluster_set_flag(info, f);
	cluster_set_next(info, n);
}
240
241static inline bool cluster_is_free(struct swap_cluster_info *info)
242{
243 return info->flags & CLUSTER_FLAG_FREE;
244}
245
246static inline bool cluster_is_null(struct swap_cluster_info *info)
247{
248 return info->flags & CLUSTER_FLAG_NEXT_NULL;
249}
250
251static inline void cluster_set_null(struct swap_cluster_info *info)
252{
253 info->flags = CLUSTER_FLAG_NEXT_NULL;
254 info->data = 0;
255}
256
257
258static void swap_cluster_schedule_discard(struct swap_info_struct *si,
259 unsigned int idx)
260{
261
262
263
264
265
266
267 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
268 SWAP_MAP_BAD, SWAPFILE_CLUSTER);
269
270 if (cluster_is_null(&si->discard_cluster_head)) {
271 cluster_set_next_flag(&si->discard_cluster_head,
272 idx, 0);
273 cluster_set_next_flag(&si->discard_cluster_tail,
274 idx, 0);
275 } else {
276 unsigned int tail = cluster_next(&si->discard_cluster_tail);
277 cluster_set_next(&si->cluster_info[tail], idx);
278 cluster_set_next_flag(&si->discard_cluster_tail,
279 idx, 0);
280 }
281
282 schedule_work(&si->discard_work);
283}
284
285
286
287
288
289static void swap_do_scheduled_discard(struct swap_info_struct *si)
290{
291 struct swap_cluster_info *info;
292 unsigned int idx;
293
294 info = si->cluster_info;
295
296 while (!cluster_is_null(&si->discard_cluster_head)) {
297 idx = cluster_next(&si->discard_cluster_head);
298
299 cluster_set_next_flag(&si->discard_cluster_head,
300 cluster_next(&info[idx]), 0);
301 if (cluster_next(&si->discard_cluster_tail) == idx) {
302 cluster_set_null(&si->discard_cluster_head);
303 cluster_set_null(&si->discard_cluster_tail);
304 }
305 spin_unlock(&si->lock);
306
307 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
308 SWAPFILE_CLUSTER);
309
310 spin_lock(&si->lock);
311 cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
312 if (cluster_is_null(&si->free_cluster_head)) {
313 cluster_set_next_flag(&si->free_cluster_head,
314 idx, 0);
315 cluster_set_next_flag(&si->free_cluster_tail,
316 idx, 0);
317 } else {
318 unsigned int tail;
319
320 tail = cluster_next(&si->free_cluster_tail);
321 cluster_set_next(&info[tail], idx);
322 cluster_set_next_flag(&si->free_cluster_tail,
323 idx, 0);
324 }
325 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
326 0, SWAPFILE_CLUSTER);
327 }
328}
329
330static void swap_discard_work(struct work_struct *work)
331{
332 struct swap_info_struct *si;
333
334 si = container_of(work, struct swap_info_struct, discard_work);
335
336 spin_lock(&si->lock);
337 swap_do_scheduled_discard(si);
338 spin_unlock(&si->lock);
339}
340
341
342
343
344
345static void inc_cluster_info_page(struct swap_info_struct *p,
346 struct swap_cluster_info *cluster_info, unsigned long page_nr)
347{
348 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
349
350 if (!cluster_info)
351 return;
352 if (cluster_is_free(&cluster_info[idx])) {
353 VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx);
354 cluster_set_next_flag(&p->free_cluster_head,
355 cluster_next(&cluster_info[idx]), 0);
356 if (cluster_next(&p->free_cluster_tail) == idx) {
357 cluster_set_null(&p->free_cluster_tail);
358 cluster_set_null(&p->free_cluster_head);
359 }
360 cluster_set_count_flag(&cluster_info[idx], 0, 0);
361 }
362
363 VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
364 cluster_set_count(&cluster_info[idx],
365 cluster_count(&cluster_info[idx]) + 1);
366}
367
368
369
370
371
372
373static void dec_cluster_info_page(struct swap_info_struct *p,
374 struct swap_cluster_info *cluster_info, unsigned long page_nr)
375{
376 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
377
378 if (!cluster_info)
379 return;
380
381 VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
382 cluster_set_count(&cluster_info[idx],
383 cluster_count(&cluster_info[idx]) - 1);
384
385 if (cluster_count(&cluster_info[idx]) == 0) {
386
387
388
389
390
391 if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
392 (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
393 swap_cluster_schedule_discard(p, idx);
394 return;
395 }
396
397 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
398 if (cluster_is_null(&p->free_cluster_head)) {
399 cluster_set_next_flag(&p->free_cluster_head, idx, 0);
400 cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
401 } else {
402 unsigned int tail = cluster_next(&p->free_cluster_tail);
403 cluster_set_next(&cluster_info[tail], idx);
404 cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
405 }
406 }
407}
408
409
410
411
412
413static bool
414scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
415 unsigned long offset)
416{
417 struct percpu_cluster *percpu_cluster;
418 bool conflict;
419
420 offset /= SWAPFILE_CLUSTER;
421 conflict = !cluster_is_null(&si->free_cluster_head) &&
422 offset != cluster_next(&si->free_cluster_head) &&
423 cluster_is_free(&si->cluster_info[offset]);
424
425 if (!conflict)
426 return false;
427
428 percpu_cluster = this_cpu_ptr(si->percpu_cluster);
429 cluster_set_null(&percpu_cluster->index);
430 return true;
431}
432
433
434
435
436
437static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
438 unsigned long *offset, unsigned long *scan_base)
439{
440 struct percpu_cluster *cluster;
441 bool found_free;
442 unsigned long tmp;
443
444new_cluster:
445 cluster = this_cpu_ptr(si->percpu_cluster);
446 if (cluster_is_null(&cluster->index)) {
447 if (!cluster_is_null(&si->free_cluster_head)) {
448 cluster->index = si->free_cluster_head;
449 cluster->next = cluster_next(&cluster->index) *
450 SWAPFILE_CLUSTER;
451 } else if (!cluster_is_null(&si->discard_cluster_head)) {
452
453
454
455
456 swap_do_scheduled_discard(si);
457 *scan_base = *offset = si->cluster_next;
458 goto new_cluster;
459 } else
460 return;
461 }
462
463 found_free = false;
464
465
466
467
468
469 tmp = cluster->next;
470 while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
471 SWAPFILE_CLUSTER) {
472 if (!si->swap_map[tmp]) {
473 found_free = true;
474 break;
475 }
476 tmp++;
477 }
478 if (!found_free) {
479 cluster_set_null(&cluster->index);
480 goto new_cluster;
481 }
482 cluster->next = tmp + 1;
483 *offset = tmp;
484 *scan_base = tmp;
485}
486
487static unsigned long scan_swap_map(struct swap_info_struct *si,
488 unsigned char usage)
489{
490 unsigned long offset;
491 unsigned long scan_base;
492 unsigned long last_in_cluster = 0;
493 int latency_ration = LATENCY_LIMIT;
494
495
496
497
498
499
500
501
502
503
504
505
506 si->flags += SWP_SCANNING;
507 scan_base = offset = si->cluster_next;
508
509
510 if (si->cluster_info) {
511 scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
512 goto checks;
513 }
514
515 if (unlikely(!si->cluster_nr--)) {
516 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
517 si->cluster_nr = SWAPFILE_CLUSTER - 1;
518 goto checks;
519 }
520
521 spin_unlock(&si->lock);
522
523
524
525
526
527
528
529 scan_base = offset = si->lowest_bit;
530 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
531
532
533 for (; last_in_cluster <= si->highest_bit; offset++) {
534 if (si->swap_map[offset])
535 last_in_cluster = offset + SWAPFILE_CLUSTER;
536 else if (offset == last_in_cluster) {
537 spin_lock(&si->lock);
538 offset -= SWAPFILE_CLUSTER - 1;
539 si->cluster_next = offset;
540 si->cluster_nr = SWAPFILE_CLUSTER - 1;
541 goto checks;
542 }
543 if (unlikely(--latency_ration < 0)) {
544 cond_resched();
545 latency_ration = LATENCY_LIMIT;
546 }
547 }
548
549 offset = scan_base;
550 spin_lock(&si->lock);
551 si->cluster_nr = SWAPFILE_CLUSTER - 1;
552 }
553
554checks:
555 if (si->cluster_info) {
556 while (scan_swap_map_ssd_cluster_conflict(si, offset))
557 scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
558 }
559 if (!(si->flags & SWP_WRITEOK))
560 goto no_page;
561 if (!si->highest_bit)
562 goto no_page;
563 if (offset > si->highest_bit)
564 scan_base = offset = si->lowest_bit;
565
566
567 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
568 int swap_was_freed;
569 spin_unlock(&si->lock);
570 swap_was_freed = __try_to_reclaim_swap(si, offset);
571 spin_lock(&si->lock);
572
573 if (swap_was_freed)
574 goto checks;
575 goto scan;
576 }
577
578 if (si->swap_map[offset])
579 goto scan;
580
581 if (offset == si->lowest_bit)
582 si->lowest_bit++;
583 if (offset == si->highest_bit)
584 si->highest_bit--;
585 si->inuse_pages++;
586 if (si->inuse_pages == si->pages) {
587 si->lowest_bit = si->max;
588 si->highest_bit = 0;
589 spin_lock(&swap_avail_lock);
590 plist_del(&si->avail_list, &swap_avail_head);
591 spin_unlock(&swap_avail_lock);
592 }
593 si->swap_map[offset] = usage;
594 inc_cluster_info_page(si, si->cluster_info, offset);
595 si->cluster_next = offset + 1;
596 si->flags -= SWP_SCANNING;
597
598 return offset;
599
600scan:
601 spin_unlock(&si->lock);
602 while (++offset <= si->highest_bit) {
603 if (!si->swap_map[offset]) {
604 spin_lock(&si->lock);
605 goto checks;
606 }
607 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
608 spin_lock(&si->lock);
609 goto checks;
610 }
611 if (unlikely(--latency_ration < 0)) {
612 cond_resched();
613 latency_ration = LATENCY_LIMIT;
614 }
615 }
616 offset = si->lowest_bit;
617 while (offset < scan_base) {
618 if (!si->swap_map[offset]) {
619 spin_lock(&si->lock);
620 goto checks;
621 }
622 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
623 spin_lock(&si->lock);
624 goto checks;
625 }
626 if (unlikely(--latency_ration < 0)) {
627 cond_resched();
628 latency_ration = LATENCY_LIMIT;
629 }
630 offset++;
631 }
632 spin_lock(&si->lock);
633
634no_page:
635 si->flags -= SWP_SCANNING;
636 return 0;
637}
638
639swp_entry_t get_swap_page(void)
640{
641 struct swap_info_struct *si, *next;
642 pgoff_t offset;
643
644 if (atomic_long_read(&nr_swap_pages) <= 0)
645 goto noswap;
646 atomic_long_dec(&nr_swap_pages);
647
648 spin_lock(&swap_avail_lock);
649
650start_over:
651 plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
652
653 plist_requeue(&si->avail_list, &swap_avail_head);
654 spin_unlock(&swap_avail_lock);
655 spin_lock(&si->lock);
656 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
657 spin_lock(&swap_avail_lock);
658 if (plist_node_empty(&si->avail_list)) {
659 spin_unlock(&si->lock);
660 goto nextsi;
661 }
662 WARN(!si->highest_bit,
663 "swap_info %d in list but !highest_bit\n",
664 si->type);
665 WARN(!(si->flags & SWP_WRITEOK),
666 "swap_info %d in list but !SWP_WRITEOK\n",
667 si->type);
668 plist_del(&si->avail_list, &swap_avail_head);
669 spin_unlock(&si->lock);
670 goto nextsi;
671 }
672
673
674 offset = scan_swap_map(si, SWAP_HAS_CACHE);
675 spin_unlock(&si->lock);
676 if (offset)
677 return swp_entry(si->type, offset);
678 pr_debug("scan_swap_map of si %d failed to find offset\n",
679 si->type);
680 spin_lock(&swap_avail_lock);
681nextsi:
682
683
684
685
686
687
688
689
690
691
692 if (plist_node_empty(&next->avail_list))
693 goto start_over;
694 }
695
696 spin_unlock(&swap_avail_lock);
697
698 atomic_long_inc(&nr_swap_pages);
699noswap:
700 return (swp_entry_t) {0};
701}
702
703
704swp_entry_t get_swap_page_of_type(int type)
705{
706 struct swap_info_struct *si;
707 pgoff_t offset;
708
709 si = swap_info[type];
710 spin_lock(&si->lock);
711 if (si && (si->flags & SWP_WRITEOK)) {
712 atomic_long_dec(&nr_swap_pages);
713
714 offset = scan_swap_map(si, 1);
715 if (offset) {
716 spin_unlock(&si->lock);
717 return swp_entry(type, offset);
718 }
719 atomic_long_inc(&nr_swap_pages);
720 }
721 spin_unlock(&si->lock);
722 return (swp_entry_t) {0};
723}
724
725static struct swap_info_struct *swap_info_get(swp_entry_t entry)
726{
727 struct swap_info_struct *p;
728 unsigned long offset, type;
729
730 if (!entry.val)
731 goto out;
732 type = swp_type(entry);
733 if (type >= nr_swapfiles)
734 goto bad_nofile;
735 p = swap_info[type];
736 if (!(p->flags & SWP_USED))
737 goto bad_device;
738 offset = swp_offset(entry);
739 if (offset >= p->max)
740 goto bad_offset;
741 if (!p->swap_map[offset])
742 goto bad_free;
743 spin_lock(&p->lock);
744 return p;
745
746bad_free:
747 pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val);
748 goto out;
749bad_offset:
750 pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val);
751 goto out;
752bad_device:
753 pr_err("swap_free: %s%08lx\n", Unused_file, entry.val);
754 goto out;
755bad_nofile:
756 pr_err("swap_free: %s%08lx\n", Bad_file, entry.val);
757out:
758 return NULL;
759}
760
761static unsigned char swap_entry_free(struct swap_info_struct *p,
762 swp_entry_t entry, unsigned char usage)
763{
764 unsigned long offset = swp_offset(entry);
765 unsigned char count;
766 unsigned char has_cache;
767
768 count = p->swap_map[offset];
769 has_cache = count & SWAP_HAS_CACHE;
770 count &= ~SWAP_HAS_CACHE;
771
772 if (usage == SWAP_HAS_CACHE) {
773 VM_BUG_ON(!has_cache);
774 has_cache = 0;
775 } else if (count == SWAP_MAP_SHMEM) {
776
777
778
779
780 count = 0;
781 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
782 if (count == COUNT_CONTINUED) {
783 if (swap_count_continued(p, offset, count))
784 count = SWAP_MAP_MAX | COUNT_CONTINUED;
785 else
786 count = SWAP_MAP_MAX;
787 } else
788 count--;
789 }
790
791 if (!count)
792 mem_cgroup_uncharge_swap(entry);
793
794 usage = count | has_cache;
795 p->swap_map[offset] = usage;
796
797
798 if (!usage) {
799 dec_cluster_info_page(p, p->cluster_info, offset);
800 if (offset < p->lowest_bit)
801 p->lowest_bit = offset;
802 if (offset > p->highest_bit) {
803 bool was_full = !p->highest_bit;
804 p->highest_bit = offset;
805 if (was_full && (p->flags & SWP_WRITEOK)) {
806 spin_lock(&swap_avail_lock);
807 WARN_ON(!plist_node_empty(&p->avail_list));
808 if (plist_node_empty(&p->avail_list))
809 plist_add(&p->avail_list,
810 &swap_avail_head);
811 spin_unlock(&swap_avail_lock);
812 }
813 }
814 atomic_long_inc(&nr_swap_pages);
815 p->inuse_pages--;
816 frontswap_invalidate_page(p->type, offset);
817 if (p->flags & SWP_BLKDEV) {
818 struct gendisk *disk = p->bdev->bd_disk;
819 if (disk->fops->swap_slot_free_notify)
820 disk->fops->swap_slot_free_notify(p->bdev,
821 offset);
822 }
823 }
824
825 return usage;
826}
827
828
829
830
831
832void swap_free(swp_entry_t entry)
833{
834 struct swap_info_struct *p;
835
836 p = swap_info_get(entry);
837 if (p) {
838 swap_entry_free(p, entry, 1);
839 spin_unlock(&p->lock);
840 }
841}
842
843
844
845
846void swapcache_free(swp_entry_t entry)
847{
848 struct swap_info_struct *p;
849
850 p = swap_info_get(entry);
851 if (p) {
852 swap_entry_free(p, entry, SWAP_HAS_CACHE);
853 spin_unlock(&p->lock);
854 }
855}
856
857
858
859
860
861
862int page_swapcount(struct page *page)
863{
864 int count = 0;
865 struct swap_info_struct *p;
866 swp_entry_t entry;
867
868 entry.val = page_private(page);
869 p = swap_info_get(entry);
870 if (p) {
871 count = swap_count(p->swap_map[swp_offset(entry)]);
872 spin_unlock(&p->lock);
873 }
874 return count;
875}
876
877
878
879
880
881
882
/*
 * Decide whether the locked @page has at most one user, counting both
 * page mappings and - for swap cache pages - swap references.
 *
 * When exactly one user remains and the page is not under writeback, the
 * page is also removed from the swap cache and marked dirty. KSM pages
 * always yield 0. Returns non-zero when the total count is <= 1.
 */
int reuse_swap_page(struct page *page)
{
	int users;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	if (unlikely(PageKsm(page)))
		return 0;

	users = page_mapcount(page);
	if (users > 1 || !PageSwapCache(page))
		return users <= 1;

	users += page_swapcount(page);
	if (users == 1 && !PageWriteback(page)) {
		delete_from_swap_cache(page);
		SetPageDirty(page);
	}
	return users <= 1;
}
900
901
902
903
904
/*
 * Remove the locked @page from the swap cache if nothing else refers to
 * its swap slot.
 *
 * Returns 0 and leaves the page alone when it is not in the swap cache,
 * is under writeback, still has swap references, or when
 * pm_suspended_storage() reports true. Otherwise the page is deleted from
 * the swap cache, marked dirty, and 1 is returned.
 */
int try_to_free_swap(struct page *page)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (!PageSwapCache(page) || PageWriteback(page) ||
	    page_swapcount(page))
		return 0;

	/*
	 * NOTE(review): presumably guards against dirtying pages while
	 * storage is suspended (hibernation) - confirm against
	 * pm_suspended_storage().
	 */
	if (pm_suspended_storage())
		return 0;

	delete_from_swap_cache(page);
	SetPageDirty(page);
	return 1;
}
938
939
940
941
942
943int free_swap_and_cache(swp_entry_t entry)
944{
945 struct swap_info_struct *p;
946 struct page *page = NULL;
947
948 if (non_swap_entry(entry))
949 return 1;
950
951 p = swap_info_get(entry);
952 if (p) {
953 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
954 page = find_get_page(swap_address_space(entry),
955 entry.val);
956 if (page && !trylock_page(page)) {
957 page_cache_release(page);
958 page = NULL;
959 }
960 }
961 spin_unlock(&p->lock);
962 }
963 if (page) {
964
965
966
967
968 if (PageSwapCache(page) && !PageWriteback(page) &&
969 (!page_mapped(page) || vm_swap_full())) {
970 delete_from_swap_cache(page);
971 SetPageDirty(page);
972 }
973 unlock_page(page);
974 page_cache_release(page);
975 }
976 return p != NULL;
977}
978
979#ifdef CONFIG_HIBERNATION
980
981
982
983
984
985
986
987
988int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
989{
990 struct block_device *bdev = NULL;
991 int type;
992
993 if (device)
994 bdev = bdget(device);
995
996 spin_lock(&swap_lock);
997 for (type = 0; type < nr_swapfiles; type++) {
998 struct swap_info_struct *sis = swap_info[type];
999
1000 if (!(sis->flags & SWP_WRITEOK))
1001 continue;
1002
1003 if (!bdev) {
1004 if (bdev_p)
1005 *bdev_p = bdgrab(sis->bdev);
1006
1007 spin_unlock(&swap_lock);
1008 return type;
1009 }
1010 if (bdev == sis->bdev) {
1011 struct swap_extent *se = &sis->first_swap_extent;
1012
1013 if (se->start_block == offset) {
1014 if (bdev_p)
1015 *bdev_p = bdgrab(sis->bdev);
1016
1017 spin_unlock(&swap_lock);
1018 bdput(bdev);
1019 return type;
1020 }
1021 }
1022 }
1023 spin_unlock(&swap_lock);
1024 if (bdev)
1025 bdput(bdev);
1026
1027 return -ENODEV;
1028}
1029
1030
1031
1032
1033
1034sector_t swapdev_block(int type, pgoff_t offset)
1035{
1036 struct block_device *bdev;
1037
1038 if ((unsigned int)type >= nr_swapfiles)
1039 return 0;
1040 if (!(swap_info[type]->flags & SWP_WRITEOK))
1041 return 0;
1042 return map_swap_entry(swp_entry(type, offset), &bdev);
1043}
1044
1045
1046
1047
1048
1049
1050
1051unsigned int count_swap_pages(int type, int free)
1052{
1053 unsigned int n = 0;
1054
1055 spin_lock(&swap_lock);
1056 if ((unsigned int)type < nr_swapfiles) {
1057 struct swap_info_struct *sis = swap_info[type];
1058
1059 spin_lock(&sis->lock);
1060 if (sis->flags & SWP_WRITEOK) {
1061 n = sis->pages;
1062 if (free)
1063 n -= sis->inuse_pages;
1064 }
1065 spin_unlock(&sis->lock);
1066 }
1067 spin_unlock(&swap_lock);
1068 return n;
1069}
1070#endif
1071
1072static inline int maybe_same_pte(pte_t pte, pte_t swp_pte)
1073{
1074#ifdef CONFIG_MEM_SOFT_DIRTY
1075
1076
1077
1078
1079
1080 pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte);
1081 return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty);
1082#else
1083 return pte_same(pte, swp_pte);
1084#endif
1085}
1086
1087
1088
1089
1090
1091
1092static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
1093 unsigned long addr, swp_entry_t entry, struct page *page)
1094{
1095 struct page *swapcache;
1096 struct mem_cgroup *memcg;
1097 spinlock_t *ptl;
1098 pte_t *pte;
1099 int ret = 1;
1100
1101 swapcache = page;
1102 page = ksm_might_need_to_copy(page, vma, addr);
1103 if (unlikely(!page))
1104 return -ENOMEM;
1105
1106 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) {
1107 ret = -ENOMEM;
1108 goto out_nolock;
1109 }
1110
1111 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1112 if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
1113 mem_cgroup_cancel_charge(page, memcg);
1114 ret = 0;
1115 goto out;
1116 }
1117
1118 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
1119 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
1120 get_page(page);
1121 set_pte_at(vma->vm_mm, addr, pte,
1122 pte_mkold(mk_pte(page, vma->vm_page_prot)));
1123 if (page == swapcache) {
1124 page_add_anon_rmap(page, vma, addr);
1125 mem_cgroup_commit_charge(page, memcg, true);
1126 } else {
1127 page_add_new_anon_rmap(page, vma, addr);
1128 mem_cgroup_commit_charge(page, memcg, false);
1129 lru_cache_add_active_or_unevictable(page, vma);
1130 }
1131 swap_free(entry);
1132
1133
1134
1135
1136 activate_page(page);
1137out:
1138 pte_unmap_unlock(pte, ptl);
1139out_nolock:
1140 if (page != swapcache) {
1141 unlock_page(page);
1142 put_page(page);
1143 }
1144 return ret;
1145}
1146
1147static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1148 unsigned long addr, unsigned long end,
1149 swp_entry_t entry, struct page *page)
1150{
1151 pte_t swp_pte = swp_entry_to_pte(entry);
1152 pte_t *pte;
1153 int ret = 0;
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164 pte = pte_offset_map(pmd, addr);
1165 do {
1166
1167
1168
1169
1170 if (unlikely(maybe_same_pte(*pte, swp_pte))) {
1171 pte_unmap(pte);
1172 ret = unuse_pte(vma, pmd, addr, entry, page);
1173 if (ret)
1174 goto out;
1175 pte = pte_offset_map(pmd, addr);
1176 }
1177 } while (pte++, addr += PAGE_SIZE, addr != end);
1178 pte_unmap(pte - 1);
1179out:
1180 return ret;
1181}
1182
1183static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
1184 unsigned long addr, unsigned long end,
1185 swp_entry_t entry, struct page *page)
1186{
1187 pmd_t *pmd;
1188 unsigned long next;
1189 int ret;
1190
1191 pmd = pmd_offset(pud, addr);
1192 do {
1193 next = pmd_addr_end(addr, end);
1194 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1195 continue;
1196 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
1197 if (ret)
1198 return ret;
1199 } while (pmd++, addr = next, addr != end);
1200 return 0;
1201}
1202
1203static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
1204 unsigned long addr, unsigned long end,
1205 swp_entry_t entry, struct page *page)
1206{
1207 pud_t *pud;
1208 unsigned long next;
1209 int ret;
1210
1211 pud = pud_offset(pgd, addr);
1212 do {
1213 next = pud_addr_end(addr, end);
1214 if (pud_none_or_clear_bad(pud))
1215 continue;
1216 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
1217 if (ret)
1218 return ret;
1219 } while (pud++, addr = next, addr != end);
1220 return 0;
1221}
1222
1223static int unuse_vma(struct vm_area_struct *vma,
1224 swp_entry_t entry, struct page *page)
1225{
1226 pgd_t *pgd;
1227 unsigned long addr, end, next;
1228 int ret;
1229
1230 if (page_anon_vma(page)) {
1231 addr = page_address_in_vma(page, vma);
1232 if (addr == -EFAULT)
1233 return 0;
1234 else
1235 end = addr + PAGE_SIZE;
1236 } else {
1237 addr = vma->vm_start;
1238 end = vma->vm_end;
1239 }
1240
1241 pgd = pgd_offset(vma->vm_mm, addr);
1242 do {
1243 next = pgd_addr_end(addr, end);
1244 if (pgd_none_or_clear_bad(pgd))
1245 continue;
1246 ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
1247 if (ret)
1248 return ret;
1249 } while (pgd++, addr = next, addr != end);
1250 return 0;
1251}
1252
1253static int unuse_mm(struct mm_struct *mm,
1254 swp_entry_t entry, struct page *page)
1255{
1256 struct vm_area_struct *vma;
1257 int ret = 0;
1258
1259 if (!down_read_trylock(&mm->mmap_sem)) {
1260
1261
1262
1263
1264 activate_page(page);
1265 unlock_page(page);
1266 down_read(&mm->mmap_sem);
1267 lock_page(page);
1268 }
1269 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1270 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1271 break;
1272 }
1273 up_read(&mm->mmap_sem);
1274 return (ret < 0)? ret: 0;
1275}
1276
1277
1278
1279
1280
1281
1282static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1283 unsigned int prev, bool frontswap)
1284{
1285 unsigned int max = si->max;
1286 unsigned int i = prev;
1287 unsigned char count;
1288
1289
1290
1291
1292
1293
1294
1295 for (;;) {
1296 if (++i >= max) {
1297 if (!prev) {
1298 i = 0;
1299 break;
1300 }
1301
1302
1303
1304
1305 max = prev + 1;
1306 prev = 0;
1307 i = 1;
1308 }
1309 if (frontswap) {
1310 if (frontswap_test(si, i))
1311 break;
1312 else
1313 continue;
1314 }
1315 count = READ_ONCE(si->swap_map[i]);
1316 if (count && swap_count(count) != SWAP_MAP_BAD)
1317 break;
1318 }
1319 return i;
1320}
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330int try_to_unuse(unsigned int type, bool frontswap,
1331 unsigned long pages_to_unuse)
1332{
1333 struct swap_info_struct *si = swap_info[type];
1334 struct mm_struct *start_mm;
1335 volatile unsigned char *swap_map;
1336
1337
1338
1339
1340 unsigned char swcount;
1341 struct page *page;
1342 swp_entry_t entry;
1343 unsigned int i = 0;
1344 int retval = 0;
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360 start_mm = &init_mm;
1361 atomic_inc(&init_mm.mm_users);
1362
1363
1364
1365
1366
1367
1368 while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
1369 if (signal_pending(current)) {
1370 retval = -EINTR;
1371 break;
1372 }
1373
1374
1375
1376
1377
1378
1379 swap_map = &si->swap_map[i];
1380 entry = swp_entry(type, i);
1381 page = read_swap_cache_async(entry,
1382 GFP_HIGHUSER_MOVABLE, NULL, 0);
1383 if (!page) {
1384
1385
1386
1387
1388
1389
1390 swcount = *swap_map;
1391
1392
1393
1394
1395
1396
1397
1398 if (!swcount || swcount == SWAP_MAP_BAD)
1399 continue;
1400 retval = -ENOMEM;
1401 break;
1402 }
1403
1404
1405
1406
1407 if (atomic_read(&start_mm->mm_users) == 1) {
1408 mmput(start_mm);
1409 start_mm = &init_mm;
1410 atomic_inc(&init_mm.mm_users);
1411 }
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421 wait_on_page_locked(page);
1422 wait_on_page_writeback(page);
1423 lock_page(page);
1424 wait_on_page_writeback(page);
1425
1426
1427
1428
1429 swcount = *swap_map;
1430 if (swap_count(swcount) == SWAP_MAP_SHMEM) {
1431 retval = shmem_unuse(entry, page);
1432
1433 if (retval < 0)
1434 break;
1435 continue;
1436 }
1437 if (swap_count(swcount) && start_mm != &init_mm)
1438 retval = unuse_mm(start_mm, entry, page);
1439
1440 if (swap_count(*swap_map)) {
1441 int set_start_mm = (*swap_map >= swcount);
1442 struct list_head *p = &start_mm->mmlist;
1443 struct mm_struct *new_start_mm = start_mm;
1444 struct mm_struct *prev_mm = start_mm;
1445 struct mm_struct *mm;
1446
1447 atomic_inc(&new_start_mm->mm_users);
1448 atomic_inc(&prev_mm->mm_users);
1449 spin_lock(&mmlist_lock);
1450 while (swap_count(*swap_map) && !retval &&
1451 (p = p->next) != &start_mm->mmlist) {
1452 mm = list_entry(p, struct mm_struct, mmlist);
1453 if (!atomic_inc_not_zero(&mm->mm_users))
1454 continue;
1455 spin_unlock(&mmlist_lock);
1456 mmput(prev_mm);
1457 prev_mm = mm;
1458
1459 cond_resched();
1460
1461 swcount = *swap_map;
1462 if (!swap_count(swcount))
1463 ;
1464 else if (mm == &init_mm)
1465 set_start_mm = 1;
1466 else
1467 retval = unuse_mm(mm, entry, page);
1468
1469 if (set_start_mm && *swap_map < swcount) {
1470 mmput(new_start_mm);
1471 atomic_inc(&mm->mm_users);
1472 new_start_mm = mm;
1473 set_start_mm = 0;
1474 }
1475 spin_lock(&mmlist_lock);
1476 }
1477 spin_unlock(&mmlist_lock);
1478 mmput(prev_mm);
1479 mmput(start_mm);
1480 start_mm = new_start_mm;
1481 }
1482 if (retval) {
1483 unlock_page(page);
1484 page_cache_release(page);
1485 break;
1486 }
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507 if (swap_count(*swap_map) &&
1508 PageDirty(page) && PageSwapCache(page)) {
1509 struct writeback_control wbc = {
1510 .sync_mode = WB_SYNC_NONE,
1511 };
1512
1513 swap_writepage(page, &wbc);
1514 lock_page(page);
1515 wait_on_page_writeback(page);
1516 }
1517
1518
1519
1520
1521
1522
1523
1524
1525 if (PageSwapCache(page) &&
1526 likely(page_private(page) == entry.val))
1527 delete_from_swap_cache(page);
1528
1529
1530
1531
1532
1533
1534 SetPageDirty(page);
1535 unlock_page(page);
1536 page_cache_release(page);
1537
1538
1539
1540
1541
1542 cond_resched();
1543 if (frontswap && pages_to_unuse > 0) {
1544 if (!--pages_to_unuse)
1545 break;
1546 }
1547 }
1548
1549 mmput(start_mm);
1550 return retval;
1551}
1552
1553
1554
1555
1556
1557
1558
1559static void drain_mmlist(void)
1560{
1561 struct list_head *p, *next;
1562 unsigned int type;
1563
1564 for (type = 0; type < nr_swapfiles; type++)
1565 if (swap_info[type]->inuse_pages)
1566 return;
1567 spin_lock(&mmlist_lock);
1568 list_for_each_safe(p, next, &init_mm.mmlist)
1569 list_del_init(p);
1570 spin_unlock(&mmlist_lock);
1571}
1572
1573
1574
1575
1576
1577
1578
1579static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1580{
1581 struct swap_info_struct *sis;
1582 struct swap_extent *start_se;
1583 struct swap_extent *se;
1584 pgoff_t offset;
1585
1586 sis = swap_info[swp_type(entry)];
1587 *bdev = sis->bdev;
1588
1589 offset = swp_offset(entry);
1590 start_se = sis->curr_swap_extent;
1591 se = start_se;
1592
1593 for ( ; ; ) {
1594 struct list_head *lh;
1595
1596 if (se->start_page <= offset &&
1597 offset < (se->start_page + se->nr_pages)) {
1598 return se->start_block + (offset - se->start_page);
1599 }
1600 lh = se->list.next;
1601 se = list_entry(lh, struct swap_extent, list);
1602 sis->curr_swap_extent = se;
1603 BUG_ON(se == start_se);
1604 }
1605}
1606
1607
1608
1609
1610sector_t map_swap_page(struct page *page, struct block_device **bdev)
1611{
1612 swp_entry_t entry;
1613 entry.val = page_private(page);
1614 return map_swap_entry(entry, bdev);
1615}
1616
1617
1618
1619
1620static void destroy_swap_extents(struct swap_info_struct *sis)
1621{
1622 while (!list_empty(&sis->first_swap_extent.list)) {
1623 struct swap_extent *se;
1624
1625 se = list_entry(sis->first_swap_extent.list.next,
1626 struct swap_extent, list);
1627 list_del(&se->list);
1628 kfree(se);
1629 }
1630
1631 if (sis->flags & SWP_FILE) {
1632 struct file *swap_file = sis->swap_file;
1633 struct address_space *mapping = swap_file->f_mapping;
1634
1635 sis->flags &= ~SWP_FILE;
1636 mapping->a_ops->swap_deactivate(swap_file);
1637 }
1638}
1639
1640
1641
1642
1643
1644
1645
1646int
1647add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1648 unsigned long nr_pages, sector_t start_block)
1649{
1650 struct swap_extent *se;
1651 struct swap_extent *new_se;
1652 struct list_head *lh;
1653
1654 if (start_page == 0) {
1655 se = &sis->first_swap_extent;
1656 sis->curr_swap_extent = se;
1657 se->start_page = 0;
1658 se->nr_pages = nr_pages;
1659 se->start_block = start_block;
1660 return 1;
1661 } else {
1662 lh = sis->first_swap_extent.list.prev;
1663 se = list_entry(lh, struct swap_extent, list);
1664 BUG_ON(se->start_page + se->nr_pages != start_page);
1665 if (se->start_block + se->nr_pages == start_block) {
1666
1667 se->nr_pages += nr_pages;
1668 return 0;
1669 }
1670 }
1671
1672
1673
1674
1675 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
1676 if (new_se == NULL)
1677 return -ENOMEM;
1678 new_se->start_page = start_page;
1679 new_se->nr_pages = nr_pages;
1680 new_se->start_block = start_block;
1681
1682 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1683 return 1;
1684}
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1718{
1719 struct file *swap_file = sis->swap_file;
1720 struct address_space *mapping = swap_file->f_mapping;
1721 struct inode *inode = mapping->host;
1722 int ret;
1723
1724 if (S_ISBLK(inode->i_mode)) {
1725 ret = add_swap_extent(sis, 0, sis->max, 0);
1726 *span = sis->pages;
1727 return ret;
1728 }
1729
1730 if (mapping->a_ops->swap_activate) {
1731 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
1732 if (!ret) {
1733 sis->flags |= SWP_FILE;
1734 ret = add_swap_extent(sis, 0, sis->max, 0);
1735 *span = sis->pages;
1736 }
1737 return ret;
1738 }
1739
1740 return generic_swapfile_activate(sis, swap_file, span);
1741}
1742
1743static void _enable_swap_info(struct swap_info_struct *p, int prio,
1744 unsigned char *swap_map,
1745 struct swap_cluster_info *cluster_info)
1746{
1747 if (prio >= 0)
1748 p->prio = prio;
1749 else
1750 p->prio = --least_priority;
1751
1752
1753
1754
1755 p->list.prio = -p->prio;
1756 p->avail_list.prio = -p->prio;
1757 p->swap_map = swap_map;
1758 p->cluster_info = cluster_info;
1759 p->flags |= SWP_WRITEOK;
1760 atomic_long_add(p->pages, &nr_swap_pages);
1761 total_swap_pages += p->pages;
1762
1763 assert_spin_locked(&swap_lock);
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774 plist_add(&p->list, &swap_active_head);
1775 spin_lock(&swap_avail_lock);
1776 plist_add(&p->avail_list, &swap_avail_head);
1777 spin_unlock(&swap_avail_lock);
1778}
1779
1780static void enable_swap_info(struct swap_info_struct *p, int prio,
1781 unsigned char *swap_map,
1782 struct swap_cluster_info *cluster_info,
1783 unsigned long *frontswap_map)
1784{
1785 frontswap_init(p->type, frontswap_map);
1786 spin_lock(&swap_lock);
1787 spin_lock(&p->lock);
1788 _enable_swap_info(p, prio, swap_map, cluster_info);
1789 spin_unlock(&p->lock);
1790 spin_unlock(&swap_lock);
1791}
1792
1793static void reinsert_swap_info(struct swap_info_struct *p)
1794{
1795 spin_lock(&swap_lock);
1796 spin_lock(&p->lock);
1797 _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
1798 spin_unlock(&p->lock);
1799 spin_unlock(&swap_lock);
1800}
1801
1802SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1803{
1804 struct swap_info_struct *p = NULL;
1805 unsigned char *swap_map;
1806 struct swap_cluster_info *cluster_info;
1807 unsigned long *frontswap_map;
1808 struct file *swap_file, *victim;
1809 struct address_space *mapping;
1810 struct inode *inode;
1811 struct filename *pathname;
1812 int err, found = 0;
1813 unsigned int old_block_size;
1814
1815 if (!capable(CAP_SYS_ADMIN))
1816 return -EPERM;
1817
1818 BUG_ON(!current->mm);
1819
1820 pathname = getname(specialfile);
1821 if (IS_ERR(pathname))
1822 return PTR_ERR(pathname);
1823
1824 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
1825 err = PTR_ERR(victim);
1826 if (IS_ERR(victim))
1827 goto out;
1828
1829 mapping = victim->f_mapping;
1830 spin_lock(&swap_lock);
1831 plist_for_each_entry(p, &swap_active_head, list) {
1832 if (p->flags & SWP_WRITEOK) {
1833 if (p->swap_file->f_mapping == mapping) {
1834 found = 1;
1835 break;
1836 }
1837 }
1838 }
1839 if (!found) {
1840 err = -EINVAL;
1841 spin_unlock(&swap_lock);
1842 goto out_dput;
1843 }
1844 if (!security_vm_enough_memory_mm(current->mm, p->pages))
1845 vm_unacct_memory(p->pages);
1846 else {
1847 err = -ENOMEM;
1848 spin_unlock(&swap_lock);
1849 goto out_dput;
1850 }
1851 spin_lock(&swap_avail_lock);
1852 plist_del(&p->avail_list, &swap_avail_head);
1853 spin_unlock(&swap_avail_lock);
1854 spin_lock(&p->lock);
1855 if (p->prio < 0) {
1856 struct swap_info_struct *si = p;
1857
1858 plist_for_each_entry_continue(si, &swap_active_head, list) {
1859 si->prio++;
1860 si->list.prio--;
1861 si->avail_list.prio--;
1862 }
1863 least_priority++;
1864 }
1865 plist_del(&p->list, &swap_active_head);
1866 atomic_long_sub(p->pages, &nr_swap_pages);
1867 total_swap_pages -= p->pages;
1868 p->flags &= ~SWP_WRITEOK;
1869 spin_unlock(&p->lock);
1870 spin_unlock(&swap_lock);
1871
1872 set_current_oom_origin();
1873 err = try_to_unuse(p->type, false, 0);
1874 clear_current_oom_origin();
1875
1876 if (err) {
1877
1878 reinsert_swap_info(p);
1879 goto out_dput;
1880 }
1881
1882 flush_work(&p->discard_work);
1883
1884 destroy_swap_extents(p);
1885 if (p->flags & SWP_CONTINUED)
1886 free_swap_count_continuations(p);
1887
1888 mutex_lock(&swapon_mutex);
1889 spin_lock(&swap_lock);
1890 spin_lock(&p->lock);
1891 drain_mmlist();
1892
1893
1894 p->highest_bit = 0;
1895 while (p->flags >= SWP_SCANNING) {
1896 spin_unlock(&p->lock);
1897 spin_unlock(&swap_lock);
1898 schedule_timeout_uninterruptible(1);
1899 spin_lock(&swap_lock);
1900 spin_lock(&p->lock);
1901 }
1902
1903 swap_file = p->swap_file;
1904 old_block_size = p->old_block_size;
1905 p->swap_file = NULL;
1906 p->max = 0;
1907 swap_map = p->swap_map;
1908 p->swap_map = NULL;
1909 cluster_info = p->cluster_info;
1910 p->cluster_info = NULL;
1911 frontswap_map = frontswap_map_get(p);
1912 spin_unlock(&p->lock);
1913 spin_unlock(&swap_lock);
1914 frontswap_invalidate_area(p->type);
1915 frontswap_map_set(p, NULL);
1916 mutex_unlock(&swapon_mutex);
1917 free_percpu(p->percpu_cluster);
1918 p->percpu_cluster = NULL;
1919 vfree(swap_map);
1920 vfree(cluster_info);
1921 vfree(frontswap_map);
1922
1923 swap_cgroup_swapoff(p->type);
1924
1925 inode = mapping->host;
1926 if (S_ISBLK(inode->i_mode)) {
1927 struct block_device *bdev = I_BDEV(inode);
1928 set_blocksize(bdev, old_block_size);
1929 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1930 } else {
1931 mutex_lock(&inode->i_mutex);
1932 inode->i_flags &= ~S_SWAPFILE;
1933 mutex_unlock(&inode->i_mutex);
1934 }
1935 filp_close(swap_file, NULL);
1936
1937
1938
1939
1940
1941
1942 spin_lock(&swap_lock);
1943 p->flags = 0;
1944 spin_unlock(&swap_lock);
1945
1946 err = 0;
1947 atomic_inc(&proc_poll_event);
1948 wake_up_interruptible(&proc_poll_wait);
1949
1950out_dput:
1951 filp_close(victim, NULL);
1952out:
1953 putname(pathname);
1954 return err;
1955}
1956
1957#ifdef CONFIG_PROC_FS
1958static unsigned swaps_poll(struct file *file, poll_table *wait)
1959{
1960 struct seq_file *seq = file->private_data;
1961
1962 poll_wait(file, &proc_poll_wait, wait);
1963
1964 if (seq->poll_event != atomic_read(&proc_poll_event)) {
1965 seq->poll_event = atomic_read(&proc_poll_event);
1966 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1967 }
1968
1969 return POLLIN | POLLRDNORM;
1970}
1971
1972
1973static void *swap_start(struct seq_file *swap, loff_t *pos)
1974{
1975 struct swap_info_struct *si;
1976 int type;
1977 loff_t l = *pos;
1978
1979 mutex_lock(&swapon_mutex);
1980
1981 if (!l)
1982 return SEQ_START_TOKEN;
1983
1984 for (type = 0; type < nr_swapfiles; type++) {
1985 smp_rmb();
1986 si = swap_info[type];
1987 if (!(si->flags & SWP_USED) || !si->swap_map)
1988 continue;
1989 if (!--l)
1990 return si;
1991 }
1992
1993 return NULL;
1994}
1995
1996static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1997{
1998 struct swap_info_struct *si = v;
1999 int type;
2000
2001 if (v == SEQ_START_TOKEN)
2002 type = 0;
2003 else
2004 type = si->type + 1;
2005
2006 for (; type < nr_swapfiles; type++) {
2007 smp_rmb();
2008 si = swap_info[type];
2009 if (!(si->flags & SWP_USED) || !si->swap_map)
2010 continue;
2011 ++*pos;
2012 return si;
2013 }
2014
2015 return NULL;
2016}
2017
2018static void swap_stop(struct seq_file *swap, void *v)
2019{
2020 mutex_unlock(&swapon_mutex);
2021}
2022
2023static int swap_show(struct seq_file *swap, void *v)
2024{
2025 struct swap_info_struct *si = v;
2026 struct file *file;
2027 int len;
2028
2029 if (si == SEQ_START_TOKEN) {
2030 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
2031 return 0;
2032 }
2033
2034 file = si->swap_file;
2035 len = seq_path(swap, &file->f_path, " \t\n\\");
2036 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
2037 len < 40 ? 40 - len : 1, " ",
2038 S_ISBLK(file_inode(file)->i_mode) ?
2039 "partition" : "file\t",
2040 si->pages << (PAGE_SHIFT - 10),
2041 si->inuse_pages << (PAGE_SHIFT - 10),
2042 si->prio);
2043 return 0;
2044}
2045
2046static const struct seq_operations swaps_op = {
2047 .start = swap_start,
2048 .next = swap_next,
2049 .stop = swap_stop,
2050 .show = swap_show
2051};
2052
2053static int swaps_open(struct inode *inode, struct file *file)
2054{
2055 struct seq_file *seq;
2056 int ret;
2057
2058 ret = seq_open(file, &swaps_op);
2059 if (ret)
2060 return ret;
2061
2062 seq = file->private_data;
2063 seq->poll_event = atomic_read(&proc_poll_event);
2064 return 0;
2065}
2066
2067static const struct file_operations proc_swaps_operations = {
2068 .open = swaps_open,
2069 .read = seq_read,
2070 .llseek = seq_lseek,
2071 .release = seq_release,
2072 .poll = swaps_poll,
2073};
2074
2075static int __init procswaps_init(void)
2076{
2077 proc_create("swaps", 0, NULL, &proc_swaps_operations);
2078 return 0;
2079}
2080__initcall(procswaps_init);
2081#endif
2082
2083#ifdef MAX_SWAPFILES_CHECK
2084static int __init max_swapfiles_check(void)
2085{
2086 MAX_SWAPFILES_CHECK();
2087 return 0;
2088}
2089late_initcall(max_swapfiles_check);
2090#endif
2091
2092static struct swap_info_struct *alloc_swap_info(void)
2093{
2094 struct swap_info_struct *p;
2095 unsigned int type;
2096
2097 p = kzalloc(sizeof(*p), GFP_KERNEL);
2098 if (!p)
2099 return ERR_PTR(-ENOMEM);
2100
2101 spin_lock(&swap_lock);
2102 for (type = 0; type < nr_swapfiles; type++) {
2103 if (!(swap_info[type]->flags & SWP_USED))
2104 break;
2105 }
2106 if (type >= MAX_SWAPFILES) {
2107 spin_unlock(&swap_lock);
2108 kfree(p);
2109 return ERR_PTR(-EPERM);
2110 }
2111 if (type >= nr_swapfiles) {
2112 p->type = type;
2113 swap_info[type] = p;
2114
2115
2116
2117
2118
2119 smp_wmb();
2120 nr_swapfiles++;
2121 } else {
2122 kfree(p);
2123 p = swap_info[type];
2124
2125
2126
2127
2128 }
2129 INIT_LIST_HEAD(&p->first_swap_extent.list);
2130 plist_node_init(&p->list, 0);
2131 plist_node_init(&p->avail_list, 0);
2132 p->flags = SWP_USED;
2133 spin_unlock(&swap_lock);
2134 spin_lock_init(&p->lock);
2135
2136 return p;
2137}
2138
2139static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
2140{
2141 int error;
2142
2143 if (S_ISBLK(inode->i_mode)) {
2144 p->bdev = bdgrab(I_BDEV(inode));
2145 error = blkdev_get(p->bdev,
2146 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
2147 sys_swapon);
2148 if (error < 0) {
2149 p->bdev = NULL;
2150 return -EINVAL;
2151 }
2152 p->old_block_size = block_size(p->bdev);
2153 error = set_blocksize(p->bdev, PAGE_SIZE);
2154 if (error < 0)
2155 return error;
2156 p->flags |= SWP_BLKDEV;
2157 } else if (S_ISREG(inode->i_mode)) {
2158 p->bdev = inode->i_sb->s_bdev;
2159 mutex_lock(&inode->i_mutex);
2160 if (IS_SWAPFILE(inode))
2161 return -EBUSY;
2162 } else
2163 return -EINVAL;
2164
2165 return 0;
2166}
2167
2168static unsigned long read_swap_header(struct swap_info_struct *p,
2169 union swap_header *swap_header,
2170 struct inode *inode)
2171{
2172 int i;
2173 unsigned long maxpages;
2174 unsigned long swapfilepages;
2175 unsigned long last_page;
2176
2177 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
2178 pr_err("Unable to find swap-space signature\n");
2179 return 0;
2180 }
2181
2182
2183 if (swab32(swap_header->info.version) == 1) {
2184 swab32s(&swap_header->info.version);
2185 swab32s(&swap_header->info.last_page);
2186 swab32s(&swap_header->info.nr_badpages);
2187 for (i = 0; i < swap_header->info.nr_badpages; i++)
2188 swab32s(&swap_header->info.badpages[i]);
2189 }
2190
2191 if (swap_header->info.version != 1) {
2192 pr_warn("Unable to handle swap header version %d\n",
2193 swap_header->info.version);
2194 return 0;
2195 }
2196
2197 p->lowest_bit = 1;
2198 p->cluster_next = 1;
2199 p->cluster_nr = 0;
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215 maxpages = swp_offset(pte_to_swp_entry(
2216 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
2217 last_page = swap_header->info.last_page;
2218 if (last_page > maxpages) {
2219 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
2220 maxpages << (PAGE_SHIFT - 10),
2221 last_page << (PAGE_SHIFT - 10));
2222 }
2223 if (maxpages > last_page) {
2224 maxpages = last_page + 1;
2225
2226 if ((unsigned int)maxpages == 0)
2227 maxpages = UINT_MAX;
2228 }
2229 p->highest_bit = maxpages - 1;
2230
2231 if (!maxpages)
2232 return 0;
2233 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
2234 if (swapfilepages && maxpages > swapfilepages) {
2235 pr_warn("Swap area shorter than signature indicates\n");
2236 return 0;
2237 }
2238 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
2239 return 0;
2240 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2241 return 0;
2242
2243 return maxpages;
2244}
2245
2246static int setup_swap_map_and_extents(struct swap_info_struct *p,
2247 union swap_header *swap_header,
2248 unsigned char *swap_map,
2249 struct swap_cluster_info *cluster_info,
2250 unsigned long maxpages,
2251 sector_t *span)
2252{
2253 int i;
2254 unsigned int nr_good_pages;
2255 int nr_extents;
2256 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
2257 unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER;
2258
2259 nr_good_pages = maxpages - 1;
2260
2261 cluster_set_null(&p->free_cluster_head);
2262 cluster_set_null(&p->free_cluster_tail);
2263 cluster_set_null(&p->discard_cluster_head);
2264 cluster_set_null(&p->discard_cluster_tail);
2265
2266 for (i = 0; i < swap_header->info.nr_badpages; i++) {
2267 unsigned int page_nr = swap_header->info.badpages[i];
2268 if (page_nr == 0 || page_nr > swap_header->info.last_page)
2269 return -EINVAL;
2270 if (page_nr < maxpages) {
2271 swap_map[page_nr] = SWAP_MAP_BAD;
2272 nr_good_pages--;
2273
2274
2275
2276
2277 inc_cluster_info_page(p, cluster_info, page_nr);
2278 }
2279 }
2280
2281
2282 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
2283 inc_cluster_info_page(p, cluster_info, i);
2284
2285 if (nr_good_pages) {
2286 swap_map[0] = SWAP_MAP_BAD;
2287
2288
2289
2290
2291 inc_cluster_info_page(p, cluster_info, 0);
2292 p->max = maxpages;
2293 p->pages = nr_good_pages;
2294 nr_extents = setup_swap_extents(p, span);
2295 if (nr_extents < 0)
2296 return nr_extents;
2297 nr_good_pages = p->pages;
2298 }
2299 if (!nr_good_pages) {
2300 pr_warn("Empty swap-file\n");
2301 return -EINVAL;
2302 }
2303
2304 if (!cluster_info)
2305 return nr_extents;
2306
2307 for (i = 0; i < nr_clusters; i++) {
2308 if (!cluster_count(&cluster_info[idx])) {
2309 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
2310 if (cluster_is_null(&p->free_cluster_head)) {
2311 cluster_set_next_flag(&p->free_cluster_head,
2312 idx, 0);
2313 cluster_set_next_flag(&p->free_cluster_tail,
2314 idx, 0);
2315 } else {
2316 unsigned int tail;
2317
2318 tail = cluster_next(&p->free_cluster_tail);
2319 cluster_set_next(&cluster_info[tail], idx);
2320 cluster_set_next_flag(&p->free_cluster_tail,
2321 idx, 0);
2322 }
2323 }
2324 idx++;
2325 if (idx == nr_clusters)
2326 idx = 0;
2327 }
2328 return nr_extents;
2329}
2330
2331
2332
2333
2334
2335static bool swap_discardable(struct swap_info_struct *si)
2336{
2337 struct request_queue *q = bdev_get_queue(si->bdev);
2338
2339 if (!q || !blk_queue_discard(q))
2340 return false;
2341
2342 return true;
2343}
2344
2345SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2346{
2347 struct swap_info_struct *p;
2348 struct filename *name;
2349 struct file *swap_file = NULL;
2350 struct address_space *mapping;
2351 int i;
2352 int prio;
2353 int error;
2354 union swap_header *swap_header;
2355 int nr_extents;
2356 sector_t span;
2357 unsigned long maxpages;
2358 unsigned char *swap_map = NULL;
2359 struct swap_cluster_info *cluster_info = NULL;
2360 unsigned long *frontswap_map = NULL;
2361 struct page *page = NULL;
2362 struct inode *inode = NULL;
2363
2364 if (swap_flags & ~SWAP_FLAGS_VALID)
2365 return -EINVAL;
2366
2367 if (!capable(CAP_SYS_ADMIN))
2368 return -EPERM;
2369
2370 p = alloc_swap_info();
2371 if (IS_ERR(p))
2372 return PTR_ERR(p);
2373
2374 INIT_WORK(&p->discard_work, swap_discard_work);
2375
2376 name = getname(specialfile);
2377 if (IS_ERR(name)) {
2378 error = PTR_ERR(name);
2379 name = NULL;
2380 goto bad_swap;
2381 }
2382 swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
2383 if (IS_ERR(swap_file)) {
2384 error = PTR_ERR(swap_file);
2385 swap_file = NULL;
2386 goto bad_swap;
2387 }
2388
2389 p->swap_file = swap_file;
2390 mapping = swap_file->f_mapping;
2391
2392 for (i = 0; i < nr_swapfiles; i++) {
2393 struct swap_info_struct *q = swap_info[i];
2394
2395 if (q == p || !q->swap_file)
2396 continue;
2397 if (mapping == q->swap_file->f_mapping) {
2398 error = -EBUSY;
2399 goto bad_swap;
2400 }
2401 }
2402
2403 inode = mapping->host;
2404
2405 error = claim_swapfile(p, inode);
2406 if (unlikely(error))
2407 goto bad_swap;
2408
2409
2410
2411
2412 if (!mapping->a_ops->readpage) {
2413 error = -EINVAL;
2414 goto bad_swap;
2415 }
2416 page = read_mapping_page(mapping, 0, swap_file);
2417 if (IS_ERR(page)) {
2418 error = PTR_ERR(page);
2419 goto bad_swap;
2420 }
2421 swap_header = kmap(page);
2422
2423 maxpages = read_swap_header(p, swap_header, inode);
2424 if (unlikely(!maxpages)) {
2425 error = -EINVAL;
2426 goto bad_swap;
2427 }
2428
2429
2430 swap_map = vzalloc(maxpages);
2431 if (!swap_map) {
2432 error = -ENOMEM;
2433 goto bad_swap;
2434 }
2435 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2436 p->flags |= SWP_SOLIDSTATE;
2437
2438
2439
2440
2441 p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
2442
2443 cluster_info = vzalloc(DIV_ROUND_UP(maxpages,
2444 SWAPFILE_CLUSTER) * sizeof(*cluster_info));
2445 if (!cluster_info) {
2446 error = -ENOMEM;
2447 goto bad_swap;
2448 }
2449 p->percpu_cluster = alloc_percpu(struct percpu_cluster);
2450 if (!p->percpu_cluster) {
2451 error = -ENOMEM;
2452 goto bad_swap;
2453 }
2454 for_each_possible_cpu(i) {
2455 struct percpu_cluster *cluster;
2456 cluster = per_cpu_ptr(p->percpu_cluster, i);
2457 cluster_set_null(&cluster->index);
2458 }
2459 }
2460
2461 error = swap_cgroup_swapon(p->type, maxpages);
2462 if (error)
2463 goto bad_swap;
2464
2465 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2466 cluster_info, maxpages, &span);
2467 if (unlikely(nr_extents < 0)) {
2468 error = nr_extents;
2469 goto bad_swap;
2470 }
2471
2472 if (frontswap_enabled)
2473 frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
2474
2475 if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
2476
2477
2478
2479
2480
2481
2482 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
2483 SWP_PAGE_DISCARD);
2484
2485
2486
2487
2488
2489
2490
2491 if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
2492 p->flags &= ~SWP_PAGE_DISCARD;
2493 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
2494 p->flags &= ~SWP_AREA_DISCARD;
2495
2496
2497 if (p->flags & SWP_AREA_DISCARD) {
2498 int err = discard_swap(p);
2499 if (unlikely(err))
2500 pr_err("swapon: discard_swap(%p): %d\n",
2501 p, err);
2502 }
2503 }
2504
2505 mutex_lock(&swapon_mutex);
2506 prio = -1;
2507 if (swap_flags & SWAP_FLAG_PREFER)
2508 prio =
2509 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2510 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
2511
2512 pr_info("Adding %uk swap on %s. "
2513 "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
2514 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
2515 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2516 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2517 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2518 (p->flags & SWP_AREA_DISCARD) ? "s" : "",
2519 (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
2520 (frontswap_map) ? "FS" : "");
2521
2522 mutex_unlock(&swapon_mutex);
2523 atomic_inc(&proc_poll_event);
2524 wake_up_interruptible(&proc_poll_wait);
2525
2526 if (S_ISREG(inode->i_mode))
2527 inode->i_flags |= S_SWAPFILE;
2528 error = 0;
2529 goto out;
2530bad_swap:
2531 free_percpu(p->percpu_cluster);
2532 p->percpu_cluster = NULL;
2533 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2534 set_blocksize(p->bdev, p->old_block_size);
2535 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2536 }
2537 destroy_swap_extents(p);
2538 swap_cgroup_swapoff(p->type);
2539 spin_lock(&swap_lock);
2540 p->swap_file = NULL;
2541 p->flags = 0;
2542 spin_unlock(&swap_lock);
2543 vfree(swap_map);
2544 vfree(cluster_info);
2545 if (swap_file) {
2546 if (inode && S_ISREG(inode->i_mode)) {
2547 mutex_unlock(&inode->i_mutex);
2548 inode = NULL;
2549 }
2550 filp_close(swap_file, NULL);
2551 }
2552out:
2553 if (page && !IS_ERR(page)) {
2554 kunmap(page);
2555 page_cache_release(page);
2556 }
2557 if (name)
2558 putname(name);
2559 if (inode && S_ISREG(inode->i_mode))
2560 mutex_unlock(&inode->i_mutex);
2561 return error;
2562}
2563
2564void si_swapinfo(struct sysinfo *val)
2565{
2566 unsigned int type;
2567 unsigned long nr_to_be_unused = 0;
2568
2569 spin_lock(&swap_lock);
2570 for (type = 0; type < nr_swapfiles; type++) {
2571 struct swap_info_struct *si = swap_info[type];
2572
2573 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2574 nr_to_be_unused += si->inuse_pages;
2575 }
2576 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
2577 val->totalswap = total_swap_pages + nr_to_be_unused;
2578 spin_unlock(&swap_lock);
2579}
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2593{
2594 struct swap_info_struct *p;
2595 unsigned long offset, type;
2596 unsigned char count;
2597 unsigned char has_cache;
2598 int err = -EINVAL;
2599
2600 if (non_swap_entry(entry))
2601 goto out;
2602
2603 type = swp_type(entry);
2604 if (type >= nr_swapfiles)
2605 goto bad_file;
2606 p = swap_info[type];
2607 offset = swp_offset(entry);
2608
2609 spin_lock(&p->lock);
2610 if (unlikely(offset >= p->max))
2611 goto unlock_out;
2612
2613 count = p->swap_map[offset];
2614
2615
2616
2617
2618
2619 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
2620 err = -ENOENT;
2621 goto unlock_out;
2622 }
2623
2624 has_cache = count & SWAP_HAS_CACHE;
2625 count &= ~SWAP_HAS_CACHE;
2626 err = 0;
2627
2628 if (usage == SWAP_HAS_CACHE) {
2629
2630
2631 if (!has_cache && count)
2632 has_cache = SWAP_HAS_CACHE;
2633 else if (has_cache)
2634 err = -EEXIST;
2635 else
2636 err = -ENOENT;
2637
2638 } else if (count || has_cache) {
2639
2640 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2641 count += usage;
2642 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2643 err = -EINVAL;
2644 else if (swap_count_continued(p, offset, count))
2645 count = COUNT_CONTINUED;
2646 else
2647 err = -ENOMEM;
2648 } else
2649 err = -ENOENT;
2650
2651 p->swap_map[offset] = count | has_cache;
2652
2653unlock_out:
2654 spin_unlock(&p->lock);
2655out:
2656 return err;
2657
2658bad_file:
2659 pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
2660 goto out;
2661}
2662
2663
2664
2665
2666
2667void swap_shmem_alloc(swp_entry_t entry)
2668{
2669 __swap_duplicate(entry, SWAP_MAP_SHMEM);
2670}
2671
2672
2673
2674
2675
2676
2677
2678
2679int swap_duplicate(swp_entry_t entry)
2680{
2681 int err = 0;
2682
2683 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2684 err = add_swap_count_continuation(entry, GFP_ATOMIC);
2685 return err;
2686}
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696int swapcache_prepare(swp_entry_t entry)
2697{
2698 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2699}
2700
2701struct swap_info_struct *page_swap_info(struct page *page)
2702{
2703 swp_entry_t swap = { .val = page_private(page) };
2704 BUG_ON(!PageSwapCache(page));
2705 return swap_info[swp_type(swap)];
2706}
2707
2708
2709
2710
2711struct address_space *__page_file_mapping(struct page *page)
2712{
2713 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
2714 return page_swap_info(page)->swap_file->f_mapping;
2715}
2716EXPORT_SYMBOL_GPL(__page_file_mapping);
2717
2718pgoff_t __page_file_index(struct page *page)
2719{
2720 swp_entry_t swap = { .val = page_private(page) };
2721 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
2722 return swp_offset(swap);
2723}
2724EXPORT_SYMBOL_GPL(__page_file_index);
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2742{
2743 struct swap_info_struct *si;
2744 struct page *head;
2745 struct page *page;
2746 struct page *list_page;
2747 pgoff_t offset;
2748 unsigned char count;
2749
2750
2751
2752
2753
2754 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
2755
2756 si = swap_info_get(entry);
2757 if (!si) {
2758
2759
2760
2761
2762
2763 goto outer;
2764 }
2765
2766 offset = swp_offset(entry);
2767 count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
2768
2769 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
2770
2771
2772
2773
2774
2775 goto out;
2776 }
2777
2778 if (!page) {
2779 spin_unlock(&si->lock);
2780 return -ENOMEM;
2781 }
2782
2783
2784
2785
2786
2787
2788 head = vmalloc_to_page(si->swap_map + offset);
2789 offset &= ~PAGE_MASK;
2790
2791
2792
2793
2794
2795 if (!page_private(head)) {
2796 BUG_ON(count & COUNT_CONTINUED);
2797 INIT_LIST_HEAD(&head->lru);
2798 set_page_private(head, SWP_CONTINUED);
2799 si->flags |= SWP_CONTINUED;
2800 }
2801
2802 list_for_each_entry(list_page, &head->lru, lru) {
2803 unsigned char *map;
2804
2805
2806
2807
2808
2809 if (!(count & COUNT_CONTINUED))
2810 goto out;
2811
2812 map = kmap_atomic(list_page) + offset;
2813 count = *map;
2814 kunmap_atomic(map);
2815
2816
2817
2818
2819
2820 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
2821 goto out;
2822 }
2823
2824 list_add_tail(&page->lru, &head->lru);
2825 page = NULL;
2826out:
2827 spin_unlock(&si->lock);
2828outer:
2829 if (page)
2830 __free_page(page);
2831 return 0;
2832}
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842static bool swap_count_continued(struct swap_info_struct *si,
2843 pgoff_t offset, unsigned char count)
2844{
2845 struct page *head;
2846 struct page *page;
2847 unsigned char *map;
2848
2849 head = vmalloc_to_page(si->swap_map + offset);
2850 if (page_private(head) != SWP_CONTINUED) {
2851 BUG_ON(count & COUNT_CONTINUED);
2852 return false;
2853 }
2854
2855 offset &= ~PAGE_MASK;
2856 page = list_entry(head->lru.next, struct page, lru);
2857 map = kmap_atomic(page) + offset;
2858
2859 if (count == SWAP_MAP_MAX)
2860 goto init_map;
2861
2862 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) {
2863
2864
2865
2866 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2867 kunmap_atomic(map);
2868 page = list_entry(page->lru.next, struct page, lru);
2869 BUG_ON(page == head);
2870 map = kmap_atomic(page) + offset;
2871 }
2872 if (*map == SWAP_CONT_MAX) {
2873 kunmap_atomic(map);
2874 page = list_entry(page->lru.next, struct page, lru);
2875 if (page == head)
2876 return false;
2877 map = kmap_atomic(page) + offset;
2878init_map: *map = 0;
2879 }
2880 *map += 1;
2881 kunmap_atomic(map);
2882 page = list_entry(page->lru.prev, struct page, lru);
2883 while (page != head) {
2884 map = kmap_atomic(page) + offset;
2885 *map = COUNT_CONTINUED;
2886 kunmap_atomic(map);
2887 page = list_entry(page->lru.prev, struct page, lru);
2888 }
2889 return true;
2890
2891 } else {
2892
2893
2894
2895 BUG_ON(count != COUNT_CONTINUED);
2896 while (*map == COUNT_CONTINUED) {
2897 kunmap_atomic(map);
2898 page = list_entry(page->lru.next, struct page, lru);
2899 BUG_ON(page == head);
2900 map = kmap_atomic(page) + offset;
2901 }
2902 BUG_ON(*map == 0);
2903 *map -= 1;
2904 if (*map == 0)
2905 count = 0;
2906 kunmap_atomic(map);
2907 page = list_entry(page->lru.prev, struct page, lru);
2908 while (page != head) {
2909 map = kmap_atomic(page) + offset;
2910 *map = SWAP_CONT_MAX | count;
2911 count = COUNT_CONTINUED;
2912 kunmap_atomic(map);
2913 page = list_entry(page->lru.prev, struct page, lru);
2914 }
2915 return count == COUNT_CONTINUED;
2916 }
2917}
2918
2919
2920
2921
2922
2923static void free_swap_count_continuations(struct swap_info_struct *si)
2924{
2925 pgoff_t offset;
2926
2927 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2928 struct page *head;
2929 head = vmalloc_to_page(si->swap_map + offset);
2930 if (page_private(head)) {
2931 struct list_head *this, *next;
2932 list_for_each_safe(this, next, &head->lru) {
2933 struct page *page;
2934 page = list_entry(this, struct page, lru);
2935 list_del(this);
2936 __free_page(page);
2937 }
2938 }
2939 }
2940}
2941