1
2
3
4
5
6
7
8#include <linux/mm.h>
9#include <linux/hugetlb.h>
10#include <linux/mman.h>
11#include <linux/slab.h>
12#include <linux/kernel_stat.h>
13#include <linux/swap.h>
14#include <linux/vmalloc.h>
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/shmem_fs.h>
18#include <linux/blkdev.h>
19#include <linux/random.h>
20#include <linux/writeback.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/init.h>
24#include <linux/ksm.h>
25#include <linux/rmap.h>
26#include <linux/security.h>
27#include <linux/backing-dev.h>
28#include <linux/mutex.h>
29#include <linux/capability.h>
30#include <linux/syscalls.h>
31#include <linux/memcontrol.h>
32#include <linux/poll.h>
33#include <linux/oom.h>
34#include <linux/frontswap.h>
35#include <linux/swapfile.h>
36#include <linux/export.h>
37
38#include <asm/pgtable.h>
39#include <asm/tlbflush.h>
40#include <linux/swapops.h>
41#include <linux/swap_cgroup.h>
42
/* Forward declarations of file-local helpers used before their definitions. */
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);

/* Held while walking swap_info[] by type (see swap_type_of, count_swap_pages). */
DEFINE_SPINLOCK(swap_lock);
/* Swap types >= nr_swapfiles are rejected as invalid (see swap_info_get). */
static unsigned int nr_swapfiles;
/*
 * Count of swap pages still available for allocation: decremented when
 * get_swap_page()/get_swap_page_of_type() reserve a slot and incremented
 * again when the reservation fails or swap_entry_free() releases a slot.
 */
atomic_long_t nr_swap_pages;
EXPORT_SYMBOL_GPL(nr_swap_pages);

/* NOTE(review): not written in the visible part of this file; presumably the sum of all devices' pages — confirm. */
long total_swap_pages;
static int least_priority;

/* Message fragments printed by swap_info_get() when it rejects an entry. */
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/*
 * NOTE(review): not referenced in the visible part of this file; presumably
 * the priority-ordered list of all active swap devices — confirm.
 */
PLIST_HEAD(swap_active_head);

/*
 * Swap devices that currently have free slots, walked by get_swap_page().
 * A device is unlinked when its last free page is handed out
 * (scan_swap_map) or when get_swap_page() finds it unusable, and is linked
 * back in by swap_entry_free() when a slot is released on a device that
 * had none free.  All list manipulation in this file happens under
 * swap_avail_lock; where si->lock is also needed it is taken first.
 */
static PLIST_HEAD(swap_avail_head);
static DEFINE_SPINLOCK(swap_avail_lock);

/* Table of swap devices, indexed by swap type. */
struct swap_info_struct *swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* NOTE(review): users are outside the visible chunk; presumably bumped on swapon/swapoff for /proc pollers — confirm. */
static atomic_t proc_poll_event = ATOMIC_INIT(0);
94
95static inline unsigned char swap_count(unsigned char ent)
96{
97 return ent & ~SWAP_HAS_CACHE;
98}
99
100
101static int
102__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
103{
104 swp_entry_t entry = swp_entry(si->type, offset);
105 struct page *page;
106 int ret = 0;
107
108 page = find_get_page(swap_address_space(entry), entry.val);
109 if (!page)
110 return 0;
111
112
113
114
115
116
117
118 if (trylock_page(page)) {
119 ret = try_to_free_swap(page);
120 unlock_page(page);
121 }
122 put_page(page);
123 return ret;
124}
125
126
127
128
129
130static int discard_swap(struct swap_info_struct *si)
131{
132 struct swap_extent *se;
133 sector_t start_block;
134 sector_t nr_blocks;
135 int err = 0;
136
137
138 se = &si->first_swap_extent;
139 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
140 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
141 if (nr_blocks) {
142 err = blkdev_issue_discard(si->bdev, start_block,
143 nr_blocks, GFP_KERNEL, 0);
144 if (err)
145 return err;
146 cond_resched();
147 }
148
149 list_for_each_entry(se, &si->first_swap_extent.list, list) {
150 start_block = se->start_block << (PAGE_SHIFT - 9);
151 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
152
153 err = blkdev_issue_discard(si->bdev, start_block,
154 nr_blocks, GFP_KERNEL, 0);
155 if (err)
156 break;
157
158 cond_resched();
159 }
160 return err;
161}
162
163
164
165
166
167static void discard_swap_cluster(struct swap_info_struct *si,
168 pgoff_t start_page, pgoff_t nr_pages)
169{
170 struct swap_extent *se = si->curr_swap_extent;
171 int found_extent = 0;
172
173 while (nr_pages) {
174 if (se->start_page <= start_page &&
175 start_page < se->start_page + se->nr_pages) {
176 pgoff_t offset = start_page - se->start_page;
177 sector_t start_block = se->start_block + offset;
178 sector_t nr_blocks = se->nr_pages - offset;
179
180 if (nr_blocks > nr_pages)
181 nr_blocks = nr_pages;
182 start_page += nr_blocks;
183 nr_pages -= nr_blocks;
184
185 if (!found_extent++)
186 si->curr_swap_extent = se;
187
188 start_block <<= PAGE_SHIFT - 9;
189 nr_blocks <<= PAGE_SHIFT - 9;
190 if (blkdev_issue_discard(si->bdev, start_block,
191 nr_blocks, GFP_NOIO, 0))
192 break;
193 }
194
195 se = list_next_entry(se, list);
196 }
197}
198
199#define SWAPFILE_CLUSTER 256
200#define LATENCY_LIMIT 256
201
202static inline void cluster_set_flag(struct swap_cluster_info *info,
203 unsigned int flag)
204{
205 info->flags = flag;
206}
207
208static inline unsigned int cluster_count(struct swap_cluster_info *info)
209{
210 return info->data;
211}
212
213static inline void cluster_set_count(struct swap_cluster_info *info,
214 unsigned int c)
215{
216 info->data = c;
217}
218
219static inline void cluster_set_count_flag(struct swap_cluster_info *info,
220 unsigned int c, unsigned int f)
221{
222 info->flags = f;
223 info->data = c;
224}
225
226static inline unsigned int cluster_next(struct swap_cluster_info *info)
227{
228 return info->data;
229}
230
231static inline void cluster_set_next(struct swap_cluster_info *info,
232 unsigned int n)
233{
234 info->data = n;
235}
236
237static inline void cluster_set_next_flag(struct swap_cluster_info *info,
238 unsigned int n, unsigned int f)
239{
240 info->flags = f;
241 info->data = n;
242}
243
244static inline bool cluster_is_free(struct swap_cluster_info *info)
245{
246 return info->flags & CLUSTER_FLAG_FREE;
247}
248
249static inline bool cluster_is_null(struct swap_cluster_info *info)
250{
251 return info->flags & CLUSTER_FLAG_NEXT_NULL;
252}
253
254static inline void cluster_set_null(struct swap_cluster_info *info)
255{
256 info->flags = CLUSTER_FLAG_NEXT_NULL;
257 info->data = 0;
258}
259
260
261static void swap_cluster_schedule_discard(struct swap_info_struct *si,
262 unsigned int idx)
263{
264
265
266
267
268
269
270 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
271 SWAP_MAP_BAD, SWAPFILE_CLUSTER);
272
273 if (cluster_is_null(&si->discard_cluster_head)) {
274 cluster_set_next_flag(&si->discard_cluster_head,
275 idx, 0);
276 cluster_set_next_flag(&si->discard_cluster_tail,
277 idx, 0);
278 } else {
279 unsigned int tail = cluster_next(&si->discard_cluster_tail);
280 cluster_set_next(&si->cluster_info[tail], idx);
281 cluster_set_next_flag(&si->discard_cluster_tail,
282 idx, 0);
283 }
284
285 schedule_work(&si->discard_work);
286}
287
288
289
290
291
292static void swap_do_scheduled_discard(struct swap_info_struct *si)
293{
294 struct swap_cluster_info *info;
295 unsigned int idx;
296
297 info = si->cluster_info;
298
299 while (!cluster_is_null(&si->discard_cluster_head)) {
300 idx = cluster_next(&si->discard_cluster_head);
301
302 cluster_set_next_flag(&si->discard_cluster_head,
303 cluster_next(&info[idx]), 0);
304 if (cluster_next(&si->discard_cluster_tail) == idx) {
305 cluster_set_null(&si->discard_cluster_head);
306 cluster_set_null(&si->discard_cluster_tail);
307 }
308 spin_unlock(&si->lock);
309
310 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
311 SWAPFILE_CLUSTER);
312
313 spin_lock(&si->lock);
314 cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
315 if (cluster_is_null(&si->free_cluster_head)) {
316 cluster_set_next_flag(&si->free_cluster_head,
317 idx, 0);
318 cluster_set_next_flag(&si->free_cluster_tail,
319 idx, 0);
320 } else {
321 unsigned int tail;
322
323 tail = cluster_next(&si->free_cluster_tail);
324 cluster_set_next(&info[tail], idx);
325 cluster_set_next_flag(&si->free_cluster_tail,
326 idx, 0);
327 }
328 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
329 0, SWAPFILE_CLUSTER);
330 }
331}
332
333static void swap_discard_work(struct work_struct *work)
334{
335 struct swap_info_struct *si;
336
337 si = container_of(work, struct swap_info_struct, discard_work);
338
339 spin_lock(&si->lock);
340 swap_do_scheduled_discard(si);
341 spin_unlock(&si->lock);
342}
343
344
345
346
347
348static void inc_cluster_info_page(struct swap_info_struct *p,
349 struct swap_cluster_info *cluster_info, unsigned long page_nr)
350{
351 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
352
353 if (!cluster_info)
354 return;
355 if (cluster_is_free(&cluster_info[idx])) {
356 VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx);
357 cluster_set_next_flag(&p->free_cluster_head,
358 cluster_next(&cluster_info[idx]), 0);
359 if (cluster_next(&p->free_cluster_tail) == idx) {
360 cluster_set_null(&p->free_cluster_tail);
361 cluster_set_null(&p->free_cluster_head);
362 }
363 cluster_set_count_flag(&cluster_info[idx], 0, 0);
364 }
365
366 VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
367 cluster_set_count(&cluster_info[idx],
368 cluster_count(&cluster_info[idx]) + 1);
369}
370
371
372
373
374
375
376static void dec_cluster_info_page(struct swap_info_struct *p,
377 struct swap_cluster_info *cluster_info, unsigned long page_nr)
378{
379 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
380
381 if (!cluster_info)
382 return;
383
384 VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
385 cluster_set_count(&cluster_info[idx],
386 cluster_count(&cluster_info[idx]) - 1);
387
388 if (cluster_count(&cluster_info[idx]) == 0) {
389
390
391
392
393
394 if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
395 (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
396 swap_cluster_schedule_discard(p, idx);
397 return;
398 }
399
400 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
401 if (cluster_is_null(&p->free_cluster_head)) {
402 cluster_set_next_flag(&p->free_cluster_head, idx, 0);
403 cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
404 } else {
405 unsigned int tail = cluster_next(&p->free_cluster_tail);
406 cluster_set_next(&cluster_info[tail], idx);
407 cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
408 }
409 }
410}
411
412
413
414
415
416static bool
417scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
418 unsigned long offset)
419{
420 struct percpu_cluster *percpu_cluster;
421 bool conflict;
422
423 offset /= SWAPFILE_CLUSTER;
424 conflict = !cluster_is_null(&si->free_cluster_head) &&
425 offset != cluster_next(&si->free_cluster_head) &&
426 cluster_is_free(&si->cluster_info[offset]);
427
428 if (!conflict)
429 return false;
430
431 percpu_cluster = this_cpu_ptr(si->percpu_cluster);
432 cluster_set_null(&percpu_cluster->index);
433 return true;
434}
435
436
437
438
439
440static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
441 unsigned long *offset, unsigned long *scan_base)
442{
443 struct percpu_cluster *cluster;
444 bool found_free;
445 unsigned long tmp;
446
447new_cluster:
448 cluster = this_cpu_ptr(si->percpu_cluster);
449 if (cluster_is_null(&cluster->index)) {
450 if (!cluster_is_null(&si->free_cluster_head)) {
451 cluster->index = si->free_cluster_head;
452 cluster->next = cluster_next(&cluster->index) *
453 SWAPFILE_CLUSTER;
454 } else if (!cluster_is_null(&si->discard_cluster_head)) {
455
456
457
458
459 swap_do_scheduled_discard(si);
460 *scan_base = *offset = si->cluster_next;
461 goto new_cluster;
462 } else
463 return;
464 }
465
466 found_free = false;
467
468
469
470
471
472 tmp = cluster->next;
473 while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
474 SWAPFILE_CLUSTER) {
475 if (!si->swap_map[tmp]) {
476 found_free = true;
477 break;
478 }
479 tmp++;
480 }
481 if (!found_free) {
482 cluster_set_null(&cluster->index);
483 goto new_cluster;
484 }
485 cluster->next = tmp + 1;
486 *offset = tmp;
487 *scan_base = tmp;
488}
489
/*
 * Find a free slot on swap device @si, store @usage in its swap_map byte
 * and return its offset; return 0 if no slot could be found.
 *
 * Callers in this file invoke this with si->lock held.  The lock is
 * dropped and retaken internally around the long unlocked scans and around
 * __try_to_reclaim_swap(), so every candidate found without the lock is
 * re-validated under the lock at the "checks" label.  SWP_SCANNING is
 * added to si->flags for the duration of the call and subtracted again on
 * every exit path.
 */
static unsigned long scan_swap_map(struct swap_info_struct *si,
				   unsigned char usage)
{
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;

	si->flags += SWP_SCANNING;
	scan_base = offset = si->cluster_next;

	/* Cluster-info based allocation (the *_ssd_cluster helpers). */
	if (si->cluster_info) {
		scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
		goto checks;
	}

	/*
	 * cluster_nr counts down the slots left in the current run; when it
	 * hits zero, look for a fresh run of SWAPFILE_CLUSTER free slots.
	 */
	if (unlikely(!si->cluster_nr--)) {
		/* Fewer free pages than one cluster: don't bother searching. */
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			goto checks;
		}

		spin_unlock(&si->lock);

		/*
		 * Unlocked scan from lowest_bit.  last_in_cluster is pushed
		 * forward past every used slot, so offset only reaches it
		 * after SWAPFILE_CLUSTER consecutive free slots.
		 */
		scan_base = offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Locate the first empty run of SWAPFILE_CLUSTER slots. */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&si->lock);
				/* Step back to the first slot of the run. */
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		/* No empty run found: fall back to where the scan started. */
		offset = scan_base;
		spin_lock(&si->lock);
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
	}

checks:
	/* si->lock is held here. */
	if (si->cluster_info) {
		while (scan_swap_map_ssd_cluster_conflict(si, offset))
			scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
	}
	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	/* highest_bit == 0 is how a full device is marked (see below). */
	if (!si->highest_bit)
		goto no_page;
	if (offset > si->highest_bit)
		scan_base = offset = si->lowest_bit;

	/*
	 * Swap is filling up and this slot is held only by the swap cache:
	 * try to reclaim it (lock dropped around the attempt).
	 */
	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
		int swap_was_freed;
		spin_unlock(&si->lock);
		swap_was_freed = __try_to_reclaim_swap(si, offset);
		spin_lock(&si->lock);
		/* Freed: re-run the checks on the same offset. */
		if (swap_was_freed)
			goto checks;
		goto scan; /* otherwise move on to the next slot */
	}

	if (si->swap_map[offset])
		goto scan;

	/* Slot is free: claim it and update the device bookkeeping. */
	if (offset == si->lowest_bit)
		si->lowest_bit++;
	if (offset == si->highest_bit)
		si->highest_bit--;
	si->inuse_pages++;
	if (si->inuse_pages == si->pages) {
		/* Device is now full: mark it so and take it off the avail list. */
		si->lowest_bit = si->max;
		si->highest_bit = 0;
		spin_lock(&swap_avail_lock);
		plist_del(&si->avail_list, &swap_avail_head);
		spin_unlock(&swap_avail_lock);
	}
	si->swap_map[offset] = usage;
	inc_cluster_info_page(si, si->cluster_info, offset);
	si->cluster_next = offset + 1;
	si->flags -= SWP_SCANNING;

	return offset;

scan:
	/*
	 * Unlocked linear scan: first upward from offset to highest_bit,
	 * then from lowest_bit up to (but excluding) scan_base.  Any
	 * candidate is re-validated under the lock at "checks".
	 */
	spin_unlock(&si->lock);
	while (++offset <= si->highest_bit) {
		if (!si->swap_map[offset]) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	offset = si->lowest_bit;
	while (offset < scan_base) {
		if (!si->swap_map[offset]) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
		offset++;
	}
	spin_lock(&si->lock);

no_page:
	si->flags -= SWP_SCANNING;
	return 0;
}
641
/*
 * Allocate one swap slot (marked SWAP_HAS_CACHE) from any device on the
 * available list.
 *
 * One page is reserved from nr_swap_pages up front and given back if no
 * device yields a slot.  Returns the new entry, or an entry with val == 0
 * on failure.
 */
swp_entry_t get_swap_page(void)
{
	struct swap_info_struct *si, *next;
	pgoff_t offset;

	if (atomic_long_read(&nr_swap_pages) <= 0)
		goto noswap;
	atomic_long_dec(&nr_swap_pages);

	spin_lock(&swap_avail_lock);

start_over:
	plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
		/*
		 * Requeue si within the list before using it.
		 * NOTE(review): presumably this rotates allocation among
		 * devices of equal priority — confirm against plist_requeue().
		 */
		plist_requeue(&si->avail_list, &swap_avail_head);
		spin_unlock(&swap_avail_lock);
		spin_lock(&si->lock);
		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
			/* Device is full or not writable: it should not be listed. */
			spin_lock(&swap_avail_lock);
			if (plist_node_empty(&si->avail_list)) {
				/* Already unlinked by someone else; nothing to fix. */
				spin_unlock(&si->lock);
				goto nextsi;
			}
			WARN(!si->highest_bit,
			     "swap_info %d in list but !highest_bit\n",
			     si->type);
			WARN(!(si->flags & SWP_WRITEOK),
			     "swap_info %d in list but !SWP_WRITEOK\n",
			     si->type);
			plist_del(&si->avail_list, &swap_avail_head);
			spin_unlock(&si->lock);
			goto nextsi;
		}

		/* Allocate a slot that is initially owned by the swap cache. */
		offset = scan_swap_map(si, SWAP_HAS_CACHE);
		spin_unlock(&si->lock);
		if (offset)
			return swp_entry(si->type, offset);
		pr_debug("scan_swap_map of si %d failed to find offset\n",
		       si->type);
		spin_lock(&swap_avail_lock);
nextsi:
		/*
		 * swap_avail_lock is held again here.  "next" was sampled by
		 * the iterator before the lock was dropped above; if it has
		 * since been unlinked (its node is empty) the walk cannot
		 * continue from it, so restart from the head of the list.
		 */
		if (plist_node_empty(&next->avail_list))
			goto start_over;
	}

	spin_unlock(&swap_avail_lock);

	/* Nothing found: return the reservation taken above. */
	atomic_long_inc(&nr_swap_pages);
noswap:
	return (swp_entry_t) {0};
}
705
706
707swp_entry_t get_swap_page_of_type(int type)
708{
709 struct swap_info_struct *si;
710 pgoff_t offset;
711
712 si = swap_info[type];
713 spin_lock(&si->lock);
714 if (si && (si->flags & SWP_WRITEOK)) {
715 atomic_long_dec(&nr_swap_pages);
716
717 offset = scan_swap_map(si, 1);
718 if (offset) {
719 spin_unlock(&si->lock);
720 return swp_entry(type, offset);
721 }
722 atomic_long_inc(&nr_swap_pages);
723 }
724 spin_unlock(&si->lock);
725 return (swp_entry_t) {0};
726}
727
728static struct swap_info_struct *swap_info_get(swp_entry_t entry)
729{
730 struct swap_info_struct *p;
731 unsigned long offset, type;
732
733 if (!entry.val)
734 goto out;
735 type = swp_type(entry);
736 if (type >= nr_swapfiles)
737 goto bad_nofile;
738 p = swap_info[type];
739 if (!(p->flags & SWP_USED))
740 goto bad_device;
741 offset = swp_offset(entry);
742 if (offset >= p->max)
743 goto bad_offset;
744 if (!p->swap_map[offset])
745 goto bad_free;
746 spin_lock(&p->lock);
747 return p;
748
749bad_free:
750 pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val);
751 goto out;
752bad_offset:
753 pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val);
754 goto out;
755bad_device:
756 pr_err("swap_free: %s%08lx\n", Unused_file, entry.val);
757 goto out;
758bad_nofile:
759 pr_err("swap_free: %s%08lx\n", Bad_file, entry.val);
760out:
761 return NULL;
762}
763
764static unsigned char swap_entry_free(struct swap_info_struct *p,
765 swp_entry_t entry, unsigned char usage)
766{
767 unsigned long offset = swp_offset(entry);
768 unsigned char count;
769 unsigned char has_cache;
770
771 count = p->swap_map[offset];
772 has_cache = count & SWAP_HAS_CACHE;
773 count &= ~SWAP_HAS_CACHE;
774
775 if (usage == SWAP_HAS_CACHE) {
776 VM_BUG_ON(!has_cache);
777 has_cache = 0;
778 } else if (count == SWAP_MAP_SHMEM) {
779
780
781
782
783 count = 0;
784 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
785 if (count == COUNT_CONTINUED) {
786 if (swap_count_continued(p, offset, count))
787 count = SWAP_MAP_MAX | COUNT_CONTINUED;
788 else
789 count = SWAP_MAP_MAX;
790 } else
791 count--;
792 }
793
794 usage = count | has_cache;
795 p->swap_map[offset] = usage;
796
797
798 if (!usage) {
799 mem_cgroup_uncharge_swap(entry);
800 dec_cluster_info_page(p, p->cluster_info, offset);
801 if (offset < p->lowest_bit)
802 p->lowest_bit = offset;
803 if (offset > p->highest_bit) {
804 bool was_full = !p->highest_bit;
805 p->highest_bit = offset;
806 if (was_full && (p->flags & SWP_WRITEOK)) {
807 spin_lock(&swap_avail_lock);
808 WARN_ON(!plist_node_empty(&p->avail_list));
809 if (plist_node_empty(&p->avail_list))
810 plist_add(&p->avail_list,
811 &swap_avail_head);
812 spin_unlock(&swap_avail_lock);
813 }
814 }
815 atomic_long_inc(&nr_swap_pages);
816 p->inuse_pages--;
817 frontswap_invalidate_page(p->type, offset);
818 if (p->flags & SWP_BLKDEV) {
819 struct gendisk *disk = p->bdev->bd_disk;
820 if (disk->fops->swap_slot_free_notify)
821 disk->fops->swap_slot_free_notify(p->bdev,
822 offset);
823 }
824 }
825
826 return usage;
827}
828
829
830
831
832
833void swap_free(swp_entry_t entry)
834{
835 struct swap_info_struct *p;
836
837 p = swap_info_get(entry);
838 if (p) {
839 swap_entry_free(p, entry, 1);
840 spin_unlock(&p->lock);
841 }
842}
843
844
845
846
847void swapcache_free(swp_entry_t entry)
848{
849 struct swap_info_struct *p;
850
851 p = swap_info_get(entry);
852 if (p) {
853 swap_entry_free(p, entry, SWAP_HAS_CACHE);
854 spin_unlock(&p->lock);
855 }
856}
857
858
859
860
861
862
863int page_swapcount(struct page *page)
864{
865 int count = 0;
866 struct swap_info_struct *p;
867 swp_entry_t entry;
868
869 entry.val = page_private(page);
870 p = swap_info_get(entry);
871 if (p) {
872 count = swap_count(p->swap_map[swp_offset(entry)]);
873 spin_unlock(&p->lock);
874 }
875 return count;
876}
877
878
879
880
881
882int swp_swapcount(swp_entry_t entry)
883{
884 int count, tmp_count, n;
885 struct swap_info_struct *p;
886 struct page *page;
887 pgoff_t offset;
888 unsigned char *map;
889
890 p = swap_info_get(entry);
891 if (!p)
892 return 0;
893
894 count = swap_count(p->swap_map[swp_offset(entry)]);
895 if (!(count & COUNT_CONTINUED))
896 goto out;
897
898 count &= ~COUNT_CONTINUED;
899 n = SWAP_MAP_MAX + 1;
900
901 offset = swp_offset(entry);
902 page = vmalloc_to_page(p->swap_map + offset);
903 offset &= ~PAGE_MASK;
904 VM_BUG_ON(page_private(page) != SWP_CONTINUED);
905
906 do {
907 page = list_next_entry(page, lru);
908 map = kmap_atomic(page);
909 tmp_count = map[offset];
910 kunmap_atomic(map);
911
912 count += (tmp_count & ~COUNT_CONTINUED) * n;
913 n *= (SWAP_CONT_MAX + 1);
914 } while (tmp_count & COUNT_CONTINUED);
915out:
916 spin_unlock(&p->lock);
917 return count;
918}
919
920
921
922
923
924
925
926
927
928
929
930bool reuse_swap_page(struct page *page, int *total_mapcount)
931{
932 int count;
933
934 VM_BUG_ON_PAGE(!PageLocked(page), page);
935 if (unlikely(PageKsm(page)))
936 return false;
937 count = page_trans_huge_mapcount(page, total_mapcount);
938 if (count <= 1 && PageSwapCache(page)) {
939 count += page_swapcount(page);
940 if (count == 1 && !PageWriteback(page)) {
941 delete_from_swap_cache(page);
942 SetPageDirty(page);
943 }
944 }
945 return count <= 1;
946}
947
948
949
950
951
/*
 * If the locked @page is in the swap cache, not under writeback and has no
 * remaining swap references, remove it from the swap cache (freeing its
 * slot) and mark it dirty.
 *
 * Nothing is freed while pm_suspended_storage() reports true.
 * NOTE(review): presumably because the swap slots may be in use by a
 * hibernation image at that point — confirm.
 *
 * Returns 1 if the page was removed from the swap cache, 0 otherwise.
 */
int try_to_free_swap(struct page *page)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (!PageSwapCache(page) || PageWriteback(page) ||
	    page_swapcount(page))
		return 0;

	if (pm_suspended_storage())
		return 0;

	delete_from_swap_cache(page);
	SetPageDirty(page);
	return 1;
}
985
986
987
988
989
990int free_swap_and_cache(swp_entry_t entry)
991{
992 struct swap_info_struct *p;
993 struct page *page = NULL;
994
995 if (non_swap_entry(entry))
996 return 1;
997
998 p = swap_info_get(entry);
999 if (p) {
1000 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
1001 page = find_get_page(swap_address_space(entry),
1002 entry.val);
1003 if (page && !trylock_page(page)) {
1004 put_page(page);
1005 page = NULL;
1006 }
1007 }
1008 spin_unlock(&p->lock);
1009 }
1010 if (page) {
1011
1012
1013
1014
1015 if (PageSwapCache(page) && !PageWriteback(page) &&
1016 (!page_mapped(page) || mem_cgroup_swap_full(page))) {
1017 delete_from_swap_cache(page);
1018 SetPageDirty(page);
1019 }
1020 unlock_page(page);
1021 put_page(page);
1022 }
1023 return p != NULL;
1024}
1025
1026#ifdef CONFIG_HIBERNATION
1027
1028
1029
1030
1031
1032
1033
1034
1035int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
1036{
1037 struct block_device *bdev = NULL;
1038 int type;
1039
1040 if (device)
1041 bdev = bdget(device);
1042
1043 spin_lock(&swap_lock);
1044 for (type = 0; type < nr_swapfiles; type++) {
1045 struct swap_info_struct *sis = swap_info[type];
1046
1047 if (!(sis->flags & SWP_WRITEOK))
1048 continue;
1049
1050 if (!bdev) {
1051 if (bdev_p)
1052 *bdev_p = bdgrab(sis->bdev);
1053
1054 spin_unlock(&swap_lock);
1055 return type;
1056 }
1057 if (bdev == sis->bdev) {
1058 struct swap_extent *se = &sis->first_swap_extent;
1059
1060 if (se->start_block == offset) {
1061 if (bdev_p)
1062 *bdev_p = bdgrab(sis->bdev);
1063
1064 spin_unlock(&swap_lock);
1065 bdput(bdev);
1066 return type;
1067 }
1068 }
1069 }
1070 spin_unlock(&swap_lock);
1071 if (bdev)
1072 bdput(bdev);
1073
1074 return -ENODEV;
1075}
1076
1077
1078
1079
1080
1081sector_t swapdev_block(int type, pgoff_t offset)
1082{
1083 struct block_device *bdev;
1084
1085 if ((unsigned int)type >= nr_swapfiles)
1086 return 0;
1087 if (!(swap_info[type]->flags & SWP_WRITEOK))
1088 return 0;
1089 return map_swap_entry(swp_entry(type, offset), &bdev);
1090}
1091
1092
1093
1094
1095
1096
1097
1098unsigned int count_swap_pages(int type, int free)
1099{
1100 unsigned int n = 0;
1101
1102 spin_lock(&swap_lock);
1103 if ((unsigned int)type < nr_swapfiles) {
1104 struct swap_info_struct *sis = swap_info[type];
1105
1106 spin_lock(&sis->lock);
1107 if (sis->flags & SWP_WRITEOK) {
1108 n = sis->pages;
1109 if (free)
1110 n -= sis->inuse_pages;
1111 }
1112 spin_unlock(&sis->lock);
1113 }
1114 spin_unlock(&swap_lock);
1115 return n;
1116}
1117#endif
1118
1119static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
1120{
1121 return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
1122}
1123
1124
1125
1126
1127
1128
/*
 * Replace the swap pte for @entry at @addr in @vma with a mapping of
 * @page.
 *
 * Returns 1 if the pte was replaced, 0 if the pte no longer holds @entry
 * once the page table lock is taken, and -ENOMEM if the KSM copy or the
 * memcg charge fails.
 */
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, swp_entry_t entry, struct page *page)
{
	struct page *swapcache;
	struct mem_cgroup *memcg;
	spinlock_t *ptl;
	pte_t *pte;
	int ret = 1;

	/*
	 * ksm_might_need_to_copy() may hand back a different page; remember
	 * the original so the two cases can be told apart below.
	 */
	swapcache = page;
	page = ksm_might_need_to_copy(page, vma, addr);
	if (unlikely(!page))
		return -ENOMEM;

	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
				&memcg, false)) {
		ret = -ENOMEM;
		goto out_nolock;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	/* Recheck under the pte lock: the pte may have changed meanwhile. */
	if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
		mem_cgroup_cancel_charge(page, memcg, false);
		ret = 0;
		goto out;
	}

	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
	get_page(page);
	set_pte_at(vma->vm_mm, addr, pte,
		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
	if (page == swapcache) {
		page_add_anon_rmap(page, vma, addr, false);
		mem_cgroup_commit_charge(page, memcg, true, false);
	} else { /* a new copy was made: treat it as a brand new anon page */
		page_add_new_anon_rmap(page, vma, addr, false);
		mem_cgroup_commit_charge(page, memcg, false, false);
		lru_cache_add_active_or_unevictable(page, vma);
	}
	/* The pte no longer refers to the swap slot: drop its reference. */
	swap_free(entry);
	/*
	 * NOTE(review): presumably activated so the page is not swapped
	 * straight back out — confirm.
	 */
	activate_page(page);
out:
	pte_unmap_unlock(pte, ptl);
out_nolock:
	/* A copied page is unlocked and released here; the original is left to the caller. */
	if (page != swapcache) {
		unlock_page(page);
		put_page(page);
	}
	return ret;
}
1184
1185static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1186 unsigned long addr, unsigned long end,
1187 swp_entry_t entry, struct page *page)
1188{
1189 pte_t swp_pte = swp_entry_to_pte(entry);
1190 pte_t *pte;
1191 int ret = 0;
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202 pte = pte_offset_map(pmd, addr);
1203 do {
1204
1205
1206
1207
1208 if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
1209 pte_unmap(pte);
1210 ret = unuse_pte(vma, pmd, addr, entry, page);
1211 if (ret)
1212 goto out;
1213 pte = pte_offset_map(pmd, addr);
1214 }
1215 } while (pte++, addr += PAGE_SIZE, addr != end);
1216 pte_unmap(pte - 1);
1217out:
1218 return ret;
1219}
1220
1221static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
1222 unsigned long addr, unsigned long end,
1223 swp_entry_t entry, struct page *page)
1224{
1225 pmd_t *pmd;
1226 unsigned long next;
1227 int ret;
1228
1229 pmd = pmd_offset(pud, addr);
1230 do {
1231 next = pmd_addr_end(addr, end);
1232 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1233 continue;
1234 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
1235 if (ret)
1236 return ret;
1237 } while (pmd++, addr = next, addr != end);
1238 return 0;
1239}
1240
1241static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
1242 unsigned long addr, unsigned long end,
1243 swp_entry_t entry, struct page *page)
1244{
1245 pud_t *pud;
1246 unsigned long next;
1247 int ret;
1248
1249 pud = pud_offset(pgd, addr);
1250 do {
1251 next = pud_addr_end(addr, end);
1252 if (pud_none_or_clear_bad(pud))
1253 continue;
1254 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
1255 if (ret)
1256 return ret;
1257 } while (pud++, addr = next, addr != end);
1258 return 0;
1259}
1260
1261static int unuse_vma(struct vm_area_struct *vma,
1262 swp_entry_t entry, struct page *page)
1263{
1264 pgd_t *pgd;
1265 unsigned long addr, end, next;
1266 int ret;
1267
1268 if (page_anon_vma(page)) {
1269 addr = page_address_in_vma(page, vma);
1270 if (addr == -EFAULT)
1271 return 0;
1272 else
1273 end = addr + PAGE_SIZE;
1274 } else {
1275 addr = vma->vm_start;
1276 end = vma->vm_end;
1277 }
1278
1279 pgd = pgd_offset(vma->vm_mm, addr);
1280 do {
1281 next = pgd_addr_end(addr, end);
1282 if (pgd_none_or_clear_bad(pgd))
1283 continue;
1284 ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
1285 if (ret)
1286 return ret;
1287 } while (pgd++, addr = next, addr != end);
1288 return 0;
1289}
1290
1291static int unuse_mm(struct mm_struct *mm,
1292 swp_entry_t entry, struct page *page)
1293{
1294 struct vm_area_struct *vma;
1295 int ret = 0;
1296
1297 if (!down_read_trylock(&mm->mmap_sem)) {
1298
1299
1300
1301
1302 activate_page(page);
1303 unlock_page(page);
1304 down_read(&mm->mmap_sem);
1305 lock_page(page);
1306 }
1307 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1308 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1309 break;
1310 }
1311 up_read(&mm->mmap_sem);
1312 return (ret < 0)? ret: 0;
1313}
1314
1315
1316
1317
1318
1319
1320static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1321 unsigned int prev, bool frontswap)
1322{
1323 unsigned int max = si->max;
1324 unsigned int i = prev;
1325 unsigned char count;
1326
1327
1328
1329
1330
1331
1332
1333 for (;;) {
1334 if (++i >= max) {
1335 if (!prev) {
1336 i = 0;
1337 break;
1338 }
1339
1340
1341
1342
1343 max = prev + 1;
1344 prev = 0;
1345 i = 1;
1346 }
1347 if (frontswap) {
1348 if (frontswap_test(si, i))
1349 break;
1350 else
1351 continue;
1352 }
1353 count = READ_ONCE(si->swap_map[i]);
1354 if (count && swap_count(count) != SWAP_MAP_BAD)
1355 break;
1356 }
1357 return i;
1358}
1359
1360
1361
1362
1363
1364
1365
1366
1367
/*
 * Bring the contents of swap device @type back into memory and remove the
 * references to its slots.
 *
 * For each slot reported by find_next_to_unuse() the page is read into the
 * swap cache, references to the slot in process address spaces (and shmem)
 * are replaced by the page, and the page is then removed from the swap
 * cache.
 *
 * @frontswap:      passed to find_next_to_unuse() to select frontswap
 *                  slots only.
 * @pages_to_unuse: only honoured when @frontswap is set; if non-zero, stop
 *                  after that many pages have been processed.
 *
 * Returns 0 on success, -EINTR if a signal is pending, -ENOMEM if a page
 * could not be read while its slot is still in use, or an error returned
 * by shmem_unuse()/unuse_mm().
 */
int try_to_unuse(unsigned int type, bool frontswap,
		 unsigned long pages_to_unuse)
{
	struct swap_info_struct *si = swap_info[type];
	struct mm_struct *start_mm;
	/*
	 * Points at the map byte of the slot being processed.
	 * NOTE(review): volatile presumably so every *swap_map re-reads a
	 * count that other contexts may change concurrently — confirm.
	 */
	volatile unsigned char *swap_map;
	unsigned char swcount;
	struct page *page;
	swp_entry_t entry;
	unsigned int i = 0;
	int retval = 0;

	/*
	 * start_mm is the mm tried first for each slot.  It starts out as
	 * init_mm, which is never itself scanned (see the
	 * "start_mm != &init_mm" test below), and is replaced by an mm in
	 * which references were actually found.  A reference on
	 * start_mm->mm_users is held throughout.
	 */
	start_mm = &init_mm;
	atomic_inc(&init_mm.mm_users);

	/*
	 * Keep asking for the next slot still in use; i == 0 means
	 * find_next_to_unuse() found nothing more.
	 */
	while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}

		/*
		 * Get the page for this slot into the swap cache (reading it
		 * from swap if necessary).
		 */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry,
					GFP_HIGHUSER_MOVABLE, NULL, 0);
		if (!page) {
			/*
			 * No page.  If the slot has meanwhile become free or
			 * bad there is nothing to do for it; otherwise treat
			 * the failure as out-of-memory.
			 */
			swcount = *swap_map;
			if (!swcount || swcount == SWAP_MAP_BAD)
				continue;
			retval = -ENOMEM;
			break;
		}

		/*
		 * We hold the only reference to start_mm: its owner is gone,
		 * so drop it and fall back to init_mm.
		 */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			atomic_inc(&init_mm.mm_users);
		}

		/*
		 * Wait for any in-flight read and writeback to finish, then
		 * take the page lock and wait for writeback once more before
		 * the page is used.
		 */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/*
		 * Remove the references to the slot: shmem first, then
		 * start_mm, then every other mm on the mmlist.
		 */
		swcount = *swap_map;
		if (swap_count(swcount) == SWAP_MAP_SHMEM) {
			retval = shmem_unuse(entry, page);
			/*
			 * NOTE(review): the page is not unlocked/put on this
			 * path, so shmem_unuse() presumably does both —
			 * confirm.
			 */
			if (retval < 0)
				break;
			continue;
		}
		if (swap_count(swcount) && start_mm != &init_mm)
			retval = unuse_mm(start_mm, entry, page);

		if (swap_count(*swap_map)) {
			/*
			 * References remain: walk the other mms.
			 * set_start_mm is set when start_mm did not reduce
			 * the count, meaning a better starting mm is wanted.
			 */
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *prev_mm = start_mm;
			struct mm_struct *mm;

			/* One reference each for new_start_mm and prev_mm. */
			atomic_inc(&new_start_mm->mm_users);
			atomic_inc(&prev_mm->mm_users);
			spin_lock(&mmlist_lock);
			while (swap_count(*swap_map) && !retval &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				/* Skip mms whose user count already hit zero. */
				if (!atomic_inc_not_zero(&mm->mm_users))
					continue;
				spin_unlock(&mmlist_lock);
				/*
				 * prev_mm was pinned to keep our position in
				 * the list while unlocked; now mm takes over.
				 */
				mmput(prev_mm);
				prev_mm = mm;

				cond_resched();

				swcount = *swap_map;
				if (!swap_count(swcount)) /* no users left */
					;
				else if (mm == &init_mm)
					set_start_mm = 1;
				else
					retval = unuse_mm(mm, entry, page);

				/* This mm reduced the count: start with it next time. */
				if (set_start_mm && *swap_map < swcount) {
					mmput(new_start_mm);
					atomic_inc(&mm->mm_users);
					new_start_mm = mm;
					set_start_mm = 0;
				}
				spin_lock(&mmlist_lock);
			}
			spin_unlock(&mmlist_lock);
			mmput(prev_mm);
			mmput(start_mm);
			start_mm = new_start_mm;
		}
		if (retval) {
			unlock_page(page);
			put_page(page);
			break;
		}

		/*
		 * The slot still has users even after all mms were scanned
		 * and the page is dirty in the swap cache: write it out to
		 * swap (WB_SYNC_NONE), then retake the page lock and wait
		 * for that writeback.
		 * NOTE(review): swap_writepage() is assumed to unlock the
		 * page, hence the lock_page() that follows — confirm.
		 */
		if (swap_count(*swap_map) &&
		     PageDirty(page) && PageSwapCache(page)) {
			struct writeback_control wbc = {
				.sync_mode = WB_SYNC_NONE,
			};

			swap_writepage(page, &wbc);
			lock_page(page);
			wait_on_page_writeback(page);
		}

		/*
		 * Remove the page from the swap cache, but only if it is
		 * still the swap-cache page for this very entry.
		 */
		if (PageSwapCache(page) &&
		    likely(page_private(page) == entry.val))
			delete_from_swap_cache(page);

		/*
		 * The page no longer has a copy on swap, so mark it dirty
		 * before letting go of it.
		 */
		SetPageDirty(page);
		unlock_page(page);
		put_page(page);

		/*
		 * Give others a chance to run, and honour the frontswap page
		 * budget if one was given.
		 */
		cond_resched();
		if (frontswap && pages_to_unuse > 0) {
			if (!--pages_to_unuse)
				break;
		}
	}

	mmput(start_mm);
	return retval;
}
1590
1591
1592
1593
1594
1595
1596
1597static void drain_mmlist(void)
1598{
1599 struct list_head *p, *next;
1600 unsigned int type;
1601
1602 for (type = 0; type < nr_swapfiles; type++)
1603 if (swap_info[type]->inuse_pages)
1604 return;
1605 spin_lock(&mmlist_lock);
1606 list_for_each_safe(p, next, &init_mm.mmlist)
1607 list_del_init(p);
1608 spin_unlock(&mmlist_lock);
1609}
1610
1611
1612
1613
1614
1615
1616
1617static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1618{
1619 struct swap_info_struct *sis;
1620 struct swap_extent *start_se;
1621 struct swap_extent *se;
1622 pgoff_t offset;
1623
1624 sis = swap_info[swp_type(entry)];
1625 *bdev = sis->bdev;
1626
1627 offset = swp_offset(entry);
1628 start_se = sis->curr_swap_extent;
1629 se = start_se;
1630
1631 for ( ; ; ) {
1632 if (se->start_page <= offset &&
1633 offset < (se->start_page + se->nr_pages)) {
1634 return se->start_block + (offset - se->start_page);
1635 }
1636 se = list_next_entry(se, list);
1637 sis->curr_swap_extent = se;
1638 BUG_ON(se == start_se);
1639 }
1640}
1641
1642
1643
1644
1645sector_t map_swap_page(struct page *page, struct block_device **bdev)
1646{
1647 swp_entry_t entry;
1648 entry.val = page_private(page);
1649 return map_swap_entry(entry, bdev);
1650}
1651
1652
1653
1654
1655static void destroy_swap_extents(struct swap_info_struct *sis)
1656{
1657 while (!list_empty(&sis->first_swap_extent.list)) {
1658 struct swap_extent *se;
1659
1660 se = list_first_entry(&sis->first_swap_extent.list,
1661 struct swap_extent, list);
1662 list_del(&se->list);
1663 kfree(se);
1664 }
1665
1666 if (sis->flags & SWP_FILE) {
1667 struct file *swap_file = sis->swap_file;
1668 struct address_space *mapping = swap_file->f_mapping;
1669
1670 sis->flags &= ~SWP_FILE;
1671 mapping->a_ops->swap_deactivate(swap_file);
1672 }
1673}
1674
1675
1676
1677
1678
1679
1680
1681int
1682add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1683 unsigned long nr_pages, sector_t start_block)
1684{
1685 struct swap_extent *se;
1686 struct swap_extent *new_se;
1687 struct list_head *lh;
1688
1689 if (start_page == 0) {
1690 se = &sis->first_swap_extent;
1691 sis->curr_swap_extent = se;
1692 se->start_page = 0;
1693 se->nr_pages = nr_pages;
1694 se->start_block = start_block;
1695 return 1;
1696 } else {
1697 lh = sis->first_swap_extent.list.prev;
1698 se = list_entry(lh, struct swap_extent, list);
1699 BUG_ON(se->start_page + se->nr_pages != start_page);
1700 if (se->start_block + se->nr_pages == start_block) {
1701
1702 se->nr_pages += nr_pages;
1703 return 0;
1704 }
1705 }
1706
1707
1708
1709
1710 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
1711 if (new_se == NULL)
1712 return -ENOMEM;
1713 new_se->start_page = start_page;
1714 new_se->nr_pages = nr_pages;
1715 new_se->start_block = start_block;
1716
1717 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1718 return 1;
1719}
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1753{
1754 struct file *swap_file = sis->swap_file;
1755 struct address_space *mapping = swap_file->f_mapping;
1756 struct inode *inode = mapping->host;
1757 int ret;
1758
1759 if (S_ISBLK(inode->i_mode)) {
1760 ret = add_swap_extent(sis, 0, sis->max, 0);
1761 *span = sis->pages;
1762 return ret;
1763 }
1764
1765 if (mapping->a_ops->swap_activate) {
1766 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
1767 if (!ret) {
1768 sis->flags |= SWP_FILE;
1769 ret = add_swap_extent(sis, 0, sis->max, 0);
1770 *span = sis->pages;
1771 }
1772 return ret;
1773 }
1774
1775 return generic_swapfile_activate(sis, swap_file, span);
1776}
1777
1778static void _enable_swap_info(struct swap_info_struct *p, int prio,
1779 unsigned char *swap_map,
1780 struct swap_cluster_info *cluster_info)
1781{
1782 if (prio >= 0)
1783 p->prio = prio;
1784 else
1785 p->prio = --least_priority;
1786
1787
1788
1789
1790 p->list.prio = -p->prio;
1791 p->avail_list.prio = -p->prio;
1792 p->swap_map = swap_map;
1793 p->cluster_info = cluster_info;
1794 p->flags |= SWP_WRITEOK;
1795 atomic_long_add(p->pages, &nr_swap_pages);
1796 total_swap_pages += p->pages;
1797
1798 assert_spin_locked(&swap_lock);
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809 plist_add(&p->list, &swap_active_head);
1810 spin_lock(&swap_avail_lock);
1811 plist_add(&p->avail_list, &swap_avail_head);
1812 spin_unlock(&swap_avail_lock);
1813}
1814
1815static void enable_swap_info(struct swap_info_struct *p, int prio,
1816 unsigned char *swap_map,
1817 struct swap_cluster_info *cluster_info,
1818 unsigned long *frontswap_map)
1819{
1820 frontswap_init(p->type, frontswap_map);
1821 spin_lock(&swap_lock);
1822 spin_lock(&p->lock);
1823 _enable_swap_info(p, prio, swap_map, cluster_info);
1824 spin_unlock(&p->lock);
1825 spin_unlock(&swap_lock);
1826}
1827
1828static void reinsert_swap_info(struct swap_info_struct *p)
1829{
1830 spin_lock(&swap_lock);
1831 spin_lock(&p->lock);
1832 _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
1833 spin_unlock(&p->lock);
1834 spin_unlock(&swap_lock);
1835}
1836
1837SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1838{
1839 struct swap_info_struct *p = NULL;
1840 unsigned char *swap_map;
1841 struct swap_cluster_info *cluster_info;
1842 unsigned long *frontswap_map;
1843 struct file *swap_file, *victim;
1844 struct address_space *mapping;
1845 struct inode *inode;
1846 struct filename *pathname;
1847 int err, found = 0;
1848 unsigned int old_block_size;
1849
1850 if (!capable(CAP_SYS_ADMIN))
1851 return -EPERM;
1852
1853 BUG_ON(!current->mm);
1854
1855 pathname = getname(specialfile);
1856 if (IS_ERR(pathname))
1857 return PTR_ERR(pathname);
1858
1859 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
1860 err = PTR_ERR(victim);
1861 if (IS_ERR(victim))
1862 goto out;
1863
1864 mapping = victim->f_mapping;
1865 spin_lock(&swap_lock);
1866 plist_for_each_entry(p, &swap_active_head, list) {
1867 if (p->flags & SWP_WRITEOK) {
1868 if (p->swap_file->f_mapping == mapping) {
1869 found = 1;
1870 break;
1871 }
1872 }
1873 }
1874 if (!found) {
1875 err = -EINVAL;
1876 spin_unlock(&swap_lock);
1877 goto out_dput;
1878 }
1879 if (!security_vm_enough_memory_mm(current->mm, p->pages))
1880 vm_unacct_memory(p->pages);
1881 else {
1882 err = -ENOMEM;
1883 spin_unlock(&swap_lock);
1884 goto out_dput;
1885 }
1886 spin_lock(&swap_avail_lock);
1887 plist_del(&p->avail_list, &swap_avail_head);
1888 spin_unlock(&swap_avail_lock);
1889 spin_lock(&p->lock);
1890 if (p->prio < 0) {
1891 struct swap_info_struct *si = p;
1892
1893 plist_for_each_entry_continue(si, &swap_active_head, list) {
1894 si->prio++;
1895 si->list.prio--;
1896 si->avail_list.prio--;
1897 }
1898 least_priority++;
1899 }
1900 plist_del(&p->list, &swap_active_head);
1901 atomic_long_sub(p->pages, &nr_swap_pages);
1902 total_swap_pages -= p->pages;
1903 p->flags &= ~SWP_WRITEOK;
1904 spin_unlock(&p->lock);
1905 spin_unlock(&swap_lock);
1906
1907 set_current_oom_origin();
1908 err = try_to_unuse(p->type, false, 0);
1909 clear_current_oom_origin();
1910
1911 if (err) {
1912
1913 reinsert_swap_info(p);
1914 goto out_dput;
1915 }
1916
1917 flush_work(&p->discard_work);
1918
1919 destroy_swap_extents(p);
1920 if (p->flags & SWP_CONTINUED)
1921 free_swap_count_continuations(p);
1922
1923 mutex_lock(&swapon_mutex);
1924 spin_lock(&swap_lock);
1925 spin_lock(&p->lock);
1926 drain_mmlist();
1927
1928
1929 p->highest_bit = 0;
1930 while (p->flags >= SWP_SCANNING) {
1931 spin_unlock(&p->lock);
1932 spin_unlock(&swap_lock);
1933 schedule_timeout_uninterruptible(1);
1934 spin_lock(&swap_lock);
1935 spin_lock(&p->lock);
1936 }
1937
1938 swap_file = p->swap_file;
1939 old_block_size = p->old_block_size;
1940 p->swap_file = NULL;
1941 p->max = 0;
1942 swap_map = p->swap_map;
1943 p->swap_map = NULL;
1944 cluster_info = p->cluster_info;
1945 p->cluster_info = NULL;
1946 frontswap_map = frontswap_map_get(p);
1947 spin_unlock(&p->lock);
1948 spin_unlock(&swap_lock);
1949 frontswap_invalidate_area(p->type);
1950 frontswap_map_set(p, NULL);
1951 mutex_unlock(&swapon_mutex);
1952 free_percpu(p->percpu_cluster);
1953 p->percpu_cluster = NULL;
1954 vfree(swap_map);
1955 vfree(cluster_info);
1956 vfree(frontswap_map);
1957
1958 swap_cgroup_swapoff(p->type);
1959
1960 inode = mapping->host;
1961 if (S_ISBLK(inode->i_mode)) {
1962 struct block_device *bdev = I_BDEV(inode);
1963 set_blocksize(bdev, old_block_size);
1964 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1965 } else {
1966 inode_lock(inode);
1967 inode->i_flags &= ~S_SWAPFILE;
1968 inode_unlock(inode);
1969 }
1970 filp_close(swap_file, NULL);
1971
1972
1973
1974
1975
1976
1977 spin_lock(&swap_lock);
1978 p->flags = 0;
1979 spin_unlock(&swap_lock);
1980
1981 err = 0;
1982 atomic_inc(&proc_poll_event);
1983 wake_up_interruptible(&proc_poll_wait);
1984
1985out_dput:
1986 filp_close(victim, NULL);
1987out:
1988 putname(pathname);
1989 return err;
1990}
1991
1992#ifdef CONFIG_PROC_FS
1993static unsigned swaps_poll(struct file *file, poll_table *wait)
1994{
1995 struct seq_file *seq = file->private_data;
1996
1997 poll_wait(file, &proc_poll_wait, wait);
1998
1999 if (seq->poll_event != atomic_read(&proc_poll_event)) {
2000 seq->poll_event = atomic_read(&proc_poll_event);
2001 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
2002 }
2003
2004 return POLLIN | POLLRDNORM;
2005}
2006
2007
2008static void *swap_start(struct seq_file *swap, loff_t *pos)
2009{
2010 struct swap_info_struct *si;
2011 int type;
2012 loff_t l = *pos;
2013
2014 mutex_lock(&swapon_mutex);
2015
2016 if (!l)
2017 return SEQ_START_TOKEN;
2018
2019 for (type = 0; type < nr_swapfiles; type++) {
2020 smp_rmb();
2021 si = swap_info[type];
2022 if (!(si->flags & SWP_USED) || !si->swap_map)
2023 continue;
2024 if (!--l)
2025 return si;
2026 }
2027
2028 return NULL;
2029}
2030
2031static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
2032{
2033 struct swap_info_struct *si = v;
2034 int type;
2035
2036 if (v == SEQ_START_TOKEN)
2037 type = 0;
2038 else
2039 type = si->type + 1;
2040
2041 for (; type < nr_swapfiles; type++) {
2042 smp_rmb();
2043 si = swap_info[type];
2044 if (!(si->flags & SWP_USED) || !si->swap_map)
2045 continue;
2046 ++*pos;
2047 return si;
2048 }
2049
2050 return NULL;
2051}
2052
2053static void swap_stop(struct seq_file *swap, void *v)
2054{
2055 mutex_unlock(&swapon_mutex);
2056}
2057
2058static int swap_show(struct seq_file *swap, void *v)
2059{
2060 struct swap_info_struct *si = v;
2061 struct file *file;
2062 int len;
2063
2064 if (si == SEQ_START_TOKEN) {
2065 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
2066 return 0;
2067 }
2068
2069 file = si->swap_file;
2070 len = seq_file_path(swap, file, " \t\n\\");
2071 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
2072 len < 40 ? 40 - len : 1, " ",
2073 S_ISBLK(file_inode(file)->i_mode) ?
2074 "partition" : "file\t",
2075 si->pages << (PAGE_SHIFT - 10),
2076 si->inuse_pages << (PAGE_SHIFT - 10),
2077 si->prio);
2078 return 0;
2079}
2080
2081static const struct seq_operations swaps_op = {
2082 .start = swap_start,
2083 .next = swap_next,
2084 .stop = swap_stop,
2085 .show = swap_show
2086};
2087
2088static int swaps_open(struct inode *inode, struct file *file)
2089{
2090 struct seq_file *seq;
2091 int ret;
2092
2093 ret = seq_open(file, &swaps_op);
2094 if (ret)
2095 return ret;
2096
2097 seq = file->private_data;
2098 seq->poll_event = atomic_read(&proc_poll_event);
2099 return 0;
2100}
2101
2102static const struct file_operations proc_swaps_operations = {
2103 .open = swaps_open,
2104 .read = seq_read,
2105 .llseek = seq_lseek,
2106 .release = seq_release,
2107 .poll = swaps_poll,
2108};
2109
2110static int __init procswaps_init(void)
2111{
2112 proc_create("swaps", 0, NULL, &proc_swaps_operations);
2113 return 0;
2114}
2115__initcall(procswaps_init);
2116#endif
2117
2118#ifdef MAX_SWAPFILES_CHECK
2119static int __init max_swapfiles_check(void)
2120{
2121 MAX_SWAPFILES_CHECK();
2122 return 0;
2123}
2124late_initcall(max_swapfiles_check);
2125#endif
2126
2127static struct swap_info_struct *alloc_swap_info(void)
2128{
2129 struct swap_info_struct *p;
2130 unsigned int type;
2131
2132 p = kzalloc(sizeof(*p), GFP_KERNEL);
2133 if (!p)
2134 return ERR_PTR(-ENOMEM);
2135
2136 spin_lock(&swap_lock);
2137 for (type = 0; type < nr_swapfiles; type++) {
2138 if (!(swap_info[type]->flags & SWP_USED))
2139 break;
2140 }
2141 if (type >= MAX_SWAPFILES) {
2142 spin_unlock(&swap_lock);
2143 kfree(p);
2144 return ERR_PTR(-EPERM);
2145 }
2146 if (type >= nr_swapfiles) {
2147 p->type = type;
2148 swap_info[type] = p;
2149
2150
2151
2152
2153
2154 smp_wmb();
2155 nr_swapfiles++;
2156 } else {
2157 kfree(p);
2158 p = swap_info[type];
2159
2160
2161
2162
2163 }
2164 INIT_LIST_HEAD(&p->first_swap_extent.list);
2165 plist_node_init(&p->list, 0);
2166 plist_node_init(&p->avail_list, 0);
2167 p->flags = SWP_USED;
2168 spin_unlock(&swap_lock);
2169 spin_lock_init(&p->lock);
2170
2171 return p;
2172}
2173
2174static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
2175{
2176 int error;
2177
2178 if (S_ISBLK(inode->i_mode)) {
2179 p->bdev = bdgrab(I_BDEV(inode));
2180 error = blkdev_get(p->bdev,
2181 FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
2182 if (error < 0) {
2183 p->bdev = NULL;
2184 return error;
2185 }
2186 p->old_block_size = block_size(p->bdev);
2187 error = set_blocksize(p->bdev, PAGE_SIZE);
2188 if (error < 0)
2189 return error;
2190 p->flags |= SWP_BLKDEV;
2191 } else if (S_ISREG(inode->i_mode)) {
2192 p->bdev = inode->i_sb->s_bdev;
2193 inode_lock(inode);
2194 if (IS_SWAPFILE(inode))
2195 return -EBUSY;
2196 } else
2197 return -EINVAL;
2198
2199 return 0;
2200}
2201
2202static unsigned long read_swap_header(struct swap_info_struct *p,
2203 union swap_header *swap_header,
2204 struct inode *inode)
2205{
2206 int i;
2207 unsigned long maxpages;
2208 unsigned long swapfilepages;
2209 unsigned long last_page;
2210
2211 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
2212 pr_err("Unable to find swap-space signature\n");
2213 return 0;
2214 }
2215
2216
2217 if (swab32(swap_header->info.version) == 1) {
2218 swab32s(&swap_header->info.version);
2219 swab32s(&swap_header->info.last_page);
2220 swab32s(&swap_header->info.nr_badpages);
2221 for (i = 0; i < swap_header->info.nr_badpages; i++)
2222 swab32s(&swap_header->info.badpages[i]);
2223 }
2224
2225 if (swap_header->info.version != 1) {
2226 pr_warn("Unable to handle swap header version %d\n",
2227 swap_header->info.version);
2228 return 0;
2229 }
2230
2231 p->lowest_bit = 1;
2232 p->cluster_next = 1;
2233 p->cluster_nr = 0;
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249 maxpages = swp_offset(pte_to_swp_entry(
2250 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
2251 last_page = swap_header->info.last_page;
2252 if (last_page > maxpages) {
2253 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
2254 maxpages << (PAGE_SHIFT - 10),
2255 last_page << (PAGE_SHIFT - 10));
2256 }
2257 if (maxpages > last_page) {
2258 maxpages = last_page + 1;
2259
2260 if ((unsigned int)maxpages == 0)
2261 maxpages = UINT_MAX;
2262 }
2263 p->highest_bit = maxpages - 1;
2264
2265 if (!maxpages)
2266 return 0;
2267 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
2268 if (swapfilepages && maxpages > swapfilepages) {
2269 pr_warn("Swap area shorter than signature indicates\n");
2270 return 0;
2271 }
2272 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
2273 return 0;
2274 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2275 return 0;
2276
2277 return maxpages;
2278}
2279
2280static int setup_swap_map_and_extents(struct swap_info_struct *p,
2281 union swap_header *swap_header,
2282 unsigned char *swap_map,
2283 struct swap_cluster_info *cluster_info,
2284 unsigned long maxpages,
2285 sector_t *span)
2286{
2287 int i;
2288 unsigned int nr_good_pages;
2289 int nr_extents;
2290 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
2291 unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER;
2292
2293 nr_good_pages = maxpages - 1;
2294
2295 cluster_set_null(&p->free_cluster_head);
2296 cluster_set_null(&p->free_cluster_tail);
2297 cluster_set_null(&p->discard_cluster_head);
2298 cluster_set_null(&p->discard_cluster_tail);
2299
2300 for (i = 0; i < swap_header->info.nr_badpages; i++) {
2301 unsigned int page_nr = swap_header->info.badpages[i];
2302 if (page_nr == 0 || page_nr > swap_header->info.last_page)
2303 return -EINVAL;
2304 if (page_nr < maxpages) {
2305 swap_map[page_nr] = SWAP_MAP_BAD;
2306 nr_good_pages--;
2307
2308
2309
2310
2311 inc_cluster_info_page(p, cluster_info, page_nr);
2312 }
2313 }
2314
2315
2316 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
2317 inc_cluster_info_page(p, cluster_info, i);
2318
2319 if (nr_good_pages) {
2320 swap_map[0] = SWAP_MAP_BAD;
2321
2322
2323
2324
2325 inc_cluster_info_page(p, cluster_info, 0);
2326 p->max = maxpages;
2327 p->pages = nr_good_pages;
2328 nr_extents = setup_swap_extents(p, span);
2329 if (nr_extents < 0)
2330 return nr_extents;
2331 nr_good_pages = p->pages;
2332 }
2333 if (!nr_good_pages) {
2334 pr_warn("Empty swap-file\n");
2335 return -EINVAL;
2336 }
2337
2338 if (!cluster_info)
2339 return nr_extents;
2340
2341 for (i = 0; i < nr_clusters; i++) {
2342 if (!cluster_count(&cluster_info[idx])) {
2343 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
2344 if (cluster_is_null(&p->free_cluster_head)) {
2345 cluster_set_next_flag(&p->free_cluster_head,
2346 idx, 0);
2347 cluster_set_next_flag(&p->free_cluster_tail,
2348 idx, 0);
2349 } else {
2350 unsigned int tail;
2351
2352 tail = cluster_next(&p->free_cluster_tail);
2353 cluster_set_next(&cluster_info[tail], idx);
2354 cluster_set_next_flag(&p->free_cluster_tail,
2355 idx, 0);
2356 }
2357 }
2358 idx++;
2359 if (idx == nr_clusters)
2360 idx = 0;
2361 }
2362 return nr_extents;
2363}
2364
2365
2366
2367
2368
2369static bool swap_discardable(struct swap_info_struct *si)
2370{
2371 struct request_queue *q = bdev_get_queue(si->bdev);
2372
2373 if (!q || !blk_queue_discard(q))
2374 return false;
2375
2376 return true;
2377}
2378
2379SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2380{
2381 struct swap_info_struct *p;
2382 struct filename *name;
2383 struct file *swap_file = NULL;
2384 struct address_space *mapping;
2385 int prio;
2386 int error;
2387 union swap_header *swap_header;
2388 int nr_extents;
2389 sector_t span;
2390 unsigned long maxpages;
2391 unsigned char *swap_map = NULL;
2392 struct swap_cluster_info *cluster_info = NULL;
2393 unsigned long *frontswap_map = NULL;
2394 struct page *page = NULL;
2395 struct inode *inode = NULL;
2396
2397 if (swap_flags & ~SWAP_FLAGS_VALID)
2398 return -EINVAL;
2399
2400 if (!capable(CAP_SYS_ADMIN))
2401 return -EPERM;
2402
2403 p = alloc_swap_info();
2404 if (IS_ERR(p))
2405 return PTR_ERR(p);
2406
2407 INIT_WORK(&p->discard_work, swap_discard_work);
2408
2409 name = getname(specialfile);
2410 if (IS_ERR(name)) {
2411 error = PTR_ERR(name);
2412 name = NULL;
2413 goto bad_swap;
2414 }
2415 swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
2416 if (IS_ERR(swap_file)) {
2417 error = PTR_ERR(swap_file);
2418 swap_file = NULL;
2419 goto bad_swap;
2420 }
2421
2422 p->swap_file = swap_file;
2423 mapping = swap_file->f_mapping;
2424 inode = mapping->host;
2425
2426
2427 error = claim_swapfile(p, inode);
2428 if (unlikely(error))
2429 goto bad_swap;
2430
2431
2432
2433
2434 if (!mapping->a_ops->readpage) {
2435 error = -EINVAL;
2436 goto bad_swap;
2437 }
2438 page = read_mapping_page(mapping, 0, swap_file);
2439 if (IS_ERR(page)) {
2440 error = PTR_ERR(page);
2441 goto bad_swap;
2442 }
2443 swap_header = kmap(page);
2444
2445 maxpages = read_swap_header(p, swap_header, inode);
2446 if (unlikely(!maxpages)) {
2447 error = -EINVAL;
2448 goto bad_swap;
2449 }
2450
2451
2452 swap_map = vzalloc(maxpages);
2453 if (!swap_map) {
2454 error = -ENOMEM;
2455 goto bad_swap;
2456 }
2457 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2458 int cpu;
2459
2460 p->flags |= SWP_SOLIDSTATE;
2461
2462
2463
2464
2465 p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
2466
2467 cluster_info = vzalloc(DIV_ROUND_UP(maxpages,
2468 SWAPFILE_CLUSTER) * sizeof(*cluster_info));
2469 if (!cluster_info) {
2470 error = -ENOMEM;
2471 goto bad_swap;
2472 }
2473 p->percpu_cluster = alloc_percpu(struct percpu_cluster);
2474 if (!p->percpu_cluster) {
2475 error = -ENOMEM;
2476 goto bad_swap;
2477 }
2478 for_each_possible_cpu(cpu) {
2479 struct percpu_cluster *cluster;
2480 cluster = per_cpu_ptr(p->percpu_cluster, cpu);
2481 cluster_set_null(&cluster->index);
2482 }
2483 }
2484
2485 error = swap_cgroup_swapon(p->type, maxpages);
2486 if (error)
2487 goto bad_swap;
2488
2489 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2490 cluster_info, maxpages, &span);
2491 if (unlikely(nr_extents < 0)) {
2492 error = nr_extents;
2493 goto bad_swap;
2494 }
2495
2496 if (frontswap_enabled)
2497 frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
2498
2499 if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
2500
2501
2502
2503
2504
2505
2506 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
2507 SWP_PAGE_DISCARD);
2508
2509
2510
2511
2512
2513
2514
2515 if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
2516 p->flags &= ~SWP_PAGE_DISCARD;
2517 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
2518 p->flags &= ~SWP_AREA_DISCARD;
2519
2520
2521 if (p->flags & SWP_AREA_DISCARD) {
2522 int err = discard_swap(p);
2523 if (unlikely(err))
2524 pr_err("swapon: discard_swap(%p): %d\n",
2525 p, err);
2526 }
2527 }
2528
2529 mutex_lock(&swapon_mutex);
2530 prio = -1;
2531 if (swap_flags & SWAP_FLAG_PREFER)
2532 prio =
2533 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2534 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
2535
2536 pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
2537 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
2538 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2539 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2540 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2541 (p->flags & SWP_AREA_DISCARD) ? "s" : "",
2542 (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
2543 (frontswap_map) ? "FS" : "");
2544
2545 mutex_unlock(&swapon_mutex);
2546 atomic_inc(&proc_poll_event);
2547 wake_up_interruptible(&proc_poll_wait);
2548
2549 if (S_ISREG(inode->i_mode))
2550 inode->i_flags |= S_SWAPFILE;
2551 error = 0;
2552 goto out;
2553bad_swap:
2554 free_percpu(p->percpu_cluster);
2555 p->percpu_cluster = NULL;
2556 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2557 set_blocksize(p->bdev, p->old_block_size);
2558 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2559 }
2560 destroy_swap_extents(p);
2561 swap_cgroup_swapoff(p->type);
2562 spin_lock(&swap_lock);
2563 p->swap_file = NULL;
2564 p->flags = 0;
2565 spin_unlock(&swap_lock);
2566 vfree(swap_map);
2567 vfree(cluster_info);
2568 if (swap_file) {
2569 if (inode && S_ISREG(inode->i_mode)) {
2570 inode_unlock(inode);
2571 inode = NULL;
2572 }
2573 filp_close(swap_file, NULL);
2574 }
2575out:
2576 if (page && !IS_ERR(page)) {
2577 kunmap(page);
2578 put_page(page);
2579 }
2580 if (name)
2581 putname(name);
2582 if (inode && S_ISREG(inode->i_mode))
2583 inode_unlock(inode);
2584 return error;
2585}
2586
2587void si_swapinfo(struct sysinfo *val)
2588{
2589 unsigned int type;
2590 unsigned long nr_to_be_unused = 0;
2591
2592 spin_lock(&swap_lock);
2593 for (type = 0; type < nr_swapfiles; type++) {
2594 struct swap_info_struct *si = swap_info[type];
2595
2596 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2597 nr_to_be_unused += si->inuse_pages;
2598 }
2599 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
2600 val->totalswap = total_swap_pages + nr_to_be_unused;
2601 spin_unlock(&swap_lock);
2602}
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2616{
2617 struct swap_info_struct *p;
2618 unsigned long offset, type;
2619 unsigned char count;
2620 unsigned char has_cache;
2621 int err = -EINVAL;
2622
2623 if (non_swap_entry(entry))
2624 goto out;
2625
2626 type = swp_type(entry);
2627 if (type >= nr_swapfiles)
2628 goto bad_file;
2629 p = swap_info[type];
2630 offset = swp_offset(entry);
2631
2632 spin_lock(&p->lock);
2633 if (unlikely(offset >= p->max))
2634 goto unlock_out;
2635
2636 count = p->swap_map[offset];
2637
2638
2639
2640
2641
2642 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
2643 err = -ENOENT;
2644 goto unlock_out;
2645 }
2646
2647 has_cache = count & SWAP_HAS_CACHE;
2648 count &= ~SWAP_HAS_CACHE;
2649 err = 0;
2650
2651 if (usage == SWAP_HAS_CACHE) {
2652
2653
2654 if (!has_cache && count)
2655 has_cache = SWAP_HAS_CACHE;
2656 else if (has_cache)
2657 err = -EEXIST;
2658 else
2659 err = -ENOENT;
2660
2661 } else if (count || has_cache) {
2662
2663 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2664 count += usage;
2665 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2666 err = -EINVAL;
2667 else if (swap_count_continued(p, offset, count))
2668 count = COUNT_CONTINUED;
2669 else
2670 err = -ENOMEM;
2671 } else
2672 err = -ENOENT;
2673
2674 p->swap_map[offset] = count | has_cache;
2675
2676unlock_out:
2677 spin_unlock(&p->lock);
2678out:
2679 return err;
2680
2681bad_file:
2682 pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
2683 goto out;
2684}
2685
2686
2687
2688
2689
2690void swap_shmem_alloc(swp_entry_t entry)
2691{
2692 __swap_duplicate(entry, SWAP_MAP_SHMEM);
2693}
2694
2695
2696
2697
2698
2699
2700
2701
2702int swap_duplicate(swp_entry_t entry)
2703{
2704 int err = 0;
2705
2706 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2707 err = add_swap_count_continuation(entry, GFP_ATOMIC);
2708 return err;
2709}
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719int swapcache_prepare(swp_entry_t entry)
2720{
2721 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2722}
2723
2724struct swap_info_struct *page_swap_info(struct page *page)
2725{
2726 swp_entry_t swap = { .val = page_private(page) };
2727 BUG_ON(!PageSwapCache(page));
2728 return swap_info[swp_type(swap)];
2729}
2730
2731
2732
2733
2734struct address_space *__page_file_mapping(struct page *page)
2735{
2736 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
2737 return page_swap_info(page)->swap_file->f_mapping;
2738}
2739EXPORT_SYMBOL_GPL(__page_file_mapping);
2740
2741pgoff_t __page_file_index(struct page *page)
2742{
2743 swp_entry_t swap = { .val = page_private(page) };
2744 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
2745 return swp_offset(swap);
2746}
2747EXPORT_SYMBOL_GPL(__page_file_index);
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2765{
2766 struct swap_info_struct *si;
2767 struct page *head;
2768 struct page *page;
2769 struct page *list_page;
2770 pgoff_t offset;
2771 unsigned char count;
2772
2773
2774
2775
2776
2777 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
2778
2779 si = swap_info_get(entry);
2780 if (!si) {
2781
2782
2783
2784
2785
2786 goto outer;
2787 }
2788
2789 offset = swp_offset(entry);
2790 count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
2791
2792 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
2793
2794
2795
2796
2797
2798 goto out;
2799 }
2800
2801 if (!page) {
2802 spin_unlock(&si->lock);
2803 return -ENOMEM;
2804 }
2805
2806
2807
2808
2809
2810
2811 head = vmalloc_to_page(si->swap_map + offset);
2812 offset &= ~PAGE_MASK;
2813
2814
2815
2816
2817
2818 if (!page_private(head)) {
2819 BUG_ON(count & COUNT_CONTINUED);
2820 INIT_LIST_HEAD(&head->lru);
2821 set_page_private(head, SWP_CONTINUED);
2822 si->flags |= SWP_CONTINUED;
2823 }
2824
2825 list_for_each_entry(list_page, &head->lru, lru) {
2826 unsigned char *map;
2827
2828
2829
2830
2831
2832 if (!(count & COUNT_CONTINUED))
2833 goto out;
2834
2835 map = kmap_atomic(list_page) + offset;
2836 count = *map;
2837 kunmap_atomic(map);
2838
2839
2840
2841
2842
2843 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
2844 goto out;
2845 }
2846
2847 list_add_tail(&page->lru, &head->lru);
2848 page = NULL;
2849out:
2850 spin_unlock(&si->lock);
2851outer:
2852 if (page)
2853 __free_page(page);
2854 return 0;
2855}
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865static bool swap_count_continued(struct swap_info_struct *si,
2866 pgoff_t offset, unsigned char count)
2867{
2868 struct page *head;
2869 struct page *page;
2870 unsigned char *map;
2871
2872 head = vmalloc_to_page(si->swap_map + offset);
2873 if (page_private(head) != SWP_CONTINUED) {
2874 BUG_ON(count & COUNT_CONTINUED);
2875 return false;
2876 }
2877
2878 offset &= ~PAGE_MASK;
2879 page = list_entry(head->lru.next, struct page, lru);
2880 map = kmap_atomic(page) + offset;
2881
2882 if (count == SWAP_MAP_MAX)
2883 goto init_map;
2884
2885 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) {
2886
2887
2888
2889 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2890 kunmap_atomic(map);
2891 page = list_entry(page->lru.next, struct page, lru);
2892 BUG_ON(page == head);
2893 map = kmap_atomic(page) + offset;
2894 }
2895 if (*map == SWAP_CONT_MAX) {
2896 kunmap_atomic(map);
2897 page = list_entry(page->lru.next, struct page, lru);
2898 if (page == head)
2899 return false;
2900 map = kmap_atomic(page) + offset;
2901init_map: *map = 0;
2902 }
2903 *map += 1;
2904 kunmap_atomic(map);
2905 page = list_entry(page->lru.prev, struct page, lru);
2906 while (page != head) {
2907 map = kmap_atomic(page) + offset;
2908 *map = COUNT_CONTINUED;
2909 kunmap_atomic(map);
2910 page = list_entry(page->lru.prev, struct page, lru);
2911 }
2912 return true;
2913
2914 } else {
2915
2916
2917
2918 BUG_ON(count != COUNT_CONTINUED);
2919 while (*map == COUNT_CONTINUED) {
2920 kunmap_atomic(map);
2921 page = list_entry(page->lru.next, struct page, lru);
2922 BUG_ON(page == head);
2923 map = kmap_atomic(page) + offset;
2924 }
2925 BUG_ON(*map == 0);
2926 *map -= 1;
2927 if (*map == 0)
2928 count = 0;
2929 kunmap_atomic(map);
2930 page = list_entry(page->lru.prev, struct page, lru);
2931 while (page != head) {
2932 map = kmap_atomic(page) + offset;
2933 *map = SWAP_CONT_MAX | count;
2934 count = COUNT_CONTINUED;
2935 kunmap_atomic(map);
2936 page = list_entry(page->lru.prev, struct page, lru);
2937 }
2938 return count == COUNT_CONTINUED;
2939 }
2940}
2941
2942
2943
2944
2945
2946static void free_swap_count_continuations(struct swap_info_struct *si)
2947{
2948 pgoff_t offset;
2949
2950 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2951 struct page *head;
2952 head = vmalloc_to_page(si->swap_map + offset);
2953 if (page_private(head)) {
2954 struct page *page, *next;
2955
2956 list_for_each_entry_safe(page, next, &head->lru, lru) {
2957 list_del(&page->lru);
2958 __free_page(page);
2959 }
2960 }
2961 }
2962}
2963