   1/*
   2 *  linux/mm/swapfile.c
   3 *
   4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5 *  Swap reorganised 29.12.95, Stephen Tweedie
   6 */
   7
   8#include <linux/mm.h>
   9#include <linux/sched/mm.h>
  10#include <linux/sched/task.h>
  11#include <linux/hugetlb.h>
  12#include <linux/mman.h>
  13#include <linux/slab.h>
  14#include <linux/kernel_stat.h>
  15#include <linux/swap.h>
  16#include <linux/vmalloc.h>
  17#include <linux/pagemap.h>
  18#include <linux/namei.h>
  19#include <linux/shmem_fs.h>
  20#include <linux/blkdev.h>
  21#include <linux/random.h>
  22#include <linux/writeback.h>
  23#include <linux/proc_fs.h>
  24#include <linux/seq_file.h>
  25#include <linux/init.h>
  26#include <linux/ksm.h>
  27#include <linux/rmap.h>
  28#include <linux/security.h>
  29#include <linux/backing-dev.h>
  30#include <linux/mutex.h>
  31#include <linux/capability.h>
  32#include <linux/syscalls.h>
  33#include <linux/memcontrol.h>
  34#include <linux/poll.h>
  35#include <linux/oom.h>
  36#include <linux/frontswap.h>
  37#include <linux/swapfile.h>
  38#include <linux/export.h>
  39#include <linux/swap_slots.h>
  40#include <linux/sort.h>
  41
  42#include <asm/pgtable.h>
  43#include <asm/tlbflush.h>
  44#include <linux/swapops.h>
  45#include <linux/swap_cgroup.h>
  46
  47static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
  48                                 unsigned char);
  49static void free_swap_count_continuations(struct swap_info_struct *);
  50static sector_t map_swap_entry(swp_entry_t, struct block_device**);
  51
  52DEFINE_SPINLOCK(swap_lock);
  53static unsigned int nr_swapfiles;
  54atomic_long_t nr_swap_pages;
  55/*
  56 * Some modules use swappable objects and may try to swap them out under
  57 * memory pressure (via the shrinker). Before doing so, they may wish to
  58 * check to see if any swap space is available.
  59 */
  60EXPORT_SYMBOL_GPL(nr_swap_pages);
  61/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
  62long total_swap_pages;
  63static int least_priority = -1;
  64
  65static const char Bad_file[] = "Bad swap file entry ";
  66static const char Unused_file[] = "Unused swap file entry ";
  67static const char Bad_offset[] = "Bad swap offset entry ";
  68static const char Unused_offset[] = "Unused swap offset entry ";
  69
  70/*
  71 * all active swap_info_structs
  72 * protected with swap_lock, and ordered by priority.
  73 */
  74PLIST_HEAD(swap_active_head);
  75
  76/*
  77 * all available (active, not full) swap_info_structs
  78 * protected with swap_avail_lock, ordered by priority.
  79 * This is used by get_swap_page() instead of swap_active_head
  80 * because swap_active_head includes all swap_info_structs,
  81 * but get_swap_page() doesn't need to look at full ones.
  82 * This uses its own lock instead of swap_lock because when a
  83 * swap_info_struct changes between not-full/full, it needs to
  84 * add/remove itself to/from this list, but the swap_info_struct->lock
  85 * is held and the locking order requires swap_lock to be taken
  86 * before any swap_info_struct->lock.
  87 */
  88static struct plist_head *swap_avail_heads;
  89static DEFINE_SPINLOCK(swap_avail_lock);
  90
  91struct swap_info_struct *swap_info[MAX_SWAPFILES];
  92
  93static DEFINE_MUTEX(swapon_mutex);
  94
  95static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
  96/* Activity counter to indicate that a swapon or swapoff has occurred */
  97static atomic_t proc_poll_event = ATOMIC_INIT(0);
  98
  99atomic_t nr_rotate_swap = ATOMIC_INIT(0);
 100
 101static inline unsigned char swap_count(unsigned char ent)
 102{
 103        return ent & ~SWAP_HAS_CACHE;   /* may include COUNT_CONTINUED flag */
 104}
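
/*
 * Each swap_map byte packs a reference count in its low bits together with
 * the SWAP_HAS_CACHE flag.  swap_count() strips the cache flag but keeps
 * COUNT_CONTINUED, which marks counts that have overflowed into the
 * continuation pages handled by swap_count_continued() below.
 */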
 105
 106/* returns 1 if swap entry is freed */
 107static int
 108__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
 109{
 110        swp_entry_t entry = swp_entry(si->type, offset);
 111        struct page *page;
 112        int ret = 0;
 113
 114        page = find_get_page(swap_address_space(entry), swp_offset(entry));
 115        if (!page)
 116                return 0;
 117        /*
  118         * This function is called from scan_swap_map(), which is in turn
  119         * called from vmscan.c while reclaiming pages, so a page lock may
  120         * already be held here; we have to use trylock to avoid deadlock.
  121         * This is a special case: in normal operation use try_to_free_swap()
  122         * with an explicit lock_page().
 123         */
 124        if (trylock_page(page)) {
 125                ret = try_to_free_swap(page);
 126                unlock_page(page);
 127        }
 128        put_page(page);
 129        return ret;
 130}
 131
 132/*
  133 * swapon tells the device that all the old swap contents can be discarded,
 134 * to allow the swap device to optimize its wear-levelling.
 135 */
 136static int discard_swap(struct swap_info_struct *si)
 137{
 138        struct swap_extent *se;
 139        sector_t start_block;
 140        sector_t nr_blocks;
 141        int err = 0;
 142
 143        /* Do not discard the swap header page! */
 144        se = &si->first_swap_extent;
 145        start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
 146        nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
 147        if (nr_blocks) {
 148                err = blkdev_issue_discard(si->bdev, start_block,
 149                                nr_blocks, GFP_KERNEL, 0);
 150                if (err)
 151                        return err;
 152                cond_resched();
 153        }
 154
 155        list_for_each_entry(se, &si->first_swap_extent.list, list) {
 156                start_block = se->start_block << (PAGE_SHIFT - 9);
 157                nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 158
 159                err = blkdev_issue_discard(si->bdev, start_block,
 160                                nr_blocks, GFP_KERNEL, 0);
 161                if (err)
 162                        break;
 163
 164                cond_resched();
 165        }
 166        return err;             /* That will often be -EOPNOTSUPP */
 167}
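
/*
 * Note on the shifts above: swap extents are kept in units of pages while
 * blkdev_issue_discard() takes 512-byte sectors, so (PAGE_SHIFT - 9)
 * converts between the two.  With 4K pages, for example, each page covers
 * 8 sectors, hence a shift by 3.
 */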
 168
 169/*
  170 * swap allocation tells the device that a cluster of swap can now be discarded,
 171 * to allow the swap device to optimize its wear-levelling.
 172 */
 173static void discard_swap_cluster(struct swap_info_struct *si,
 174                                 pgoff_t start_page, pgoff_t nr_pages)
 175{
 176        struct swap_extent *se = si->curr_swap_extent;
 177        int found_extent = 0;
 178
 179        while (nr_pages) {
 180                if (se->start_page <= start_page &&
 181                    start_page < se->start_page + se->nr_pages) {
 182                        pgoff_t offset = start_page - se->start_page;
 183                        sector_t start_block = se->start_block + offset;
 184                        sector_t nr_blocks = se->nr_pages - offset;
 185
 186                        if (nr_blocks > nr_pages)
 187                                nr_blocks = nr_pages;
 188                        start_page += nr_blocks;
 189                        nr_pages -= nr_blocks;
 190
 191                        if (!found_extent++)
 192                                si->curr_swap_extent = se;
 193
 194                        start_block <<= PAGE_SHIFT - 9;
 195                        nr_blocks <<= PAGE_SHIFT - 9;
 196                        if (blkdev_issue_discard(si->bdev, start_block,
 197                                    nr_blocks, GFP_NOIO, 0))
 198                                break;
 199                }
 200
 201                se = list_next_entry(se, list);
 202        }
 203}
 204
 205#ifdef CONFIG_THP_SWAP
 206#define SWAPFILE_CLUSTER        HPAGE_PMD_NR
 207
 208#define swap_entry_size(size)   (size)
 209#else
 210#define SWAPFILE_CLUSTER        256
 211
 212/*
  213 * Define swap_entry_size() as a constant to let the compiler optimize
  214 * out some code if !CONFIG_THP_SWAP
 215 */
 216#define swap_entry_size(size)   1
 217#endif
 218#define LATENCY_LIMIT           256
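
/*
 * With CONFIG_THP_SWAP a cluster spans HPAGE_PMD_NR slots so that a single
 * cluster can back one PMD-sized huge page (typically 512 slots with 4K
 * pages); otherwise a fixed 256 slots is used.  LATENCY_LIMIT bounds how
 * many slots the scanning loops below examine before offering to
 * reschedule via cond_resched().
 */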
 219
 220static inline void cluster_set_flag(struct swap_cluster_info *info,
 221        unsigned int flag)
 222{
 223        info->flags = flag;
 224}
 225
 226static inline unsigned int cluster_count(struct swap_cluster_info *info)
 227{
 228        return info->data;
 229}
 230
 231static inline void cluster_set_count(struct swap_cluster_info *info,
 232                                     unsigned int c)
 233{
 234        info->data = c;
 235}
 236
 237static inline void cluster_set_count_flag(struct swap_cluster_info *info,
 238                                         unsigned int c, unsigned int f)
 239{
 240        info->flags = f;
 241        info->data = c;
 242}
 243
 244static inline unsigned int cluster_next(struct swap_cluster_info *info)
 245{
 246        return info->data;
 247}
 248
 249static inline void cluster_set_next(struct swap_cluster_info *info,
 250                                    unsigned int n)
 251{
 252        info->data = n;
 253}
 254
 255static inline void cluster_set_next_flag(struct swap_cluster_info *info,
 256                                         unsigned int n, unsigned int f)
 257{
 258        info->flags = f;
 259        info->data = n;
 260}
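
/*
 * Note that cluster_count()/cluster_set_count() and cluster_next()/
 * cluster_set_next() all operate on the same 'data' field: for a cluster
 * in use it holds the number of allocated entries, while for a cluster
 * sitting on the free or discard list it holds the index of the next
 * cluster on that list.  The flags tell the two cases apart.
 */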
 261
 262static inline bool cluster_is_free(struct swap_cluster_info *info)
 263{
 264        return info->flags & CLUSTER_FLAG_FREE;
 265}
 266
 267static inline bool cluster_is_null(struct swap_cluster_info *info)
 268{
 269        return info->flags & CLUSTER_FLAG_NEXT_NULL;
 270}
 271
 272static inline void cluster_set_null(struct swap_cluster_info *info)
 273{
 274        info->flags = CLUSTER_FLAG_NEXT_NULL;
 275        info->data = 0;
 276}
 277
 278static inline bool cluster_is_huge(struct swap_cluster_info *info)
 279{
 280        if (IS_ENABLED(CONFIG_THP_SWAP))
 281                return info->flags & CLUSTER_FLAG_HUGE;
 282        return false;
 283}
 284
 285static inline void cluster_clear_huge(struct swap_cluster_info *info)
 286{
 287        info->flags &= ~CLUSTER_FLAG_HUGE;
 288}
 289
 290static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
 291                                                     unsigned long offset)
 292{
 293        struct swap_cluster_info *ci;
 294
 295        ci = si->cluster_info;
 296        if (ci) {
 297                ci += offset / SWAPFILE_CLUSTER;
 298                spin_lock(&ci->lock);
 299        }
 300        return ci;
 301}
 302
 303static inline void unlock_cluster(struct swap_cluster_info *ci)
 304{
 305        if (ci)
 306                spin_unlock(&ci->lock);
 307}
 308
 309/*
 310 * Determine the locking method in use for this device.  Return
 311 * swap_cluster_info if SSD-style cluster-based locking is in place.
 312 */
 313static inline struct swap_cluster_info *lock_cluster_or_swap_info(
 314                struct swap_info_struct *si, unsigned long offset)
 315{
 316        struct swap_cluster_info *ci;
 317
 318        /* Try to use fine-grained SSD-style locking if available: */
 319        ci = lock_cluster(si, offset);
 320        /* Otherwise, fall back to traditional, coarse locking: */
 321        if (!ci)
 322                spin_lock(&si->lock);
 323
 324        return ci;
 325}
 326
 327static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
 328                                               struct swap_cluster_info *ci)
 329{
 330        if (ci)
 331                unlock_cluster(ci);
 332        else
 333                spin_unlock(&si->lock);
 334}
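
/*
 * A minimal sketch of the intended calling pattern (it mirrors, for
 * example, swap_swapcount() further down):
 *
 *	ci = lock_cluster_or_swap_info(si, offset);
 *	count = swap_count(si->swap_map[offset]);
 *	unlock_cluster_or_swap_info(si, ci);
 *
 * The same pair works whether the device uses per-cluster locks (the SSD
 * case) or falls back to si->lock.
 */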
 335
 336static inline bool cluster_list_empty(struct swap_cluster_list *list)
 337{
 338        return cluster_is_null(&list->head);
 339}
 340
 341static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
 342{
 343        return cluster_next(&list->head);
 344}
 345
 346static void cluster_list_init(struct swap_cluster_list *list)
 347{
 348        cluster_set_null(&list->head);
 349        cluster_set_null(&list->tail);
 350}
 351
 352static void cluster_list_add_tail(struct swap_cluster_list *list,
 353                                  struct swap_cluster_info *ci,
 354                                  unsigned int idx)
 355{
 356        if (cluster_list_empty(list)) {
 357                cluster_set_next_flag(&list->head, idx, 0);
 358                cluster_set_next_flag(&list->tail, idx, 0);
 359        } else {
 360                struct swap_cluster_info *ci_tail;
 361                unsigned int tail = cluster_next(&list->tail);
 362
 363                /*
  364                 * Nested cluster lock, but both cluster locks are
  365                 * only acquired while we hold swap_info_struct->lock
 366                 */
 367                ci_tail = ci + tail;
 368                spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
 369                cluster_set_next(ci_tail, idx);
 370                spin_unlock(&ci_tail->lock);
 371                cluster_set_next_flag(&list->tail, idx, 0);
 372        }
 373}
 374
 375static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
 376                                           struct swap_cluster_info *ci)
 377{
 378        unsigned int idx;
 379
 380        idx = cluster_next(&list->head);
 381        if (cluster_next(&list->tail) == idx) {
 382                cluster_set_null(&list->head);
 383                cluster_set_null(&list->tail);
 384        } else
 385                cluster_set_next_flag(&list->head,
 386                                      cluster_next(&ci[idx]), 0);
 387
 388        return idx;
 389}
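
/*
 * The free and discard cluster lists are singly linked lists threaded
 * through the swap_cluster_info array itself: list->head and list->tail
 * hold cluster indexes, and each member cluster stores the index of its
 * successor in its own 'data' field via cluster_set_next().
 */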
 390
 391/* Add a cluster to discard list and schedule it to do discard */
 392static void swap_cluster_schedule_discard(struct swap_info_struct *si,
 393                unsigned int idx)
 394{
 395        /*
 396         * If scan_swap_map() can't find a free cluster, it will check
  397         * si->swap_map directly. To make sure the cluster being discarded
  398         * isn't taken by scan_swap_map(), mark its swap entries bad
  399         * (occupied). They will be cleared after the discard completes.
 400         */
 401        memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 402                        SWAP_MAP_BAD, SWAPFILE_CLUSTER);
 403
 404        cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
 405
 406        schedule_work(&si->discard_work);
 407}
 408
 409static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
 410{
 411        struct swap_cluster_info *ci = si->cluster_info;
 412
 413        cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
 414        cluster_list_add_tail(&si->free_clusters, ci, idx);
 415}
 416
 417/*
  418 * Do the actual discard work. After a cluster discard is finished, the
  419 * cluster will be added to the free cluster list. The caller must hold si->lock.
  420 */
 421static void swap_do_scheduled_discard(struct swap_info_struct *si)
 422{
 423        struct swap_cluster_info *info, *ci;
 424        unsigned int idx;
 425
 426        info = si->cluster_info;
 427
 428        while (!cluster_list_empty(&si->discard_clusters)) {
 429                idx = cluster_list_del_first(&si->discard_clusters, info);
 430                spin_unlock(&si->lock);
 431
 432                discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
 433                                SWAPFILE_CLUSTER);
 434
 435                spin_lock(&si->lock);
 436                ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
 437                __free_cluster(si, idx);
 438                memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 439                                0, SWAPFILE_CLUSTER);
 440                unlock_cluster(ci);
 441        }
 442}
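
/*
 * Note that swap_do_scheduled_discard() drops si->lock around
 * discard_swap_cluster(), since issuing the discard can block on I/O; the
 * SWAP_MAP_BAD markers set by swap_cluster_schedule_discard() keep the
 * cluster from being handed out while the lock is not held.
 */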
 443
 444static void swap_discard_work(struct work_struct *work)
 445{
 446        struct swap_info_struct *si;
 447
 448        si = container_of(work, struct swap_info_struct, discard_work);
 449
 450        spin_lock(&si->lock);
 451        swap_do_scheduled_discard(si);
 452        spin_unlock(&si->lock);
 453}
 454
 455static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
 456{
 457        struct swap_cluster_info *ci = si->cluster_info;
 458
 459        VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
 460        cluster_list_del_first(&si->free_clusters, ci);
 461        cluster_set_count_flag(ci + idx, 0, 0);
 462}
 463
 464static void free_cluster(struct swap_info_struct *si, unsigned long idx)
 465{
 466        struct swap_cluster_info *ci = si->cluster_info + idx;
 467
 468        VM_BUG_ON(cluster_count(ci) != 0);
 469        /*
  470         * If discarding freed clusters is enabled (SWP_PAGE_DISCARD),
  471         * schedule a discard of the cluster instead of freeing it
  472         * immediately. The cluster will be freed after the discard.
 473         */
 474        if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
 475            (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
 476                swap_cluster_schedule_discard(si, idx);
 477                return;
 478        }
 479
 480        __free_cluster(si, idx);
 481}
 482
 483/*
 484 * The cluster corresponding to page_nr will be used. The cluster will be
 485 * removed from free cluster list and its usage counter will be increased.
 486 */
 487static void inc_cluster_info_page(struct swap_info_struct *p,
 488        struct swap_cluster_info *cluster_info, unsigned long page_nr)
 489{
 490        unsigned long idx = page_nr / SWAPFILE_CLUSTER;
 491
 492        if (!cluster_info)
 493                return;
 494        if (cluster_is_free(&cluster_info[idx]))
 495                alloc_cluster(p, idx);
 496
 497        VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
 498        cluster_set_count(&cluster_info[idx],
 499                cluster_count(&cluster_info[idx]) + 1);
 500}
 501
 502/*
  503 * The usage counter of the cluster corresponding to page_nr is decremented.
  504 * If the counter reaches 0, meaning no page in the cluster is in use, we
  505 * can optionally discard the cluster and add it to the free cluster list.
 506 */
 507static void dec_cluster_info_page(struct swap_info_struct *p,
 508        struct swap_cluster_info *cluster_info, unsigned long page_nr)
 509{
 510        unsigned long idx = page_nr / SWAPFILE_CLUSTER;
 511
 512        if (!cluster_info)
 513                return;
 514
 515        VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
 516        cluster_set_count(&cluster_info[idx],
 517                cluster_count(&cluster_info[idx]) - 1);
 518
 519        if (cluster_count(&cluster_info[idx]) == 0)
 520                free_cluster(p, idx);
 521}
 522
 523/*
  524 * It's possible for scan_swap_map() to use a free cluster from the middle of
  525 * the free cluster list. Avoid such abuse to prevent list corruption.
 526 */
 527static bool
 528scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
 529        unsigned long offset)
 530{
 531        struct percpu_cluster *percpu_cluster;
 532        bool conflict;
 533
 534        offset /= SWAPFILE_CLUSTER;
 535        conflict = !cluster_list_empty(&si->free_clusters) &&
 536                offset != cluster_list_first(&si->free_clusters) &&
 537                cluster_is_free(&si->cluster_info[offset]);
 538
 539        if (!conflict)
 540                return false;
 541
 542        percpu_cluster = this_cpu_ptr(si->percpu_cluster);
 543        cluster_set_null(&percpu_cluster->index);
 544        return true;
 545}
 546
 547/*
  548 * Try to get a swap entry from the current cpu's swap entry pool (a cluster).
  549 * This might involve allocating a new cluster for the current CPU too.
 550 */
 551static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 552        unsigned long *offset, unsigned long *scan_base)
 553{
 554        struct percpu_cluster *cluster;
 555        struct swap_cluster_info *ci;
 556        bool found_free;
 557        unsigned long tmp, max;
 558
 559new_cluster:
 560        cluster = this_cpu_ptr(si->percpu_cluster);
 561        if (cluster_is_null(&cluster->index)) {
 562                if (!cluster_list_empty(&si->free_clusters)) {
 563                        cluster->index = si->free_clusters.head;
 564                        cluster->next = cluster_next(&cluster->index) *
 565                                        SWAPFILE_CLUSTER;
 566                } else if (!cluster_list_empty(&si->discard_clusters)) {
 567                        /*
  568                         * we don't have a free cluster but have some clusters
  569                         * being discarded; do the discard now and reclaim them
 570                         */
 571                        swap_do_scheduled_discard(si);
 572                        *scan_base = *offset = si->cluster_next;
 573                        goto new_cluster;
 574                } else
 575                        return false;
 576        }
 577
 578        found_free = false;
 579
 580        /*
  581         * Other CPUs can use our cluster if they can't find a free cluster;
  582         * check whether there is still a free entry in the cluster
 583         */
 584        tmp = cluster->next;
 585        max = min_t(unsigned long, si->max,
 586                    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
 587        if (tmp >= max) {
 588                cluster_set_null(&cluster->index);
 589                goto new_cluster;
 590        }
 591        ci = lock_cluster(si, tmp);
 592        while (tmp < max) {
 593                if (!si->swap_map[tmp]) {
 594                        found_free = true;
 595                        break;
 596                }
 597                tmp++;
 598        }
 599        unlock_cluster(ci);
 600        if (!found_free) {
 601                cluster_set_null(&cluster->index);
 602                goto new_cluster;
 603        }
 604        cluster->next = tmp + 1;
 605        *offset = tmp;
 606        *scan_base = tmp;
 607        return found_free;
 608}
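
/*
 * Sketch of the per-cpu state used above: percpu_cluster->index names the
 * cluster this CPU is currently allocating from (or is null when it has
 * none) and percpu_cluster->next is the next swap offset to try inside
 * that cluster.  Other CPUs may take entries from the same cluster, which
 * is why the loop re-checks si->swap_map under the cluster lock.
 */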
 609
 610static void __del_from_avail_list(struct swap_info_struct *p)
 611{
 612        int nid;
 613
 614        for_each_node(nid)
 615                plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
 616}
 617
 618static void del_from_avail_list(struct swap_info_struct *p)
 619{
 620        spin_lock(&swap_avail_lock);
 621        __del_from_avail_list(p);
 622        spin_unlock(&swap_avail_lock);
 623}
 624
 625static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
 626                             unsigned int nr_entries)
 627{
 628        unsigned int end = offset + nr_entries - 1;
 629
 630        if (offset == si->lowest_bit)
 631                si->lowest_bit += nr_entries;
 632        if (end == si->highest_bit)
 633                si->highest_bit -= nr_entries;
 634        si->inuse_pages += nr_entries;
 635        if (si->inuse_pages == si->pages) {
 636                si->lowest_bit = si->max;
 637                si->highest_bit = 0;
 638                del_from_avail_list(si);
 639        }
 640}
 641
 642static void add_to_avail_list(struct swap_info_struct *p)
 643{
 644        int nid;
 645
 646        spin_lock(&swap_avail_lock);
 647        for_each_node(nid) {
 648                WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
 649                plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
 650        }
 651        spin_unlock(&swap_avail_lock);
 652}
 653
 654static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 655                            unsigned int nr_entries)
 656{
 657        unsigned long end = offset + nr_entries - 1;
 658        void (*swap_slot_free_notify)(struct block_device *, unsigned long);
 659
 660        if (offset < si->lowest_bit)
 661                si->lowest_bit = offset;
 662        if (end > si->highest_bit) {
 663                bool was_full = !si->highest_bit;
 664
 665                si->highest_bit = end;
 666                if (was_full && (si->flags & SWP_WRITEOK))
 667                        add_to_avail_list(si);
 668        }
 669        atomic_long_add(nr_entries, &nr_swap_pages);
 670        si->inuse_pages -= nr_entries;
 671        if (si->flags & SWP_BLKDEV)
 672                swap_slot_free_notify =
 673                        si->bdev->bd_disk->fops->swap_slot_free_notify;
 674        else
 675                swap_slot_free_notify = NULL;
 676        while (offset <= end) {
 677                frontswap_invalidate_page(si->type, offset);
 678                if (swap_slot_free_notify)
 679                        swap_slot_free_notify(si->bdev, offset);
 680                offset++;
 681        }
 682}
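
/*
 * swap_slot_free_notify is an optional block-device hook: drivers that
 * back swap with managed memory (zram, for example) can use it to release
 * the backing store for a slot as soon as it is freed, and frontswap is
 * likewise told to invalidate its copy of each freed page.
 */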
 683
 684static int scan_swap_map_slots(struct swap_info_struct *si,
 685                               unsigned char usage, int nr,
 686                               swp_entry_t slots[])
 687{
 688        struct swap_cluster_info *ci;
 689        unsigned long offset;
 690        unsigned long scan_base;
 691        unsigned long last_in_cluster = 0;
 692        int latency_ration = LATENCY_LIMIT;
 693        int n_ret = 0;
 694
 695        if (nr > SWAP_BATCH)
 696                nr = SWAP_BATCH;
 697
 698        /*
 699         * We try to cluster swap pages by allocating them sequentially
 700         * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
 701         * way, however, we resort to first-free allocation, starting
 702         * a new cluster.  This prevents us from scattering swap pages
 703         * all over the entire swap partition, so that we reduce
 704         * overall disk seek times between swap pages.  -- sct
 705         * But we do now try to find an empty cluster.  -Andrea
 706         * And we let swap pages go all over an SSD partition.  Hugh
 707         */
 708
 709        si->flags += SWP_SCANNING;
 710        scan_base = offset = si->cluster_next;
 711
 712        /* SSD algorithm */
 713        if (si->cluster_info) {
 714                if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
 715                        goto checks;
 716                else
 717                        goto scan;
 718        }
 719
 720        if (unlikely(!si->cluster_nr--)) {
 721                if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
 722                        si->cluster_nr = SWAPFILE_CLUSTER - 1;
 723                        goto checks;
 724                }
 725
 726                spin_unlock(&si->lock);
 727
 728                /*
 729                 * If seek is expensive, start searching for new cluster from
 730                 * start of partition, to minimize the span of allocated swap.
 731                 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
 732                 * case, just handled by scan_swap_map_try_ssd_cluster() above.
 733                 */
 734                scan_base = offset = si->lowest_bit;
 735                last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
 736
 737                /* Locate the first empty (unaligned) cluster */
 738                for (; last_in_cluster <= si->highest_bit; offset++) {
 739                        if (si->swap_map[offset])
 740                                last_in_cluster = offset + SWAPFILE_CLUSTER;
 741                        else if (offset == last_in_cluster) {
 742                                spin_lock(&si->lock);
 743                                offset -= SWAPFILE_CLUSTER - 1;
 744                                si->cluster_next = offset;
 745                                si->cluster_nr = SWAPFILE_CLUSTER - 1;
 746                                goto checks;
 747                        }
 748                        if (unlikely(--latency_ration < 0)) {
 749                                cond_resched();
 750                                latency_ration = LATENCY_LIMIT;
 751                        }
 752                }
 753
 754                offset = scan_base;
 755                spin_lock(&si->lock);
 756                si->cluster_nr = SWAPFILE_CLUSTER - 1;
 757        }
 758
 759checks:
 760        if (si->cluster_info) {
 761                while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
  762                        /* take a break if we already got some slots */
 763                        if (n_ret)
 764                                goto done;
 765                        if (!scan_swap_map_try_ssd_cluster(si, &offset,
 766                                                        &scan_base))
 767                                goto scan;
 768                }
 769        }
 770        if (!(si->flags & SWP_WRITEOK))
 771                goto no_page;
 772        if (!si->highest_bit)
 773                goto no_page;
 774        if (offset > si->highest_bit)
 775                scan_base = offset = si->lowest_bit;
 776
 777        ci = lock_cluster(si, offset);
 778        /* reuse swap entry of cache-only swap if not busy. */
 779        if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
 780                int swap_was_freed;
 781                unlock_cluster(ci);
 782                spin_unlock(&si->lock);
 783                swap_was_freed = __try_to_reclaim_swap(si, offset);
 784                spin_lock(&si->lock);
 785                /* entry was freed successfully, try to use this again */
 786                if (swap_was_freed)
 787                        goto checks;
 788                goto scan; /* check next one */
 789        }
 790
 791        if (si->swap_map[offset]) {
 792                unlock_cluster(ci);
 793                if (!n_ret)
 794                        goto scan;
 795                else
 796                        goto done;
 797        }
 798        si->swap_map[offset] = usage;
 799        inc_cluster_info_page(si, si->cluster_info, offset);
 800        unlock_cluster(ci);
 801
 802        swap_range_alloc(si, offset, 1);
 803        si->cluster_next = offset + 1;
 804        slots[n_ret++] = swp_entry(si->type, offset);
 805
 806        /* got enough slots or reach max slots? */
 807        if ((n_ret == nr) || (offset >= si->highest_bit))
 808                goto done;
 809
 810        /* search for next available slot */
 811
 812        /* time to take a break? */
 813        if (unlikely(--latency_ration < 0)) {
 814                if (n_ret)
 815                        goto done;
 816                spin_unlock(&si->lock);
 817                cond_resched();
 818                spin_lock(&si->lock);
 819                latency_ration = LATENCY_LIMIT;
 820        }
 821
 822        /* try to get more slots in cluster */
 823        if (si->cluster_info) {
 824                if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
 825                        goto checks;
 826                else
 827                        goto done;
 828        }
 829        /* non-ssd case */
 830        ++offset;
 831
 832        /* non-ssd case, still more slots in cluster? */
 833        if (si->cluster_nr && !si->swap_map[offset]) {
 834                --si->cluster_nr;
 835                goto checks;
 836        }
 837
 838done:
 839        si->flags -= SWP_SCANNING;
 840        return n_ret;
 841
 842scan:
 843        spin_unlock(&si->lock);
 844        while (++offset <= si->highest_bit) {
 845                if (!si->swap_map[offset]) {
 846                        spin_lock(&si->lock);
 847                        goto checks;
 848                }
 849                if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
 850                        spin_lock(&si->lock);
 851                        goto checks;
 852                }
 853                if (unlikely(--latency_ration < 0)) {
 854                        cond_resched();
 855                        latency_ration = LATENCY_LIMIT;
 856                }
 857        }
 858        offset = si->lowest_bit;
 859        while (offset < scan_base) {
 860                if (!si->swap_map[offset]) {
 861                        spin_lock(&si->lock);
 862                        goto checks;
 863                }
 864                if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
 865                        spin_lock(&si->lock);
 866                        goto checks;
 867                }
 868                if (unlikely(--latency_ration < 0)) {
 869                        cond_resched();
 870                        latency_ration = LATENCY_LIMIT;
 871                }
 872                offset++;
 873        }
 874        spin_lock(&si->lock);
 875
 876no_page:
 877        si->flags -= SWP_SCANNING;
 878        return n_ret;
 879}
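
/*
 * Control-flow summary for scan_swap_map_slots(): "checks" validates and
 * claims one offset at a time, looping back while more slots are wanted;
 * "scan" is the slow linear search, run with si->lock dropped, used when
 * the cluster heuristics find nothing; "done"/"no_page" clear the
 * SWP_SCANNING marker and return however many slots ended up in slots[].
 */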
 880
 881static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
 882{
 883        unsigned long idx;
 884        struct swap_cluster_info *ci;
 885        unsigned long offset, i;
 886        unsigned char *map;
 887
 888        /*
 889         * Should not even be attempting cluster allocations when huge
 890         * page swap is disabled.  Warn and fail the allocation.
 891         */
 892        if (!IS_ENABLED(CONFIG_THP_SWAP)) {
 893                VM_WARN_ON_ONCE(1);
 894                return 0;
 895        }
 896
 897        if (cluster_list_empty(&si->free_clusters))
 898                return 0;
 899
 900        idx = cluster_list_first(&si->free_clusters);
 901        offset = idx * SWAPFILE_CLUSTER;
 902        ci = lock_cluster(si, offset);
 903        alloc_cluster(si, idx);
 904        cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
 905
 906        map = si->swap_map + offset;
 907        for (i = 0; i < SWAPFILE_CLUSTER; i++)
 908                map[i] = SWAP_HAS_CACHE;
 909        unlock_cluster(ci);
 910        swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
 911        *slot = swp_entry(si->type, offset);
 912
 913        return 1;
 914}
 915
 916static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
 917{
 918        unsigned long offset = idx * SWAPFILE_CLUSTER;
 919        struct swap_cluster_info *ci;
 920
 921        ci = lock_cluster(si, offset);
 922        cluster_set_count_flag(ci, 0, 0);
 923        free_cluster(si, idx);
 924        unlock_cluster(ci);
 925        swap_range_free(si, offset, SWAPFILE_CLUSTER);
 926}
 927
 928static unsigned long scan_swap_map(struct swap_info_struct *si,
 929                                   unsigned char usage)
 930{
 931        swp_entry_t entry;
 932        int n_ret;
 933
 934        n_ret = scan_swap_map_slots(si, usage, 1, &entry);
 935
 936        if (n_ret)
 937                return swp_offset(entry);
 938        else
 939                return 0;
 940
 941}
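
/*
 * A returned offset of 0 means failure: slot 0 holds the swap header and
 * is never handed out by the allocator (it is reserved at swapon time),
 * so 0 can safely double as the error value here.
 */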
 942
 943int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 944{
 945        unsigned long size = swap_entry_size(entry_size);
 946        struct swap_info_struct *si, *next;
 947        long avail_pgs;
 948        int n_ret = 0;
 949        int node;
 950
 951        /* Only single cluster request supported */
 952        WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
 953
 954        avail_pgs = atomic_long_read(&nr_swap_pages) / size;
 955        if (avail_pgs <= 0)
 956                goto noswap;
 957
 958        if (n_goal > SWAP_BATCH)
 959                n_goal = SWAP_BATCH;
 960
 961        if (n_goal > avail_pgs)
 962                n_goal = avail_pgs;
 963
 964        atomic_long_sub(n_goal * size, &nr_swap_pages);
 965
 966        spin_lock(&swap_avail_lock);
 967
 968start_over:
 969        node = numa_node_id();
 970        plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
 971                /* requeue si to after same-priority siblings */
 972                plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
 973                spin_unlock(&swap_avail_lock);
 974                spin_lock(&si->lock);
 975                if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
 976                        spin_lock(&swap_avail_lock);
 977                        if (plist_node_empty(&si->avail_lists[node])) {
 978                                spin_unlock(&si->lock);
 979                                goto nextsi;
 980                        }
 981                        WARN(!si->highest_bit,
 982                             "swap_info %d in list but !highest_bit\n",
 983                             si->type);
 984                        WARN(!(si->flags & SWP_WRITEOK),
 985                             "swap_info %d in list but !SWP_WRITEOK\n",
 986                             si->type);
 987                        __del_from_avail_list(si);
 988                        spin_unlock(&si->lock);
 989                        goto nextsi;
 990                }
 991                if (size == SWAPFILE_CLUSTER) {
 992                        if (!(si->flags & SWP_FILE))
 993                                n_ret = swap_alloc_cluster(si, swp_entries);
 994                } else
 995                        n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
 996                                                    n_goal, swp_entries);
 997                spin_unlock(&si->lock);
 998                if (n_ret || size == SWAPFILE_CLUSTER)
 999                        goto check_out;
1000                pr_debug("scan_swap_map of si %d failed to find offset\n",
1001                        si->type);
1002
1003                spin_lock(&swap_avail_lock);
1004nextsi:
1005                /*
1006                 * if we got here, it's likely that si was almost full before,
1007                 * and since scan_swap_map() can drop the si->lock, multiple
1008                 * callers probably all tried to get a page from the same si
1009                 * and it filled up before we could get one; or, the si filled
1010                 * up between us dropping swap_avail_lock and taking si->lock.
1011                 * Since we dropped the swap_avail_lock, the swap_avail_head
1012                 * list may have been modified; so if next is still in the
1013                 * swap_avail_head list then try it, otherwise start over
1014                 * if we have not gotten any slots.
1015                 */
1016                if (plist_node_empty(&next->avail_lists[node]))
1017                        goto start_over;
1018        }
1019
1020        spin_unlock(&swap_avail_lock);
1021
1022check_out:
1023        if (n_ret < n_goal)
1024                atomic_long_add((long)(n_goal - n_ret) * size,
1025                                &nr_swap_pages);
1026noswap:
1027        return n_ret;
1028}
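
/*
 * A minimal usage sketch (roughly how the per-cpu swap slots cache in
 * mm/swap_slots.c refills itself; details there differ):
 *
 *	swp_entry_t slots[SWAP_BATCH];
 *	int got = get_swap_pages(SWAP_BATCH, slots, 1);
 *
 * On return, slots[0..got-1] hold entries reserved with SWAP_HAS_CACHE.
 * Passing entry_size == SWAPFILE_CLUSTER instead asks for a single
 * PMD-sized allocation via swap_alloc_cluster().
 */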
1029
1030/* The only caller of this function is now suspend routine */
1031swp_entry_t get_swap_page_of_type(int type)
1032{
1033        struct swap_info_struct *si;
1034        pgoff_t offset;
1035
1036        si = swap_info[type];
1037        spin_lock(&si->lock);
1038        if (si && (si->flags & SWP_WRITEOK)) {
1039                atomic_long_dec(&nr_swap_pages);
1040                /* This is called for allocating swap entry, not cache */
1041                offset = scan_swap_map(si, 1);
1042                if (offset) {
1043                        spin_unlock(&si->lock);
1044                        return swp_entry(type, offset);
1045                }
1046                atomic_long_inc(&nr_swap_pages);
1047        }
1048        spin_unlock(&si->lock);
1049        return (swp_entry_t) {0};
1050}
1051
1052static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
1053{
1054        struct swap_info_struct *p;
1055        unsigned long offset, type;
1056
1057        if (!entry.val)
1058                goto out;
1059        type = swp_type(entry);
1060        if (type >= nr_swapfiles)
1061                goto bad_nofile;
1062        p = swap_info[type];
1063        if (!(p->flags & SWP_USED))
1064                goto bad_device;
1065        offset = swp_offset(entry);
1066        if (offset >= p->max)
1067                goto bad_offset;
1068        return p;
1069
1070bad_offset:
1071        pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
1072        goto out;
1073bad_device:
1074        pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
1075        goto out;
1076bad_nofile:
1077        pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
1078out:
1079        return NULL;
1080}
1081
1082static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
1083{
1084        struct swap_info_struct *p;
1085
1086        p = __swap_info_get(entry);
1087        if (!p)
1088                goto out;
1089        if (!p->swap_map[swp_offset(entry)])
1090                goto bad_free;
1091        return p;
1092
1093bad_free:
1094        pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
1095        goto out;
1096out:
1097        return NULL;
1098}
1099
1100static struct swap_info_struct *swap_info_get(swp_entry_t entry)
1101{
1102        struct swap_info_struct *p;
1103
1104        p = _swap_info_get(entry);
1105        if (p)
1106                spin_lock(&p->lock);
1107        return p;
1108}
1109
1110static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
1111                                        struct swap_info_struct *q)
1112{
1113        struct swap_info_struct *p;
1114
1115        p = _swap_info_get(entry);
1116
1117        if (p != q) {
1118                if (q != NULL)
1119                        spin_unlock(&q->lock);
1120                if (p != NULL)
1121                        spin_lock(&p->lock);
1122        }
1123        return p;
1124}
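
/*
 * The lookup helpers above form a small hierarchy: __swap_info_get()
 * validates the type and offset of an entry, _swap_info_get() additionally
 * checks that the entry is in use, swap_info_get() also takes p->lock, and
 * swap_info_get_cont() does the same while handing the lock over from a
 * previous device so that batched frees take each lock only once.
 */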
1125
1126static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
1127                                              unsigned long offset,
1128                                              unsigned char usage)
1129{
1130        unsigned char count;
1131        unsigned char has_cache;
1132
1133        count = p->swap_map[offset];
1134
1135        has_cache = count & SWAP_HAS_CACHE;
1136        count &= ~SWAP_HAS_CACHE;
1137
1138        if (usage == SWAP_HAS_CACHE) {
1139                VM_BUG_ON(!has_cache);
1140                has_cache = 0;
1141        } else if (count == SWAP_MAP_SHMEM) {
1142                /*
1143                 * Or we could insist on shmem.c using a special
1144                 * swap_shmem_free() and free_shmem_swap_and_cache()...
1145                 */
1146                count = 0;
1147        } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
1148                if (count == COUNT_CONTINUED) {
1149                        if (swap_count_continued(p, offset, count))
1150                                count = SWAP_MAP_MAX | COUNT_CONTINUED;
1151                        else
1152                                count = SWAP_MAP_MAX;
1153                } else
1154                        count--;
1155        }
1156
1157        usage = count | has_cache;
1158        p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
1159
1160        return usage;
1161}
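
/*
 * Worked example for __swap_entry_free_locked(): a swap_map byte of
 * (SWAP_HAS_CACHE | 2) freed with usage == 1 becomes (SWAP_HAS_CACHE | 1);
 * freed once more it becomes SWAP_HAS_CACHE alone.  The new value is
 * returned so the caller can tell when only the swapcache reference (or
 * nothing at all) remains.
 */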
1162
1163static unsigned char __swap_entry_free(struct swap_info_struct *p,
1164                                       swp_entry_t entry, unsigned char usage)
1165{
1166        struct swap_cluster_info *ci;
1167        unsigned long offset = swp_offset(entry);
1168
1169        ci = lock_cluster_or_swap_info(p, offset);
1170        usage = __swap_entry_free_locked(p, offset, usage);
1171        unlock_cluster_or_swap_info(p, ci);
1172
1173        return usage;
1174}
1175
1176static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
1177{
1178        struct swap_cluster_info *ci;
1179        unsigned long offset = swp_offset(entry);
1180        unsigned char count;
1181
1182        ci = lock_cluster(p, offset);
1183        count = p->swap_map[offset];
1184        VM_BUG_ON(count != SWAP_HAS_CACHE);
1185        p->swap_map[offset] = 0;
1186        dec_cluster_info_page(p, p->cluster_info, offset);
1187        unlock_cluster(ci);
1188
1189        mem_cgroup_uncharge_swap(entry, 1);
1190        swap_range_free(p, offset, 1);
1191}
1192
1193/*
1194 * Caller has made sure that the swap device corresponding to entry
1195 * is still around or has not been recycled.
1196 */
1197void swap_free(swp_entry_t entry)
1198{
1199        struct swap_info_struct *p;
1200
1201        p = _swap_info_get(entry);
1202        if (p) {
1203                if (!__swap_entry_free(p, entry, 1))
1204                        free_swap_slot(entry);
1205        }
1206}
1207
1208/*
 1209 * Called after dropping swapcache to decrease the reference count on swap entries.
1210 */
1211void put_swap_page(struct page *page, swp_entry_t entry)
1212{
1213        unsigned long offset = swp_offset(entry);
1214        unsigned long idx = offset / SWAPFILE_CLUSTER;
1215        struct swap_cluster_info *ci;
1216        struct swap_info_struct *si;
1217        unsigned char *map;
1218        unsigned int i, free_entries = 0;
1219        unsigned char val;
1220        int size = swap_entry_size(hpage_nr_pages(page));
1221
1222        si = _swap_info_get(entry);
1223        if (!si)
1224                return;
1225
1226        ci = lock_cluster_or_swap_info(si, offset);
1227        if (size == SWAPFILE_CLUSTER) {
1228                VM_BUG_ON(!cluster_is_huge(ci));
1229                map = si->swap_map + offset;
1230                for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1231                        val = map[i];
1232                        VM_BUG_ON(!(val & SWAP_HAS_CACHE));
1233                        if (val == SWAP_HAS_CACHE)
1234                                free_entries++;
1235                }
1236                cluster_clear_huge(ci);
1237                if (free_entries == SWAPFILE_CLUSTER) {
1238                        unlock_cluster_or_swap_info(si, ci);
1239                        spin_lock(&si->lock);
1240                        ci = lock_cluster(si, offset);
1241                        memset(map, 0, SWAPFILE_CLUSTER);
1242                        unlock_cluster(ci);
1243                        mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
1244                        swap_free_cluster(si, idx);
1245                        spin_unlock(&si->lock);
1246                        return;
1247                }
1248        }
1249        for (i = 0; i < size; i++, entry.val++) {
1250                if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
1251                        unlock_cluster_or_swap_info(si, ci);
1252                        free_swap_slot(entry);
1253                        if (i == size - 1)
1254                                return;
1255                        lock_cluster_or_swap_info(si, offset);
1256                }
1257        }
1258        unlock_cluster_or_swap_info(si, ci);
1259}
1260
1261#ifdef CONFIG_THP_SWAP
1262int split_swap_cluster(swp_entry_t entry)
1263{
1264        struct swap_info_struct *si;
1265        struct swap_cluster_info *ci;
1266        unsigned long offset = swp_offset(entry);
1267
1268        si = _swap_info_get(entry);
1269        if (!si)
1270                return -EBUSY;
1271        ci = lock_cluster(si, offset);
1272        cluster_clear_huge(ci);
1273        unlock_cluster(ci);
1274        return 0;
1275}
1276#endif
1277
1278static int swp_entry_cmp(const void *ent1, const void *ent2)
1279{
1280        const swp_entry_t *e1 = ent1, *e2 = ent2;
1281
1282        return (int)swp_type(*e1) - (int)swp_type(*e2);
1283}
1284
1285void swapcache_free_entries(swp_entry_t *entries, int n)
1286{
1287        struct swap_info_struct *p, *prev;
1288        int i;
1289
1290        if (n <= 0)
1291                return;
1292
1293        prev = NULL;
1294        p = NULL;
1295
1296        /*
1297         * Sort swap entries by swap device, so each lock is only taken once.
1298         * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
1299         * so low that it isn't necessary to optimize further.
1300         */
1301        if (nr_swapfiles > 1)
1302                sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
1303        for (i = 0; i < n; ++i) {
1304                p = swap_info_get_cont(entries[i], prev);
1305                if (p)
1306                        swap_entry_free(p, entries[i]);
1307                prev = p;
1308        }
1309        if (p)
1310                spin_unlock(&p->lock);
1311}
1312
1313/*
1314 * How many references to page are currently swapped out?
1315 * This does not give an exact answer when swap count is continued,
1316 * but does include the high COUNT_CONTINUED flag to allow for that.
1317 */
1318int page_swapcount(struct page *page)
1319{
1320        int count = 0;
1321        struct swap_info_struct *p;
1322        struct swap_cluster_info *ci;
1323        swp_entry_t entry;
1324        unsigned long offset;
1325
1326        entry.val = page_private(page);
1327        p = _swap_info_get(entry);
1328        if (p) {
1329                offset = swp_offset(entry);
1330                ci = lock_cluster_or_swap_info(p, offset);
1331                count = swap_count(p->swap_map[offset]);
1332                unlock_cluster_or_swap_info(p, ci);
1333        }
1334        return count;
1335}
1336
1337int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
1338{
1339        pgoff_t offset = swp_offset(entry);
1340
1341        return swap_count(si->swap_map[offset]);
1342}
1343
1344static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
1345{
1346        int count = 0;
1347        pgoff_t offset = swp_offset(entry);
1348        struct swap_cluster_info *ci;
1349
1350        ci = lock_cluster_or_swap_info(si, offset);
1351        count = swap_count(si->swap_map[offset]);
1352        unlock_cluster_or_swap_info(si, ci);
1353        return count;
1354}
1355
1356/*
1357 * How many references to @entry are currently swapped out?
1358 * This does not give an exact answer when swap count is continued,
1359 * but does include the high COUNT_CONTINUED flag to allow for that.
1360 */
1361int __swp_swapcount(swp_entry_t entry)
1362{
1363        int count = 0;
1364        struct swap_info_struct *si;
1365
1366        si = __swap_info_get(entry);
1367        if (si)
1368                count = swap_swapcount(si, entry);
1369        return count;
1370}
1371
1372/*
1373 * How many references to @entry are currently swapped out?
1374 * This considers COUNT_CONTINUED so it returns exact answer.
1375 */
1376int swp_swapcount(swp_entry_t entry)
1377{
1378        int count, tmp_count, n;
1379        struct swap_info_struct *p;
1380        struct swap_cluster_info *ci;
1381        struct page *page;
1382        pgoff_t offset;
1383        unsigned char *map;
1384
1385        p = _swap_info_get(entry);
1386        if (!p)
1387                return 0;
1388
1389        offset = swp_offset(entry);
1390
1391        ci = lock_cluster_or_swap_info(p, offset);
1392
1393        count = swap_count(p->swap_map[offset]);
1394        if (!(count & COUNT_CONTINUED))
1395                goto out;
1396
1397        count &= ~COUNT_CONTINUED;
1398        n = SWAP_MAP_MAX + 1;
1399
1400        page = vmalloc_to_page(p->swap_map + offset);
1401        offset &= ~PAGE_MASK;
1402        VM_BUG_ON(page_private(page) != SWP_CONTINUED);
1403
1404        do {
1405                page = list_next_entry(page, lru);
1406                map = kmap_atomic(page);
1407                tmp_count = map[offset];
1408                kunmap_atomic(map);
1409
1410                count += (tmp_count & ~COUNT_CONTINUED) * n;
1411                n *= (SWAP_CONT_MAX + 1);
1412        } while (tmp_count & COUNT_CONTINUED);
1413out:
1414        unlock_cluster_or_swap_info(p, ci);
1415        return count;
1416}
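
/*
 * The loop above reconstructs the exact count positionally: the in-place
 * swap_map byte supplies the low digit (base SWAP_MAP_MAX + 1) and each
 * continuation page supplies one further digit in base SWAP_CONT_MAX + 1,
 * i.e. count = low + c1 * (SWAP_MAP_MAX + 1)
 *            + c2 * (SWAP_MAP_MAX + 1) * (SWAP_CONT_MAX + 1) + ...
 * until a digit without COUNT_CONTINUED terminates the series.
 */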
1417
1418static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
1419                                         swp_entry_t entry)
1420{
1421        struct swap_cluster_info *ci;
1422        unsigned char *map = si->swap_map;
1423        unsigned long roffset = swp_offset(entry);
1424        unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
1425        int i;
1426        bool ret = false;
1427
1428        ci = lock_cluster_or_swap_info(si, offset);
1429        if (!ci || !cluster_is_huge(ci)) {
1430                if (swap_count(map[roffset]))
1431                        ret = true;
1432                goto unlock_out;
1433        }
1434        for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1435                if (swap_count(map[offset + i])) {
1436                        ret = true;
1437                        break;
1438                }
1439        }
1440unlock_out:
1441        unlock_cluster_or_swap_info(si, ci);
1442        return ret;
1443}
1444
1445static bool page_swapped(struct page *page)
1446{
1447        swp_entry_t entry;
1448        struct swap_info_struct *si;
1449
1450        if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
1451                return page_swapcount(page) != 0;
1452
1453        page = compound_head(page);
1454        entry.val = page_private(page);
1455        si = _swap_info_get(entry);
1456        if (si)
1457                return swap_page_trans_huge_swapped(si, entry);
1458        return false;
1459}
1460
1461static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
1462                                         int *total_swapcount)
1463{
1464        int i, map_swapcount, _total_mapcount, _total_swapcount;
1465        unsigned long offset = 0;
1466        struct swap_info_struct *si;
1467        struct swap_cluster_info *ci = NULL;
1468        unsigned char *map = NULL;
1469        int mapcount, swapcount = 0;
1470
1471        /* hugetlbfs shouldn't call it */
1472        VM_BUG_ON_PAGE(PageHuge(page), page);
1473
1474        if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
1475                mapcount = page_trans_huge_mapcount(page, total_mapcount);
1476                if (PageSwapCache(page))
1477                        swapcount = page_swapcount(page);
1478                if (total_swapcount)
1479                        *total_swapcount = swapcount;
1480                return mapcount + swapcount;
1481        }
1482
1483        page = compound_head(page);
1484
1485        _total_mapcount = _total_swapcount = map_swapcount = 0;
1486        if (PageSwapCache(page)) {
1487                swp_entry_t entry;
1488
1489                entry.val = page_private(page);
1490                si = _swap_info_get(entry);
1491                if (si) {
1492                        map = si->swap_map;
1493                        offset = swp_offset(entry);
1494                }
1495        }
1496        if (map)
1497                ci = lock_cluster(si, offset);
1498        for (i = 0; i < HPAGE_PMD_NR; i++) {
1499                mapcount = atomic_read(&page[i]._mapcount) + 1;
1500                _total_mapcount += mapcount;
1501                if (map) {
1502                        swapcount = swap_count(map[offset + i]);
1503                        _total_swapcount += swapcount;
1504                }
1505                map_swapcount = max(map_swapcount, mapcount + swapcount);
1506        }
1507        unlock_cluster(ci);
1508        if (PageDoubleMap(page)) {
1509                map_swapcount -= 1;
1510                _total_mapcount -= HPAGE_PMD_NR;
1511        }
1512        mapcount = compound_mapcount(page);
1513        map_swapcount += mapcount;
1514        _total_mapcount += mapcount;
1515        if (total_mapcount)
1516                *total_mapcount = _total_mapcount;
1517        if (total_swapcount)
1518                *total_swapcount = _total_swapcount;
1519
1520        return map_swapcount;
1521}
1522
1523/*
1524 * We can write to an anon page without COW if there are no other references
1525 * to it.  And as a side-effect, free up its swap: because the old content
1526 * on disk will never be read, and seeking back there to write new content
1527 * later would only waste time away from clustering.
1528 *
1529 * NOTE: total_map_swapcount should not be relied upon by the caller if
 1530 * reuse_swap_page() returns false, but it may always be overwritten
1531 * (see the other implementation for CONFIG_SWAP=n).
1532 */
1533bool reuse_swap_page(struct page *page, int *total_map_swapcount)
1534{
1535        int count, total_mapcount, total_swapcount;
1536
1537        VM_BUG_ON_PAGE(!PageLocked(page), page);
1538        if (unlikely(PageKsm(page)))
1539                return false;
1540        count = page_trans_huge_map_swapcount(page, &total_mapcount,
1541                                              &total_swapcount);
1542        if (total_map_swapcount)
1543                *total_map_swapcount = total_mapcount + total_swapcount;
1544        if (count == 1 && PageSwapCache(page) &&
1545            (likely(!PageTransCompound(page)) ||
1546             /* The remaining swap count will be freed soon */
1547             total_swapcount == page_swapcount(page))) {
1548                if (!PageWriteback(page)) {
1549                        page = compound_head(page);
1550                        delete_from_swap_cache(page);
1551                        SetPageDirty(page);
1552                } else {
1553                        swp_entry_t entry;
1554                        struct swap_info_struct *p;
1555
1556                        entry.val = page_private(page);
1557                        p = swap_info_get(entry);
1558                        if (p->flags & SWP_STABLE_WRITES) {
1559                                spin_unlock(&p->lock);
1560                                return false;
1561                        }
1562                        spin_unlock(&p->lock);
1563                }
1564        }
1565
1566        return count <= 1;
1567}
1568
1569/*
1570 * If swap is getting full, or if there are no more mappings of this page,
1571 * then try_to_free_swap is called to free its swap space.
1572 */
1573int try_to_free_swap(struct page *page)
1574{
1575        VM_BUG_ON_PAGE(!PageLocked(page), page);
1576
1577        if (!PageSwapCache(page))
1578                return 0;
1579        if (PageWriteback(page))
1580                return 0;
1581        if (page_swapped(page))
1582                return 0;
1583
1584        /*
1585         * Once hibernation has begun to create its image of memory,
1586         * there's a danger that one of the calls to try_to_free_swap()
1587         * - most probably a call from __try_to_reclaim_swap() while
1588         * hibernation is allocating its own swap pages for the image,
1589         * but conceivably even a call from memory reclaim - will free
1590         * the swap from a page which has already been recorded in the
1591         * image as a clean swapcache page, and then reuse its swap for
1592         * another page of the image.  On waking from hibernation, the
1593         * original page might be freed under memory pressure, then
1594         * later read back in from swap, now with the wrong data.
1595         *
1596         * Hibernation suspends storage while it is writing the image
1597         * to disk so check that here.
1598         */
1599        if (pm_suspended_storage())
1600                return 0;
1601
1602        page = compound_head(page);
1603        delete_from_swap_cache(page);
1604        SetPageDirty(page);
1605        return 1;
1606}
1607
1608/*
1609 * Free the swap entry like above, but also try to
1610 * free the page cache entry if it is the last user.
1611 */
1612int free_swap_and_cache(swp_entry_t entry)
1613{
1614        struct swap_info_struct *p;
1615        struct page *page = NULL;
1616        unsigned char count;
1617
1618        if (non_swap_entry(entry))
1619                return 1;
1620
1621        p = _swap_info_get(entry);
1622        if (p) {
1623                count = __swap_entry_free(p, entry, 1);
1624                if (count == SWAP_HAS_CACHE &&
1625                    !swap_page_trans_huge_swapped(p, entry)) {
1626                        page = find_get_page(swap_address_space(entry),
1627                                             swp_offset(entry));
1628                        if (page && !trylock_page(page)) {
1629                                put_page(page);
1630                                page = NULL;
1631                        }
1632                } else if (!count)
1633                        free_swap_slot(entry);
1634        }
1635        if (page) {
1636                /*
1637                 * Not mapped elsewhere, or swap space full? Free it!
1638                 * Also recheck PageSwapCache now page is locked (above).
1639                 */
1640                if (PageSwapCache(page) && !PageWriteback(page) &&
1641                    (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
1642                    !swap_page_trans_huge_swapped(p, entry)) {
1643                        page = compound_head(page);
1644                        delete_from_swap_cache(page);
1645                        SetPageDirty(page);
1646                }
1647                unlock_page(page);
1648                put_page(page);
1649        }
1650        return p != NULL;
1651}
1652
1653#ifdef CONFIG_HIBERNATION
1654/*
1655 * Find the swap type that corresponds to the given device (if any).
1656 *
1657 * @offset - number of the PAGE_SIZE-sized block of the device, starting
1658 * from 0, in which the swap header is expected to be located.
1659 *
1660 * This is needed for suspend to disk (aka swsusp).
1661 */
1662int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
1663{
1664        struct block_device *bdev = NULL;
1665        int type;
1666
1667        if (device)
1668                bdev = bdget(device);
1669
1670        spin_lock(&swap_lock);
1671        for (type = 0; type < nr_swapfiles; type++) {
1672                struct swap_info_struct *sis = swap_info[type];
1673
1674                if (!(sis->flags & SWP_WRITEOK))
1675                        continue;
1676
1677                if (!bdev) {
1678                        if (bdev_p)
1679                                *bdev_p = bdgrab(sis->bdev);
1680
1681                        spin_unlock(&swap_lock);
1682                        return type;
1683                }
1684                if (bdev == sis->bdev) {
1685                        struct swap_extent *se = &sis->first_swap_extent;
1686
1687                        if (se->start_block == offset) {
1688                                if (bdev_p)
1689                                        *bdev_p = bdgrab(sis->bdev);
1690
1691                                spin_unlock(&swap_lock);
1692                                bdput(bdev);
1693                                return type;
1694                        }
1695                }
1696        }
1697        spin_unlock(&swap_lock);
1698        if (bdev)
1699                bdput(bdev);
1700
1701        return -ENODEV;
1702}
1703
1704/*
1705 * Get the (PAGE_SIZE) block corresponding to the given offset on the swapdev
1706 * identified by the given index in swap_info (swap type).
1707 */
1708sector_t swapdev_block(int type, pgoff_t offset)
1709{
1710        struct block_device *bdev;
1711
1712        if ((unsigned int)type >= nr_swapfiles)
1713                return 0;
1714        if (!(swap_info[type]->flags & SWP_WRITEOK))
1715                return 0;
1716        return map_swap_entry(swp_entry(type, offset), &bdev);
1717}
1718
1719/*
1720 * Return either the total number of swap pages of the given type, or the number
1721 * of free pages of that type (depending on @free)
1722 *
1723 * This is needed for software suspend
1724 */
1725unsigned int count_swap_pages(int type, int free)
1726{
1727        unsigned int n = 0;
1728
1729        spin_lock(&swap_lock);
1730        if ((unsigned int)type < nr_swapfiles) {
1731                struct swap_info_struct *sis = swap_info[type];
1732
1733                spin_lock(&sis->lock);
1734                if (sis->flags & SWP_WRITEOK) {
1735                        n = sis->pages;
1736                        if (free)
1737                                n -= sis->inuse_pages;
1738                }
1739                spin_unlock(&sis->lock);
1740        }
1741        spin_unlock(&swap_lock);
1742        return n;
1743}
1744#endif /* CONFIG_HIBERNATION */
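
/*
 * Illustrative userspace sketch (not part of swapfile.c): swap_type_of()
 * above is how the hibernation core matches the resume device configured
 * from userspace.  One minimal way to point the kernel at a swap partition
 * for resume is to write its "major:minor" numbers to /sys/power/resume;
 * that sysfs path and its format come from the power-management code, not
 * from this file, so treat this as an assumption-laden example.
 */
#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/types.h>

int main(int argc, char **argv)
{
        struct stat st;
        FILE *f;

        if (argc != 2) {
                fprintf(stderr, "usage: %s /dev/<swap-partition>\n", argv[0]);
                return 1;
        }
        if (stat(argv[1], &st) != 0) {
                perror("stat");
                return 1;
        }
        if (!S_ISBLK(st.st_mode)) {
                fprintf(stderr, "%s is not a block device\n", argv[1]);
                return 1;
        }
        f = fopen("/sys/power/resume", "w");
        if (!f) {
                perror("/sys/power/resume");
                return 1;
        }
        /* The kernel parses this as the dev_t of the device to resume from. */
        fprintf(f, "%u:%u\n", major(st.st_rdev), minor(st.st_rdev));
        fclose(f);
        return 0;
}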
1745
1746static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
1747{
1748        return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
1749}
1750
1751/*
1752 * No need to decide whether this PTE shares the swap entry with others,
1753 * just let do_wp_page work it out if a write is requested later - to
1754 * force COW, vm_page_prot omits write permission from any private vma.
1755 */
1756static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
1757                unsigned long addr, swp_entry_t entry, struct page *page)
1758{
1759        struct page *swapcache;
1760        struct mem_cgroup *memcg;
1761        spinlock_t *ptl;
1762        pte_t *pte;
1763        int ret = 1;
1764
1765        swapcache = page;
1766        page = ksm_might_need_to_copy(page, vma, addr);
1767        if (unlikely(!page))
1768                return -ENOMEM;
1769
1770        if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
1771                                &memcg, false)) {
1772                ret = -ENOMEM;
1773                goto out_nolock;
1774        }
1775
1776        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1777        if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
1778                mem_cgroup_cancel_charge(page, memcg, false);
1779                ret = 0;
1780                goto out;
1781        }
1782
1783        dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
1784        inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
1785        get_page(page);
1786        set_pte_at(vma->vm_mm, addr, pte,
1787                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
1788        if (page == swapcache) {
1789                page_add_anon_rmap(page, vma, addr, false);
1790                mem_cgroup_commit_charge(page, memcg, true, false);
1791        } else { /* ksm created a completely new copy */
1792                page_add_new_anon_rmap(page, vma, addr, false);
1793                mem_cgroup_commit_charge(page, memcg, false, false);
1794                lru_cache_add_active_or_unevictable(page, vma);
1795        }
1796        swap_free(entry);
1797        /*
1798         * Move the page to the active list so it is not
1799         * immediately swapped out again after swapon.
1800         */
1801        activate_page(page);
1802out:
1803        pte_unmap_unlock(pte, ptl);
1804out_nolock:
1805        if (page != swapcache) {
1806                unlock_page(page);
1807                put_page(page);
1808        }
1809        return ret;
1810}
1811
1812static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1813                                unsigned long addr, unsigned long end,
1814                                swp_entry_t entry, struct page *page)
1815{
1816        pte_t swp_pte = swp_entry_to_pte(entry);
1817        pte_t *pte;
1818        int ret = 0;
1819
1820        /*
1821         * We don't actually need pte lock while scanning for swp_pte: since
1822         * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
1823         * page table while we're scanning; though it could get zapped, and on
1824         * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
1825         * of unmatched parts which look like swp_pte, so unuse_pte must
1826         * recheck under pte lock.  Scanning without pte lock lets it be
1827         * preemptible when CONFIG_PREEMPT is enabled but CONFIG_HIGHPTE is not.
1828         */
1829        pte = pte_offset_map(pmd, addr);
1830        do {
1831                /*
1832                 * swapoff spends a _lot_ of time in this loop!
1833                 * Test inline before calling unuse_pte.
1834                 */
1835                if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
1836                        pte_unmap(pte);
1837                        ret = unuse_pte(vma, pmd, addr, entry, page);
1838                        if (ret)
1839                                goto out;
1840                        pte = pte_offset_map(pmd, addr);
1841                }
1842        } while (pte++, addr += PAGE_SIZE, addr != end);
1843        pte_unmap(pte - 1);
1844out:
1845        return ret;
1846}
1847
1848static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
1849                                unsigned long addr, unsigned long end,
1850                                swp_entry_t entry, struct page *page)
1851{
1852        pmd_t *pmd;
1853        unsigned long next;
1854        int ret;
1855
1856        pmd = pmd_offset(pud, addr);
1857        do {
1858                cond_resched();
1859                next = pmd_addr_end(addr, end);
1860                if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1861                        continue;
1862                ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
1863                if (ret)
1864                        return ret;
1865        } while (pmd++, addr = next, addr != end);
1866        return 0;
1867}
1868
1869static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
1870                                unsigned long addr, unsigned long end,
1871                                swp_entry_t entry, struct page *page)
1872{
1873        pud_t *pud;
1874        unsigned long next;
1875        int ret;
1876
1877        pud = pud_offset(p4d, addr);
1878        do {
1879                next = pud_addr_end(addr, end);
1880                if (pud_none_or_clear_bad(pud))
1881                        continue;
1882                ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
1883                if (ret)
1884                        return ret;
1885        } while (pud++, addr = next, addr != end);
1886        return 0;
1887}
1888
1889static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
1890                                unsigned long addr, unsigned long end,
1891                                swp_entry_t entry, struct page *page)
1892{
1893        p4d_t *p4d;
1894        unsigned long next;
1895        int ret;
1896
1897        p4d = p4d_offset(pgd, addr);
1898        do {
1899                next = p4d_addr_end(addr, end);
1900                if (p4d_none_or_clear_bad(p4d))
1901                        continue;
1902                ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
1903                if (ret)
1904                        return ret;
1905        } while (p4d++, addr = next, addr != end);
1906        return 0;
1907}
1908
1909static int unuse_vma(struct vm_area_struct *vma,
1910                                swp_entry_t entry, struct page *page)
1911{
1912        pgd_t *pgd;
1913        unsigned long addr, end, next;
1914        int ret;
1915
1916        if (page_anon_vma(page)) {
1917                addr = page_address_in_vma(page, vma);
1918                if (addr == -EFAULT)
1919                        return 0;
1920                else
1921                        end = addr + PAGE_SIZE;
1922        } else {
1923                addr = vma->vm_start;
1924                end = vma->vm_end;
1925        }
1926
1927        pgd = pgd_offset(vma->vm_mm, addr);
1928        do {
1929                next = pgd_addr_end(addr, end);
1930                if (pgd_none_or_clear_bad(pgd))
1931                        continue;
1932                ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
1933                if (ret)
1934                        return ret;
1935        } while (pgd++, addr = next, addr != end);
1936        return 0;
1937}
1938
1939static int unuse_mm(struct mm_struct *mm,
1940                                swp_entry_t entry, struct page *page)
1941{
1942        struct vm_area_struct *vma;
1943        int ret = 0;
1944
1945        if (!down_read_trylock(&mm->mmap_sem)) {
1946                /*
1947                 * Activate the page so shrink_inactive_list is unlikely to unmap
1948                 * its ptes while the lock is dropped, letting swapoff make progress.
1949                 */
1950                activate_page(page);
1951                unlock_page(page);
1952                down_read(&mm->mmap_sem);
1953                lock_page(page);
1954        }
1955        for (vma = mm->mmap; vma; vma = vma->vm_next) {
1956                if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1957                        break;
1958                cond_resched();
1959        }
1960        up_read(&mm->mmap_sem);
1961        return (ret < 0)? ret: 0;
1962}
1963
1964/*
1965 * Scan swap_map (or frontswap_map if frontswap parameter is true)
1966 * from current position to next entry still in use.
1967 * Wrap back to the start on reaching the end, returning 0 when none are in use.
1968 */
1969static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1970                                        unsigned int prev, bool frontswap)
1971{
1972        unsigned int max = si->max;
1973        unsigned int i = prev;
1974        unsigned char count;
1975
1976        /*
1977         * No need for swap_lock here: we're just looking
1978         * for whether an entry is in use, not modifying it; false
1979         * hits are okay, and sys_swapoff() has already prevented new
1980         * allocations from this area (while holding swap_lock).
1981         */
1982        for (;;) {
1983                if (++i >= max) {
1984                        if (!prev) {
1985                                i = 0;
1986                                break;
1987                        }
1988                        /*
1989                         * No entries in use at top of swap_map,
1990                         * loop back to start and recheck there.
1991                         */
1992                        max = prev + 1;
1993                        prev = 0;
1994                        i = 1;
1995                }
1996                count = READ_ONCE(si->swap_map[i]);
1997                if (count && swap_count(count) != SWAP_MAP_BAD)
1998                        if (!frontswap || frontswap_test(si, i))
1999                                break;
2000                if ((i % LATENCY_LIMIT) == 0)
2001                        cond_resched();
2002        }
2003        return i;
2004}
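
/*
 * Illustrative sketch (not part of swapfile.c): the wrap-around scan used by
 * find_next_to_unuse() above, reduced to a plain array of use counts.  The
 * names (demo_next_in_use, counts) are made up for this example; slot 0
 * plays the role of the header page and is never returned.
 */
#include <assert.h>

static unsigned int demo_next_in_use(const unsigned char *counts,
                                     unsigned int max, unsigned int prev)
{
        unsigned int i = prev;

        for (;;) {
                if (++i >= max) {
                        if (!prev)              /* nothing left to check */
                                return 0;
                        max = prev + 1;         /* recheck the part below prev */
                        prev = 0;
                        i = 1;
                }
                if (counts[i])
                        return i;
        }
}

int main(void)
{
        /* slots 3 and 7 are in use */
        unsigned char counts[8] = { 0, 0, 0, 2, 0, 0, 0, 1 };

        assert(demo_next_in_use(counts, 8, 0) == 3);
        assert(demo_next_in_use(counts, 8, 3) == 7);
        assert(demo_next_in_use(counts, 8, 7) == 3);    /* wraps around */
        return 0;
}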
2005
2006/*
2007 * We completely avoid races by reading each swap page in advance,
2008 * and then searching for the process using it.  All the necessary
2009 * page table adjustments can then be made atomically.
2010 *
2011 * If the boolean frontswap is true, only unuse pages_to_unuse pages;
2012 * pages_to_unuse==0 means all pages; ignored if frontswap is false
2013 */
2014int try_to_unuse(unsigned int type, bool frontswap,
2015                 unsigned long pages_to_unuse)
2016{
2017        struct swap_info_struct *si = swap_info[type];
2018        struct mm_struct *start_mm;
2019        volatile unsigned char *swap_map; /* swap_map is accessed without
2020                                           * locking. Mark it as volatile
2021                                           * to prevent the compiler from
2022                                           * doing something odd.
2023                                           */
2024        unsigned char swcount;
2025        struct page *page;
2026        swp_entry_t entry;
2027        unsigned int i = 0;
2028        int retval = 0;
2029
2030        /*
2031         * When searching mms for an entry, a good strategy is to
2032         * start at the first mm we freed the previous entry from
2033         * (though actually we don't notice whether we or coincidence
2034         * freed the entry).  Initialize this start_mm with a hold.
2035         *
2036         * A simpler strategy would be to start at the last mm we
2037         * freed the previous entry from; but that would take less
2038         * advantage of mmlist ordering, which clusters forked mms
2039         * together, child after parent.  If we race with dup_mmap(), we
2040         * prefer to resolve parent before child, lest we miss entries
2041         * duplicated after we scanned child: using last mm would invert
2042         * that.
2043         */
2044        start_mm = &init_mm;
2045        mmget(&init_mm);
2046
2047        /*
2048         * Keep on scanning until all entries have gone.  Usually,
2049         * one pass through swap_map is enough, but not necessarily:
2050         * there are races when an instance of an entry might be missed.
2051         */
2052        while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
2053                if (signal_pending(current)) {
2054                        retval = -EINTR;
2055                        break;
2056                }
2057
2058                /*
2059                 * Get a page for the entry, using the existing swap
2060                 * cache page if there is one.  Otherwise, get a clean
2061                 * page and read the swap into it.
2062                 */
2063                swap_map = &si->swap_map[i];
2064                entry = swp_entry(type, i);
2065                page = read_swap_cache_async(entry,
2066                                        GFP_HIGHUSER_MOVABLE, NULL, 0, false);
2067                if (!page) {
2068                        /*
2069                         * Either swap_duplicate() failed because entry
2070                         * has been freed independently, and will not be
2071                         * reused since sys_swapoff() already disabled
2072                         * allocation from here, or alloc_page() failed.
2073                         */
2074                        swcount = *swap_map;
2075                        /*
2076                         * We don't hold the lock here, so the swap entry could be
2077                         * SWAP_MAP_BAD (when the cluster is being discarded).
2078                         * Instead of failing out, we can just skip the swap
2079                         * entry because swapoff will wait for the discard to
2080                         * finish anyway.
2081                         */
2082                        if (!swcount || swcount == SWAP_MAP_BAD)
2083                                continue;
2084                        retval = -ENOMEM;
2085                        break;
2086                }
2087
2088                /*
2089                 * Don't hold on to start_mm if it looks like exiting.
2090                 */
2091                if (atomic_read(&start_mm->mm_users) == 1) {
2092                        mmput(start_mm);
2093                        start_mm = &init_mm;
2094                        mmget(&init_mm);
2095                }
2096
2097                /*
2098                 * Wait for and lock page.  When do_swap_page races with
2099                 * try_to_unuse, do_swap_page can handle the fault much
2100                 * faster than try_to_unuse can locate the entry.  This
2101                 * apparently redundant "wait_on_page_locked" lets try_to_unuse
2102                 * defer to do_swap_page in such a case - in some tests,
2103                 * do_swap_page and try_to_unuse repeatedly compete.
2104                 */
2105                wait_on_page_locked(page);
2106                wait_on_page_writeback(page);
2107                lock_page(page);
2108                wait_on_page_writeback(page);
2109
2110                /*
2111                 * Remove all references to entry.
2112                 */
2113                swcount = *swap_map;
2114                if (swap_count(swcount) == SWAP_MAP_SHMEM) {
2115                        retval = shmem_unuse(entry, page);
2116                        /* page has already been unlocked and released */
2117                        if (retval < 0)
2118                                break;
2119                        continue;
2120                }
2121                if (swap_count(swcount) && start_mm != &init_mm)
2122                        retval = unuse_mm(start_mm, entry, page);
2123
2124                if (swap_count(*swap_map)) {
2125                        int set_start_mm = (*swap_map >= swcount);
2126                        struct list_head *p = &start_mm->mmlist;
2127                        struct mm_struct *new_start_mm = start_mm;
2128                        struct mm_struct *prev_mm = start_mm;
2129                        struct mm_struct *mm;
2130
2131                        mmget(new_start_mm);
2132                        mmget(prev_mm);
2133                        spin_lock(&mmlist_lock);
2134                        while (swap_count(*swap_map) && !retval &&
2135                                        (p = p->next) != &start_mm->mmlist) {
2136                                mm = list_entry(p, struct mm_struct, mmlist);
2137                                if (!mmget_not_zero(mm))
2138                                        continue;
2139                                spin_unlock(&mmlist_lock);
2140                                mmput(prev_mm);
2141                                prev_mm = mm;
2142
2143                                cond_resched();
2144
2145                                swcount = *swap_map;
2146                                if (!swap_count(swcount)) /* any usage ? */
2147                                        ;
2148                                else if (mm == &init_mm)
2149                                        set_start_mm = 1;
2150                                else
2151                                        retval = unuse_mm(mm, entry, page);
2152
2153                                if (set_start_mm && *swap_map < swcount) {
2154                                        mmput(new_start_mm);
2155                                        mmget(mm);
2156                                        new_start_mm = mm;
2157                                        set_start_mm = 0;
2158                                }
2159                                spin_lock(&mmlist_lock);
2160                        }
2161                        spin_unlock(&mmlist_lock);
2162                        mmput(prev_mm);
2163                        mmput(start_mm);
2164                        start_mm = new_start_mm;
2165                }
2166                if (retval) {
2167                        unlock_page(page);
2168                        put_page(page);
2169                        break;
2170                }
2171
2172                /*
2173                 * If a reference remains (rare), we would like to leave
2174                 * the page in the swap cache; but try_to_unmap could
2175                 * then re-duplicate the entry once we drop page lock,
2176                 * so we might loop indefinitely; also, that page could
2177                 * not be swapped out to other storage meanwhile.  So:
2178                 * delete from cache even if there's another reference,
2179                 * after ensuring that the data has been saved to disk -
2180                 * since if the reference remains (rarer), it will be
2181                 * read from disk into another page.  Splitting into two
2182                 * pages would be incorrect if swap supported "shared
2183                 * private" pages, but they are handled by tmpfs files.
2184                 *
2185                 * Given how unuse_vma() targets one particular offset
2186                 * in an anon_vma, once the anon_vma has been determined,
2187                 * this splitting happens to be just what is needed to
2188                 * handle where KSM pages have been swapped out: re-reading
2189                 * is unnecessarily slow, but we can fix that later on.
2190                 */
2191                if (swap_count(*swap_map) &&
2192                     PageDirty(page) && PageSwapCache(page)) {
2193                        struct writeback_control wbc = {
2194                                .sync_mode = WB_SYNC_NONE,
2195                        };
2196
2197                        swap_writepage(compound_head(page), &wbc);
2198                        lock_page(page);
2199                        wait_on_page_writeback(page);
2200                }
2201
2202                /*
2203                 * It is conceivable that a racing task removed this page from
2204                 * swap cache just before we acquired the page lock at the top,
2205                 * or while we dropped it in unuse_mm().  The page might even
2206                 * be back in swap cache on another swap area: that we must not
2207                 * delete, since it may not have been written out to swap yet.
2208                 */
2209                if (PageSwapCache(page) &&
2210                    likely(page_private(page) == entry.val) &&
2211                    !page_swapped(page))
2212                        delete_from_swap_cache(compound_head(page));
2213
2214                /*
2215                 * So that we could skip searching mms once the swap count went
2216                 * to 1, we did not mark any present ptes as dirty: we must
2217                 * mark the page dirty so shrink_page_list will preserve it.
2218                 */
2219                SetPageDirty(page);
2220                unlock_page(page);
2221                put_page(page);
2222
2223                /*
2224                 * Make sure that we aren't completely killing
2225                 * interactive performance.
2226                 */
2227                cond_resched();
2228                if (frontswap && pages_to_unuse > 0) {
2229                        if (!--pages_to_unuse)
2230                                break;
2231                }
2232        }
2233
2234        mmput(start_mm);
2235        return retval;
2236}
2237
2238/*
2239 * After a successful try_to_unuse, if no swap is now in use, we know
2240 * we can empty the mmlist.  swap_lock must be held on entry and exit.
2241 * Note that mmlist_lock nests inside swap_lock, and an mm must be
2242 * added to the mmlist just after swap_duplicate - before would be racy.
2243 */
2244static void drain_mmlist(void)
2245{
2246        struct list_head *p, *next;
2247        unsigned int type;
2248
2249        for (type = 0; type < nr_swapfiles; type++)
2250                if (swap_info[type]->inuse_pages)
2251                        return;
2252        spin_lock(&mmlist_lock);
2253        list_for_each_safe(p, next, &init_mm.mmlist)
2254                list_del_init(p);
2255        spin_unlock(&mmlist_lock);
2256}
2257
2258/*
2259 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
2260 * corresponds to page offset for the specified swap entry.
2261 * Note that the return type of this function is sector_t, but it returns a page
2262 * offset into the bdev, not a sector offset.
2263 */
2264static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
2265{
2266        struct swap_info_struct *sis;
2267        struct swap_extent *start_se;
2268        struct swap_extent *se;
2269        pgoff_t offset;
2270
2271        sis = swap_info[swp_type(entry)];
2272        *bdev = sis->bdev;
2273
2274        offset = swp_offset(entry);
2275        start_se = sis->curr_swap_extent;
2276        se = start_se;
2277
2278        for ( ; ; ) {
2279                if (se->start_page <= offset &&
2280                                offset < (se->start_page + se->nr_pages)) {
2281                        return se->start_block + (offset - se->start_page);
2282                }
2283                se = list_next_entry(se, list);
2284                sis->curr_swap_extent = se;
2285                BUG_ON(se == start_se);         /* It *must* be present */
2286        }
2287}
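
/*
 * Illustrative sketch (not part of swapfile.c): the extent arithmetic used by
 * map_swap_entry() above, on made-up numbers.  An extent with start_page = 96,
 * nr_pages = 64 and start_block = 10240 covers swap offsets 96..159, so
 * offset 100 maps to block 10240 + (100 - 96) = 10244.  The struct and
 * function names below are local to this example only.
 */
#include <assert.h>
#include <stddef.h>

struct demo_extent {
        unsigned long start_page;
        unsigned long nr_pages;
        unsigned long long start_block;
};

/* Linear lookup over a small array of extents sorted by start_page. */
static unsigned long long demo_map_offset(const struct demo_extent *ext,
                                          size_t n, unsigned long offset)
{
        size_t i;

        for (i = 0; i < n; i++) {
                if (ext[i].start_page <= offset &&
                    offset < ext[i].start_page + ext[i].nr_pages)
                        return ext[i].start_block + (offset - ext[i].start_page);
        }
        assert(0);      /* a valid swap offset is always covered by an extent */
        return 0;
}

int main(void)
{
        static const struct demo_extent map[] = {
                { .start_page = 0,  .nr_pages = 96, .start_block = 2048 },
                { .start_page = 96, .nr_pages = 64, .start_block = 10240 },
        };

        assert(demo_map_offset(map, 2, 100) == 10244);
        return 0;
}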
2288
2289/*
2290 * Returns the page offset into bdev for the specified page's swap entry.
2291 */
2292sector_t map_swap_page(struct page *page, struct block_device **bdev)
2293{
2294        swp_entry_t entry;
2295        entry.val = page_private(page);
2296        return map_swap_entry(entry, bdev);
2297}
2298
2299/*
2300 * Free all of a swapdev's extent information
2301 */
2302static void destroy_swap_extents(struct swap_info_struct *sis)
2303{
2304        while (!list_empty(&sis->first_swap_extent.list)) {
2305                struct swap_extent *se;
2306
2307                se = list_first_entry(&sis->first_swap_extent.list,
2308                                struct swap_extent, list);
2309                list_del(&se->list);
2310                kfree(se);
2311        }
2312
2313        if (sis->flags & SWP_FILE) {
2314                struct file *swap_file = sis->swap_file;
2315                struct address_space *mapping = swap_file->f_mapping;
2316
2317                sis->flags &= ~SWP_FILE;
2318                mapping->a_ops->swap_deactivate(swap_file);
2319        }
2320}
2321
2322/*
2323 * Add a block range (and the corresponding page range) into this swapdev's
2324 * extent list.  The extent list is kept sorted in page order.
2325 *
2326 * This function rather assumes that it is called in ascending page order.
2327 */
2328int
2329add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
2330                unsigned long nr_pages, sector_t start_block)
2331{
2332        struct swap_extent *se;
2333        struct swap_extent *new_se;
2334        struct list_head *lh;
2335
2336        if (start_page == 0) {
2337                se = &sis->first_swap_extent;
2338                sis->curr_swap_extent = se;
2339                se->start_page = 0;
2340                se->nr_pages = nr_pages;
2341                se->start_block = start_block;
2342                return 1;
2343        } else {
2344                lh = sis->first_swap_extent.list.prev;  /* Highest extent */
2345                se = list_entry(lh, struct swap_extent, list);
2346                BUG_ON(se->start_page + se->nr_pages != start_page);
2347                if (se->start_block + se->nr_pages == start_block) {
2348                        /* Merge it */
2349                        se->nr_pages += nr_pages;
2350                        return 0;
2351                }
2352        }
2353
2354        /*
2355         * No merge.  Insert a new extent, preserving ordering.
2356         */
2357        new_se = kmalloc(sizeof(*se), GFP_KERNEL);
2358        if (new_se == NULL)
2359                return -ENOMEM;
2360        new_se->start_page = start_page;
2361        new_se->nr_pages = nr_pages;
2362        new_se->start_block = start_block;
2363
2364        list_add_tail(&new_se->list, &sis->first_swap_extent.list);
2365        return 1;
2366}
2367
2368/*
2369 * A `swap extent' is a simple thing which maps a contiguous range of pages
2370 * onto a contiguous range of disk blocks.  An ordered list of swap extents
2371 * is built at swapon time and is then used at swap_writepage/swap_readpage
2372 * time for locating where on disk a page belongs.
2373 *
2374 * If the swapfile is an S_ISBLK block device, a single extent is installed.
2375 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
2376 * swap files identically.
2377 *
2378 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
2379 * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
2380 * swapfiles are handled *identically* after swapon time.
2381 *
2382 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
2383 * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
2384 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
2385 * requirements, they are simply tossed out - we will never use those blocks
2386 * for swapping.
2387 *
2388 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon.  This
2389 * prevents root from shooting her foot off by ftruncating an in-use swapfile,
2390 * which will scribble on the fs.
2391 *
2392 * The amount of disk space which a single swap extent represents varies.
2393 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
2394 * extents in the list.  To avoid much list walking, we cache the previous
2395 * search location in `curr_swap_extent', and start new searches from there.
2396 * This is extremely effective.  The average number of iterations in
2397 * map_swap_page() has been measured at about 0.3 per page.  - akpm.
2398 */
2399static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
2400{
2401        struct file *swap_file = sis->swap_file;
2402        struct address_space *mapping = swap_file->f_mapping;
2403        struct inode *inode = mapping->host;
2404        int ret;
2405
2406        if (S_ISBLK(inode->i_mode)) {
2407                ret = add_swap_extent(sis, 0, sis->max, 0);
2408                *span = sis->pages;
2409                return ret;
2410        }
2411
2412        if (mapping->a_ops->swap_activate) {
2413                ret = mapping->a_ops->swap_activate(sis, swap_file, span);
2414                if (!ret) {
2415                        sis->flags |= SWP_FILE;
2416                        ret = add_swap_extent(sis, 0, sis->max, 0);
2417                        *span = sis->pages;
2418                }
2419                return ret;
2420        }
2421
2422        return generic_swapfile_activate(sis, swap_file, span);
2423}
2424
2425static int swap_node(struct swap_info_struct *p)
2426{
2427        struct block_device *bdev;
2428
2429        if (p->bdev)
2430                bdev = p->bdev;
2431        else
2432                bdev = p->swap_file->f_inode->i_sb->s_bdev;
2433
2434        return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
2435}
2436
2437static void _enable_swap_info(struct swap_info_struct *p, int prio,
2438                                unsigned char *swap_map,
2439                                struct swap_cluster_info *cluster_info)
2440{
2441        int i;
2442
2443        if (prio >= 0)
2444                p->prio = prio;
2445        else
2446                p->prio = --least_priority;
2447        /*
2448         * the plist prio is negated because plist ordering is
2449         * low-to-high, while swap ordering is high-to-low
2450         */
2451        p->list.prio = -p->prio;
2452        for_each_node(i) {
2453                if (p->prio >= 0)
2454                        p->avail_lists[i].prio = -p->prio;
2455                else {
2456                        if (swap_node(p) == i)
2457                                p->avail_lists[i].prio = 1;
2458                        else
2459                                p->avail_lists[i].prio = -p->prio;
2460                }
2461        }
2462        p->swap_map = swap_map;
2463        p->cluster_info = cluster_info;
2464        p->flags |= SWP_WRITEOK;
2465        atomic_long_add(p->pages, &nr_swap_pages);
2466        total_swap_pages += p->pages;
2467
2468        assert_spin_locked(&swap_lock);
2469        /*
2470         * both lists are plists, and thus priority ordered.
2471         * swap_active_head needs to be priority ordered for swapoff(),
2472         * which on removal of any swap_info_struct with an auto-assigned
2473         * (i.e. negative) priority increments the auto-assigned priority
2474         * of any lower-priority swap_info_structs.
2475         * swap_avail_head needs to be priority ordered for get_swap_page(),
2476         * which allocates swap pages from the highest available priority
2477         * swap_info_struct.
2478         */
2479        plist_add(&p->list, &swap_active_head);
2480        add_to_avail_list(p);
2481}
2482
2483static void enable_swap_info(struct swap_info_struct *p, int prio,
2484                                unsigned char *swap_map,
2485                                struct swap_cluster_info *cluster_info,
2486                                unsigned long *frontswap_map)
2487{
2488        frontswap_init(p->type, frontswap_map);
2489        spin_lock(&swap_lock);
2490        spin_lock(&p->lock);
2491        _enable_swap_info(p, prio, swap_map, cluster_info);
2492        spin_unlock(&p->lock);
2493        spin_unlock(&swap_lock);
2494}
2495
2496static void reinsert_swap_info(struct swap_info_struct *p)
2497{
2498        spin_lock(&swap_lock);
2499        spin_lock(&p->lock);
2500        _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
2501        spin_unlock(&p->lock);
2502        spin_unlock(&swap_lock);
2503}
2504
2505bool has_usable_swap(void)
2506{
2507        bool ret = true;
2508
2509        spin_lock(&swap_lock);
2510        if (plist_head_empty(&swap_active_head))
2511                ret = false;
2512        spin_unlock(&swap_lock);
2513        return ret;
2514}
2515
2516SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2517{
2518        struct swap_info_struct *p = NULL;
2519        unsigned char *swap_map;
2520        struct swap_cluster_info *cluster_info;
2521        unsigned long *frontswap_map;
2522        struct file *swap_file, *victim;
2523        struct address_space *mapping;
2524        struct inode *inode;
2525        struct filename *pathname;
2526        int err, found = 0;
2527        unsigned int old_block_size;
2528
2529        if (!capable(CAP_SYS_ADMIN))
2530                return -EPERM;
2531
2532        BUG_ON(!current->mm);
2533
2534        pathname = getname(specialfile);
2535        if (IS_ERR(pathname))
2536                return PTR_ERR(pathname);
2537
2538        victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
2539        err = PTR_ERR(victim);
2540        if (IS_ERR(victim))
2541                goto out;
2542
2543        mapping = victim->f_mapping;
2544        spin_lock(&swap_lock);
2545        plist_for_each_entry(p, &swap_active_head, list) {
2546                if (p->flags & SWP_WRITEOK) {
2547                        if (p->swap_file->f_mapping == mapping) {
2548                                found = 1;
2549                                break;
2550                        }
2551                }
2552        }
2553        if (!found) {
2554                err = -EINVAL;
2555                spin_unlock(&swap_lock);
2556                goto out_dput;
2557        }
2558        if (!security_vm_enough_memory_mm(current->mm, p->pages))
2559                vm_unacct_memory(p->pages);
2560        else {
2561                err = -ENOMEM;
2562                spin_unlock(&swap_lock);
2563                goto out_dput;
2564        }
2565        del_from_avail_list(p);
2566        spin_lock(&p->lock);
2567        if (p->prio < 0) {
2568                struct swap_info_struct *si = p;
2569                int nid;
2570
2571                plist_for_each_entry_continue(si, &swap_active_head, list) {
2572                        si->prio++;
2573                        si->list.prio--;
2574                        for_each_node(nid) {
2575                                if (si->avail_lists[nid].prio != 1)
2576                                        si->avail_lists[nid].prio--;
2577                        }
2578                }
2579                least_priority++;
2580        }
2581        plist_del(&p->list, &swap_active_head);
2582        atomic_long_sub(p->pages, &nr_swap_pages);
2583        total_swap_pages -= p->pages;
2584        p->flags &= ~SWP_WRITEOK;
2585        spin_unlock(&p->lock);
2586        spin_unlock(&swap_lock);
2587
2588        disable_swap_slots_cache_lock();
2589
2590        set_current_oom_origin();
2591        err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
2592        clear_current_oom_origin();
2593
2594        if (err) {
2595                /* re-insert swap space into swap_active_head */
2596                reinsert_swap_info(p);
2597                reenable_swap_slots_cache_unlock();
2598                goto out_dput;
2599        }
2600
2601        reenable_swap_slots_cache_unlock();
2602
2603        flush_work(&p->discard_work);
2604
2605        destroy_swap_extents(p);
2606        if (p->flags & SWP_CONTINUED)
2607                free_swap_count_continuations(p);
2608
2609        if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
2610                atomic_dec(&nr_rotate_swap);
2611
2612        mutex_lock(&swapon_mutex);
2613        spin_lock(&swap_lock);
2614        spin_lock(&p->lock);
2615        drain_mmlist();
2616
2617        /* wait for anyone still in scan_swap_map */
2618        p->highest_bit = 0;             /* cuts scans short */
2619        while (p->flags >= SWP_SCANNING) {
2620                spin_unlock(&p->lock);
2621                spin_unlock(&swap_lock);
2622                schedule_timeout_uninterruptible(1);
2623                spin_lock(&swap_lock);
2624                spin_lock(&p->lock);
2625        }
2626
2627        swap_file = p->swap_file;
2628        old_block_size = p->old_block_size;
2629        p->swap_file = NULL;
2630        p->max = 0;
2631        swap_map = p->swap_map;
2632        p->swap_map = NULL;
2633        cluster_info = p->cluster_info;
2634        p->cluster_info = NULL;
2635        frontswap_map = frontswap_map_get(p);
2636        spin_unlock(&p->lock);
2637        spin_unlock(&swap_lock);
2638        frontswap_invalidate_area(p->type);
2639        frontswap_map_set(p, NULL);
2640        mutex_unlock(&swapon_mutex);
2641        free_percpu(p->percpu_cluster);
2642        p->percpu_cluster = NULL;
2643        vfree(swap_map);
2644        kvfree(cluster_info);
2645        kvfree(frontswap_map);
2646        /* Destroy swap account information */
2647        swap_cgroup_swapoff(p->type);
2648        exit_swap_address_space(p->type);
2649
2650        inode = mapping->host;
2651        if (S_ISBLK(inode->i_mode)) {
2652                struct block_device *bdev = I_BDEV(inode);
2653                set_blocksize(bdev, old_block_size);
2654                blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2655        } else {
2656                inode_lock(inode);
2657                inode->i_flags &= ~S_SWAPFILE;
2658                inode_unlock(inode);
2659        }
2660        filp_close(swap_file, NULL);
2661
2662        /*
2663         * Clear the SWP_USED flag after all resources are freed so that swapon
2664         * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
2665         * not hold p->lock after we cleared its SWP_WRITEOK.
2666         */
2667        spin_lock(&swap_lock);
2668        p->flags = 0;
2669        spin_unlock(&swap_lock);
2670
2671        err = 0;
2672        atomic_inc(&proc_poll_event);
2673        wake_up_interruptible(&proc_poll_wait);
2674
2675out_dput:
2676        filp_close(victim, NULL);
2677out:
2678        putname(pathname);
2679        return err;
2680}
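
/*
 * Illustrative userspace sketch (not part of swapfile.c): the syscall defined
 * above is normally reached through the libc swapoff() wrapper declared in
 * <sys/swap.h>.  It needs CAP_SYS_ADMIN, and the errno values mirror the
 * error paths of sys_swapoff() (EPERM, EINVAL for a path that is not an
 * active swap area, ENOMEM if the pages cannot be accounted back).
 */
#include <stdio.h>
#include <sys/swap.h>

int main(int argc, char **argv)
{
        if (argc != 2) {
                fprintf(stderr, "usage: %s <swap device or file>\n", argv[0]);
                return 1;
        }
        if (swapoff(argv[1]) != 0) {
                perror("swapoff");
                return 1;
        }
        return 0;
}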
2681
2682#ifdef CONFIG_PROC_FS
2683static __poll_t swaps_poll(struct file *file, poll_table *wait)
2684{
2685        struct seq_file *seq = file->private_data;
2686
2687        poll_wait(file, &proc_poll_wait, wait);
2688
2689        if (seq->poll_event != atomic_read(&proc_poll_event)) {
2690                seq->poll_event = atomic_read(&proc_poll_event);
2691                return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
2692        }
2693
2694        return EPOLLIN | EPOLLRDNORM;
2695}
2696
2697/* iterator */
2698static void *swap_start(struct seq_file *swap, loff_t *pos)
2699{
2700        struct swap_info_struct *si;
2701        int type;
2702        loff_t l = *pos;
2703
2704        mutex_lock(&swapon_mutex);
2705
2706        if (!l)
2707                return SEQ_START_TOKEN;
2708
2709        for (type = 0; type < nr_swapfiles; type++) {
2710                smp_rmb();      /* read nr_swapfiles before swap_info[type] */
2711                si = swap_info[type];
2712                if (!(si->flags & SWP_USED) || !si->swap_map)
2713                        continue;
2714                if (!--l)
2715                        return si;
2716        }
2717
2718        return NULL;
2719}
2720
2721static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
2722{
2723        struct swap_info_struct *si = v;
2724        int type;
2725
2726        if (v == SEQ_START_TOKEN)
2727                type = 0;
2728        else
2729                type = si->type + 1;
2730
2731        for (; type < nr_swapfiles; type++) {
2732                smp_rmb();      /* read nr_swapfiles before swap_info[type] */
2733                si = swap_info[type];
2734                if (!(si->flags & SWP_USED) || !si->swap_map)
2735                        continue;
2736                ++*pos;
2737                return si;
2738        }
2739
2740        return NULL;
2741}
2742
2743static void swap_stop(struct seq_file *swap, void *v)
2744{
2745        mutex_unlock(&swapon_mutex);
2746}
2747
2748static int swap_show(struct seq_file *swap, void *v)
2749{
2750        struct swap_info_struct *si = v;
2751        struct file *file;
2752        int len;
2753
2754        if (si == SEQ_START_TOKEN) {
2755                seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
2756                return 0;
2757        }
2758
2759        file = si->swap_file;
2760        len = seq_file_path(swap, file, " \t\n\\");
2761        seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
2762                        len < 40 ? 40 - len : 1, " ",
2763                        S_ISBLK(file_inode(file)->i_mode) ?
2764                                "partition" : "file\t",
2765                        si->pages << (PAGE_SHIFT - 10),
2766                        si->inuse_pages << (PAGE_SHIFT - 10),
2767                        si->prio);
2768        return 0;
2769}
2770
2771static const struct seq_operations swaps_op = {
2772        .start =        swap_start,
2773        .next =         swap_next,
2774        .stop =         swap_stop,
2775        .show =         swap_show
2776};
2777
2778static int swaps_open(struct inode *inode, struct file *file)
2779{
2780        struct seq_file *seq;
2781        int ret;
2782
2783        ret = seq_open(file, &swaps_op);
2784        if (ret)
2785                return ret;
2786
2787        seq = file->private_data;
2788        seq->poll_event = atomic_read(&proc_poll_event);
2789        return 0;
2790}
2791
2792static const struct file_operations proc_swaps_operations = {
2793        .open           = swaps_open,
2794        .read           = seq_read,
2795        .llseek         = seq_lseek,
2796        .release        = seq_release,
2797        .poll           = swaps_poll,
2798};
2799
2800static int __init procswaps_init(void)
2801{
2802        proc_create("swaps", 0, NULL, &proc_swaps_operations);
2803        return 0;
2804}
2805__initcall(procswaps_init);
2806#endif /* CONFIG_PROC_FS */
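
/*
 * Illustrative userspace sketch (not part of swapfile.c): swaps_poll() above
 * reports EPOLLERR | EPOLLPRI on /proc/swaps whenever proc_poll_event changes
 * (i.e. on every swapon or swapoff), so a monitor can sleep in poll(2) and
 * re-read the file only when the set of swap areas actually changes.
 */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        struct pollfd pfd;
        ssize_t n;

        pfd.fd = open("/proc/swaps", O_RDONLY);
        if (pfd.fd < 0) {
                perror("/proc/swaps");
                return 1;
        }
        pfd.events = POLLPRI;   /* swapon/swapoff is signalled as PRI/ERR */

        for (;;) {
                /* Dump the current table, then wait for the next change. */
                lseek(pfd.fd, 0, SEEK_SET);
                while ((n = read(pfd.fd, buf, sizeof(buf))) > 0)
                        fwrite(buf, 1, (size_t)n, stdout);
                fflush(stdout);
                if (poll(&pfd, 1, -1) < 0) {
                        perror("poll");
                        return 1;
                }
        }
}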
2807
2808#ifdef MAX_SWAPFILES_CHECK
2809static int __init max_swapfiles_check(void)
2810{
2811        MAX_SWAPFILES_CHECK();
2812        return 0;
2813}
2814late_initcall(max_swapfiles_check);
2815#endif
2816
2817static struct swap_info_struct *alloc_swap_info(void)
2818{
2819        struct swap_info_struct *p;
2820        unsigned int type;
2821        int i;
2822
2823        p = kzalloc(sizeof(*p), GFP_KERNEL);
2824        if (!p)
2825                return ERR_PTR(-ENOMEM);
2826
2827        spin_lock(&swap_lock);
2828        for (type = 0; type < nr_swapfiles; type++) {
2829                if (!(swap_info[type]->flags & SWP_USED))
2830                        break;
2831        }
2832        if (type >= MAX_SWAPFILES) {
2833                spin_unlock(&swap_lock);
2834                kfree(p);
2835                return ERR_PTR(-EPERM);
2836        }
2837        if (type >= nr_swapfiles) {
2838                p->type = type;
2839                swap_info[type] = p;
2840                /*
2841                 * Write swap_info[type] before nr_swapfiles, in case a
2842                 * racing procfs swap_start() or swap_next() is reading them.
2843                 * (We never shrink nr_swapfiles, we never free this entry.)
2844                 */
2845                smp_wmb();
2846                nr_swapfiles++;
2847        } else {
2848                kfree(p);
2849                p = swap_info[type];
2850                /*
2851                 * Do not memset this entry: a racing procfs swap_next()
2852                 * would be relying on p->type to remain valid.
2853                 */
2854        }
2855        INIT_LIST_HEAD(&p->first_swap_extent.list);
2856        plist_node_init(&p->list, 0);
2857        for_each_node(i)
2858                plist_node_init(&p->avail_lists[i], 0);
2859        p->flags = SWP_USED;
2860        spin_unlock(&swap_lock);
2861        spin_lock_init(&p->lock);
2862        spin_lock_init(&p->cont_lock);
2863
2864        return p;
2865}
2866
2867static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
2868{
2869        int error;
2870
2871        if (S_ISBLK(inode->i_mode)) {
2872                p->bdev = bdgrab(I_BDEV(inode));
2873                error = blkdev_get(p->bdev,
2874                                   FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
2875                if (error < 0) {
2876                        p->bdev = NULL;
2877                        return error;
2878                }
2879                p->old_block_size = block_size(p->bdev);
2880                error = set_blocksize(p->bdev, PAGE_SIZE);
2881                if (error < 0)
2882                        return error;
2883                p->flags |= SWP_BLKDEV;
2884        } else if (S_ISREG(inode->i_mode)) {
2885                p->bdev = inode->i_sb->s_bdev;
2886                inode_lock(inode);
2887                if (IS_SWAPFILE(inode))
2888                        return -EBUSY;
2889        } else
2890                return -EINVAL;
2891
2892        return 0;
2893}
2894
2895
2896/*
2897 * Find out how many pages are allowed for a single swap device. There
2898 * are two limiting factors:
2899 * 1) the number of bits for the swap offset in the swp_entry_t type, and
2900 * 2) the number of bits in the swap pte, as defined by the different
2901 * architectures.
2902 *
2903 * In order to find the largest possible bit mask, a swap entry with
2904 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
2905 * decoded to a swp_entry_t again, and finally the swap offset is
2906 * extracted.
2907 *
2908 * This will mask all the bits from the initial ~0UL mask that can't
2909 * be encoded in either the swp_entry_t or the architecture definition
2910 * of a swap pte.
2911 */
2912unsigned long generic_max_swapfile_size(void)
2913{
2914        return swp_offset(pte_to_swp_entry(
2915                        swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
2916}
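
/*
 * Illustrative sketch (not part of swapfile.c): the round trip performed by
 * generic_max_swapfile_size() above, with a made-up swap pte layout that can
 * hold only DEMO_OFFSET_BITS of the offset.  Real architectures define the
 * encoding in their pgtable headers; every constant and helper here is an
 * invention for the example.
 */
#include <assert.h>

#define DEMO_OFFSET_BITS        27
#define DEMO_OFFSET_MASK        ((1UL << DEMO_OFFSET_BITS) - 1)

/* Model of swp_entry_to_pte(): drop the offset bits the "pte" cannot hold. */
static unsigned long demo_entry_to_pte(unsigned long offset)
{
        return offset & DEMO_OFFSET_MASK;
}

/* Model of pte_to_swp_entry() + swp_offset(): recover whatever survived. */
static unsigned long demo_pte_to_offset(unsigned long pte)
{
        return pte & DEMO_OFFSET_MASK;
}

int main(void)
{
        /* Feed in an all-ones offset and see how much of it round-trips. */
        unsigned long max = demo_pte_to_offset(demo_entry_to_pte(~0UL)) + 1;

        assert(max == 1UL << DEMO_OFFSET_BITS); /* 2^27 pages, 512GB with 4K pages */
        return 0;
}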
2917
2918/* Can be overridden by an architecture for additional checks. */
2919__weak unsigned long max_swapfile_size(void)
2920{
2921        return generic_max_swapfile_size();
2922}
2923
2924static unsigned long read_swap_header(struct swap_info_struct *p,
2925                                        union swap_header *swap_header,
2926                                        struct inode *inode)
2927{
2928        int i;
2929        unsigned long maxpages;
2930        unsigned long swapfilepages;
2931        unsigned long last_page;
2932
2933        if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
2934                pr_err("Unable to find swap-space signature\n");
2935                return 0;
2936        }
2937
2938        /* swap partition endianness hack... */
2939        if (swab32(swap_header->info.version) == 1) {
2940                swab32s(&swap_header->info.version);
2941                swab32s(&swap_header->info.last_page);
2942                swab32s(&swap_header->info.nr_badpages);
2943                if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2944                        return 0;
2945                for (i = 0; i < swap_header->info.nr_badpages; i++)
2946                        swab32s(&swap_header->info.badpages[i]);
2947        }
2948        /* Check the swap header's sub-version */
2949        if (swap_header->info.version != 1) {
2950                pr_warn("Unable to handle swap header version %d\n",
2951                        swap_header->info.version);
2952                return 0;
2953        }
2954
2955        p->lowest_bit  = 1;
2956        p->cluster_next = 1;
2957        p->cluster_nr = 0;
2958
2959        maxpages = max_swapfile_size();
2960        last_page = swap_header->info.last_page;
2961        if (!last_page) {
2962                pr_warn("Empty swap-file\n");
2963                return 0;
2964        }
2965        if (last_page > maxpages) {
2966                pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
2967                        maxpages << (PAGE_SHIFT - 10),
2968                        last_page << (PAGE_SHIFT - 10));
2969        }
2970        if (maxpages > last_page) {
2971                maxpages = last_page + 1;
2972                /* p->max is an unsigned int: don't overflow it */
2973                if ((unsigned int)maxpages == 0)
2974                        maxpages = UINT_MAX;
2975        }
2976        p->highest_bit = maxpages - 1;
2977
2978        if (!maxpages)
2979                return 0;
2980        swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
2981        if (swapfilepages && maxpages > swapfilepages) {
2982                pr_warn("Swap area shorter than signature indicates\n");
2983                return 0;
2984        }
2985        if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
2986                return 0;
2987        if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2988                return 0;
2989
2990        return maxpages;
2991}
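
/*
 * Illustrative userspace sketch (not part of swapfile.c): peeking at the
 * on-disk header that read_swap_header() above validates.  It assumes the
 * usual mkswap layout: 1024 bytes of bootbits, then 32-bit version,
 * last_page and nr_badpages fields, with the "SWAPSPACE2" magic in the last
 * ten bytes of the first page.  A 4096-byte page size is assumed here (the
 * kernel uses PAGE_SIZE), and the fields are read in the byte order of the
 * machine that ran mkswap, which is exactly why the endianness hack above
 * exists.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define DEMO_PAGE_SIZE  4096

int main(int argc, char **argv)
{
        unsigned char page[DEMO_PAGE_SIZE];
        uint32_t version, last_page, nr_badpages;
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <swap device or file>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || read(fd, page, sizeof(page)) != sizeof(page)) {
                perror(argv[1]);
                return 1;
        }
        if (memcmp(page + DEMO_PAGE_SIZE - 10, "SWAPSPACE2", 10) != 0) {
                fprintf(stderr, "no swap signature found\n");
                return 1;
        }
        memcpy(&version, page + 1024, sizeof(version));
        memcpy(&last_page, page + 1028, sizeof(last_page));
        memcpy(&nr_badpages, page + 1032, sizeof(nr_badpages));
        printf("version %u, last_page %u, nr_badpages %u\n",
               version, last_page, nr_badpages);
        close(fd);
        return 0;
}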
2992
2993#define SWAP_CLUSTER_INFO_COLS                                          \
2994        DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
2995#define SWAP_CLUSTER_SPACE_COLS                                         \
2996        DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
2997#define SWAP_CLUSTER_COLS                                               \
2998        max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
2999
3000static int setup_swap_map_and_extents(struct swap_info_struct *p,
3001                                        union swap_header *swap_header,
3002                                        unsigned char *swap_map,
3003                                        struct swap_cluster_info *cluster_info,
3004                                        unsigned long maxpages,
3005                                        sector_t *span)
3006{
3007        unsigned int j, k;
3008        unsigned int nr_good_pages;
3009        int nr_extents;
3010        unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
3011        unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
3012        unsigned long i, idx;
3013
3014        nr_good_pages = maxpages - 1;   /* omit header page */
3015
3016        cluster_list_init(&p->free_clusters);
3017        cluster_list_init(&p->discard_clusters);
3018
3019        for (i = 0; i < swap_header->info.nr_badpages; i++) {
3020                unsigned int page_nr = swap_header->info.badpages[i];
3021                if (page_nr == 0 || page_nr > swap_header->info.last_page)
3022                        return -EINVAL;
3023                if (page_nr < maxpages) {
3024                        swap_map[page_nr] = SWAP_MAP_BAD;
3025                        nr_good_pages--;
3026                        /*
3027                         * Haven't marked the cluster free yet, no list
3028                         * operation involved
3029                         */
3030                        inc_cluster_info_page(p, cluster_info, page_nr);
3031                }
3032        }
3033
3034        /* Haven't marked the cluster free yet, no list operation involved */
3035        for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
3036                inc_cluster_info_page(p, cluster_info, i);
3037
3038        if (nr_good_pages) {
3039                swap_map[0] = SWAP_MAP_BAD;
3040                /*
3041                 * Haven't marked the cluster free yet, no list
3042                 * operation involved
3043                 */
3044                inc_cluster_info_page(p, cluster_info, 0);
3045                p->max = maxpages;
3046                p->pages = nr_good_pages;
3047                nr_extents = setup_swap_extents(p, span);
3048                if (nr_extents < 0)
3049                        return nr_extents;
3050                nr_good_pages = p->pages;
3051        }
3052        if (!nr_good_pages) {
3053                pr_warn("Empty swap-file\n");
3054                return -EINVAL;
3055        }
3056
3057        if (!cluster_info)
3058                return nr_extents;
3059
3060
3061        /*
3062         * Interleave the free cluster list by column to reduce false cache
3063         * line sharing between cluster_info entries and between clusters
3064         * of the same swap address space.  See the worked example below.
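        /*
         * Worked example (illustrative numbers only, pretending
         * SWAP_CLUSTER_COLS were 4): with nr_clusters == 10 and col == 1,
         * the loops below visit cluster indexes in the order
         * 1, 5, 9, 2, 6, 3, 7, 0, 4, 8 -- column by column starting at the
         * column of cluster_next -- skipping any cluster whose count is
         * already nonzero, such as the one holding the header page.
         */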
3065        for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
3066                j = (k + col) % SWAP_CLUSTER_COLS;
3067                for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
3068                        idx = i * SWAP_CLUSTER_COLS + j;
3069                        if (idx >= nr_clusters)
3070                                continue;
3071                        if (cluster_count(&cluster_info[idx]))
3072                                continue;
3073                        cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
3074                        cluster_list_add_tail(&p->free_clusters, cluster_info,
3075                                              idx);
3076                }
3077        }
3078        return nr_extents;
3079}
3080
3081/*
3082 * Helper for sys_swapon to determine whether the request queue of a given
3083 * swap backing device supports DISCARD operations.
3084 */
3085static bool swap_discardable(struct swap_info_struct *si)
3086{
3087        struct request_queue *q = bdev_get_queue(si->bdev);
3088
3089        if (!q || !blk_queue_discard(q))
3090                return false;
3091
3092        return true;
3093}
3094
3095SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
3096{
3097        struct swap_info_struct *p;
3098        struct filename *name;
3099        struct file *swap_file = NULL;
3100        struct address_space *mapping;
3101        int prio;
3102        int error;
3103        union swap_header *swap_header;
3104        int nr_extents;
3105        sector_t span;
3106        unsigned long maxpages;
3107        unsigned char *swap_map = NULL;
3108        struct swap_cluster_info *cluster_info = NULL;
3109        unsigned long *frontswap_map = NULL;
3110        struct page *page = NULL;
3111        struct inode *inode = NULL;
3112        bool inced_nr_rotate_swap = false;
3113
3114        if (swap_flags & ~SWAP_FLAGS_VALID)
3115                return -EINVAL;
3116
3117        if (!capable(CAP_SYS_ADMIN))
3118                return -EPERM;
3119
3120        if (!swap_avail_heads)
3121                return -ENOMEM;
3122
3123        p = alloc_swap_info();
3124        if (IS_ERR(p))
3125                return PTR_ERR(p);
3126
3127        INIT_WORK(&p->discard_work, swap_discard_work);
3128
3129        name = getname(specialfile);
3130        if (IS_ERR(name)) {
3131                error = PTR_ERR(name);
3132                name = NULL;
3133                goto bad_swap;
3134        }
3135        swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
3136        if (IS_ERR(swap_file)) {
3137                error = PTR_ERR(swap_file);
3138                swap_file = NULL;
3139                goto bad_swap;
3140        }
3141
3142        p->swap_file = swap_file;
3143        mapping = swap_file->f_mapping;
3144        inode = mapping->host;
3145
3146        /* If S_ISREG(inode->i_mode), claim_swapfile() takes inode_lock(inode) */
3147        error = claim_swapfile(p, inode);
3148        if (unlikely(error))
3149                goto bad_swap;
3150
3151        /*
3152         * Read the swap header.
3153         */
3154        if (!mapping->a_ops->readpage) {
3155                error = -EINVAL;
3156                goto bad_swap;
3157        }
3158        page = read_mapping_page(mapping, 0, swap_file);
3159        if (IS_ERR(page)) {
3160                error = PTR_ERR(page);
3161                goto bad_swap;
3162        }
3163        swap_header = kmap(page);
3164
3165        maxpages = read_swap_header(p, swap_header, inode);
3166        if (unlikely(!maxpages)) {
3167                error = -EINVAL;
3168                goto bad_swap;
3169        }
3170
3171        /* OK, set up the swap map and apply the bad block list */
3172        swap_map = vzalloc(maxpages);
3173        if (!swap_map) {
3174                error = -ENOMEM;
3175                goto bad_swap;
3176        }
3177
3178        if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
3179                p->flags |= SWP_STABLE_WRITES;
3180
3181        if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
3182                p->flags |= SWP_SYNCHRONOUS_IO;
3183
3184        if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
3185                int cpu;
3186                unsigned long ci, nr_cluster;
3187
3188                p->flags |= SWP_SOLIDSTATE;
3189                /*
3190                 * Select a random position to start from, to help SSD
3191                 * wear leveling.
3192                 */
3193                p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
3194                nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
3195
3196                cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
3197                                        GFP_KERNEL);
3198                if (!cluster_info) {
3199                        error = -ENOMEM;
3200                        goto bad_swap;
3201                }
3202
3203                for (ci = 0; ci < nr_cluster; ci++)
3204                        spin_lock_init(&((cluster_info + ci)->lock));
3205
3206                p->percpu_cluster = alloc_percpu(struct percpu_cluster);
3207                if (!p->percpu_cluster) {
3208                        error = -ENOMEM;
3209                        goto bad_swap;
3210                }
3211                for_each_possible_cpu(cpu) {
3212                        struct percpu_cluster *cluster;
3213                        cluster = per_cpu_ptr(p->percpu_cluster, cpu);
3214                        cluster_set_null(&cluster->index);
3215                }
3216        } else {
3217                atomic_inc(&nr_rotate_swap);
3218                inced_nr_rotate_swap = true;
3219        }
3220
3221        error = swap_cgroup_swapon(p->type, maxpages);
3222        if (error)
3223                goto bad_swap;
3224
3225        nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
3226                cluster_info, maxpages, &span);
3227        if (unlikely(nr_extents < 0)) {
3228                error = nr_extents;
3229                goto bad_swap;
3230        }
3231        /* frontswap enabled? set up bit-per-page map for frontswap */
3232        if (IS_ENABLED(CONFIG_FRONTSWAP))
3233                frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
3234                                         sizeof(long),
3235                                         GFP_KERNEL);
3236
3237        if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
3238                /*
3239                 * When discard is enabled for swap with no particular
3240                 * policy flagged, we set all swap discard flags here in
3241                 * order to sustain backward compatibility with older
3242                 * swapon(8) releases.
3243                 */
3244                p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
3245                             SWP_PAGE_DISCARD);
3246
3247                /*
3248                 * By passing flags to sys_swapon, a sysadmin can tell us to
3249                 * either do single-time area discards only, or to just
3250                 * perform discards for released swap page-clusters.
3251                 * Now it's time to adjust p->flags accordingly.
3252                 */
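                /*
                 * For example (illustrative; the flag values live in the UAPI
                 * <linux/swap.h>): a swapon(8) implementation asked for
                 * "--discard=pages" would typically pass
                 * SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_PAGES, leaving only
                 * SWP_PAGE_DISCARD set after the checks below.
                 */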
3253                if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
3254                        p->flags &= ~SWP_PAGE_DISCARD;
3255                else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
3256                        p->flags &= ~SWP_AREA_DISCARD;
3257
3258                /* issue a swapon-time discard if it's still required */
3259                if (p->flags & SWP_AREA_DISCARD) {
3260                        int err = discard_swap(p);
3261                        if (unlikely(err))
3262                                pr_err("swapon: discard_swap(%p): %d\n",
3263                                        p, err);
3264                }
3265        }
3266
3267        error = init_swap_address_space(p->type, maxpages);
3268        if (error)
3269                goto bad_swap;
3270
3271        mutex_lock(&swapon_mutex);
3272        prio = -1;
3273        if (swap_flags & SWAP_FLAG_PREFER)
3274                prio =
3275                  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
3276        enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
3277
3278        pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
3279                p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
3280                nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
3281                (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
3282                (p->flags & SWP_DISCARDABLE) ? "D" : "",
3283                (p->flags & SWP_AREA_DISCARD) ? "s" : "",
3284                (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
3285                (frontswap_map) ? "FS" : "");
3286
3287        mutex_unlock(&swapon_mutex);
3288        atomic_inc(&proc_poll_event);
3289        wake_up_interruptible(&proc_poll_wait);
3290
3291        if (S_ISREG(inode->i_mode))
3292                inode->i_flags |= S_SWAPFILE;
3293        error = 0;
3294        goto out;
3295bad_swap:
3296        free_percpu(p->percpu_cluster);
3297        p->percpu_cluster = NULL;
3298        if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
3299                set_blocksize(p->bdev, p->old_block_size);
3300                blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3301        }
3302        destroy_swap_extents(p);
3303        swap_cgroup_swapoff(p->type);
3304        spin_lock(&swap_lock);
3305        p->swap_file = NULL;
3306        p->flags = 0;
3307        spin_unlock(&swap_lock);
3308        vfree(swap_map);
3309        kvfree(cluster_info);
3310        kvfree(frontswap_map);
3311        if (inced_nr_rotate_swap)
3312                atomic_dec(&nr_rotate_swap);
3313        if (swap_file) {
3314                if (inode && S_ISREG(inode->i_mode)) {
3315                        inode_unlock(inode);
3316                        inode = NULL;
3317                }
3318                filp_close(swap_file, NULL);
3319        }
3320out:
3321        if (page && !IS_ERR(page)) {
3322                kunmap(page);
3323                put_page(page);
3324        }
3325        if (name)
3326                putname(name);
3327        if (inode && S_ISREG(inode->i_mode))
3328                inode_unlock(inode);
3329        if (!error)
3330                enable_swap_slots_cache();
3331        return error;
3332}
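
/*
 * Illustrative userspace counterpart (not kernel code): roughly what a
 * swapon(8)-style tool passes to the syscall above.  The priority value is
 * arbitrary, and the flag names are the UAPI ones mirrored in <sys/swap.h>.
 */
#if 0
#include <sys/swap.h>

static int enable_swap_with_priority(const char *path, int prio)
{
        return swapon(path, SWAP_FLAG_PREFER |
                            ((prio << SWAP_FLAG_PRIO_SHIFT) &
                             SWAP_FLAG_PRIO_MASK));
}
#endif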
3333
3334void si_swapinfo(struct sysinfo *val)
3335{
3336        unsigned int type;
3337        unsigned long nr_to_be_unused = 0;
3338
3339        spin_lock(&swap_lock);
3340        for (type = 0; type < nr_swapfiles; type++) {
3341                struct swap_info_struct *si = swap_info[type];
3342
3343                if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
3344                        nr_to_be_unused += si->inuse_pages;
3345        }
3346        val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
3347        val->totalswap = total_swap_pages + nr_to_be_unused;
3348        spin_unlock(&swap_lock);
3349}
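
/*
 * Illustrative userspace view of the numbers si_swapinfo() fills in (not
 * kernel code): sysinfo(2) reports them back in units of si.mem_unit bytes.
 */
#if 0
#include <stdio.h>
#include <sys/sysinfo.h>

static void print_swap_usage(void)
{
        struct sysinfo si;

        if (sysinfo(&si) == 0)
                printf("swap: %llu of %llu bytes free\n",
                       (unsigned long long)si.freeswap * si.mem_unit,
                       (unsigned long long)si.totalswap * si.mem_unit);
}
#endif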
3350
3351/*
3352 * Verify that a swap entry is valid and increment its swap map count.
3353 *
3354 * Return values:
3355 * - success -> 0
3356 * - swp_entry is invalid -> EINVAL
3357 * - swp_entry is a migration entry -> EINVAL
3358 * - a swap-cache reference is requested but there is already one -> EEXIST
3359 * - a swap-cache reference is requested but the entry is not used -> ENOENT
3360 * - a swap-mapped reference is requested but the count needs continuation -> ENOMEM
3361 */
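/*
 * For orientation (values as defined in <linux/swap.h> at the time of
 * writing, quoted from memory): each swap_map byte packs a reference count
 * in its low bits, SWAP_HAS_CACHE (0x40) when a swap cache page exists, and
 * COUNT_CONTINUED (0x80) when the count has overflowed into a continuation
 * page.  SWAP_MAP_MAX (0x3e) is the largest directly-representable count,
 * while SWAP_MAP_BAD (0x3f) and SWAP_MAP_SHMEM (0xbf) are special markers.
 */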
3362static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
3363{
3364        struct swap_info_struct *p;
3365        struct swap_cluster_info *ci;
3366        unsigned long offset, type;
3367        unsigned char count;
3368        unsigned char has_cache;
3369        int err = -EINVAL;
3370
3371        if (non_swap_entry(entry))
3372                goto out;
3373
3374        type = swp_type(entry);
3375        if (type >= nr_swapfiles)
3376                goto bad_file;
3377        p = swap_info[type];
3378        offset = swp_offset(entry);
3379        if (unlikely(offset >= p->max))
3380                goto out;
3381
3382        ci = lock_cluster_or_swap_info(p, offset);
3383
3384        count = p->swap_map[offset];
3385
3386        /*
3387         * swapin_readahead() doesn't check if a swap entry is valid, so the
3388         * swap entry could be SWAP_MAP_BAD. Check here with lock held.
3389         */
3390        if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
3391                err = -ENOENT;
3392                goto unlock_out;
3393        }
3394
3395        has_cache = count & SWAP_HAS_CACHE;
3396        count &= ~SWAP_HAS_CACHE;
3397        err = 0;
3398
3399        if (usage == SWAP_HAS_CACHE) {
3400
3401                /* set SWAP_HAS_CACHE if there is no cache and entry is used */
3402                if (!has_cache && count)
3403                        has_cache = SWAP_HAS_CACHE;
3404                else if (has_cache)             /* someone else added cache */
3405                        err = -EEXIST;
3406                else                            /* no users remaining */
3407                        err = -ENOENT;
3408
3409        } else if (count || has_cache) {
3410
3411                if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
3412                        count += usage;
3413                else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
3414                        err = -EINVAL;
3415                else if (swap_count_continued(p, offset, count))
3416                        count = COUNT_CONTINUED;
3417                else
3418                        err = -ENOMEM;
3419        } else
3420                err = -ENOENT;                  /* unused swap entry */
3421
3422        p->swap_map[offset] = count | has_cache;
3423
3424unlock_out:
3425        unlock_cluster_or_swap_info(p, ci);
3426out:
3427        return err;
3428
3429bad_file:
3430        pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
3431        goto out;
3432}
3433
3434/*
3435 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
3436 * (in which case its reference count is never incremented).
3437 */
3438void swap_shmem_alloc(swp_entry_t entry)
3439{
3440        __swap_duplicate(entry, SWAP_MAP_SHMEM);
3441}
3442
3443/*
3444 * Increase reference count of swap entry by 1.
3445 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
3446 * but could not be atomically allocated.  Returns 0, just as if it succeeded,
3447 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
3448 * might occur if a page table entry has got corrupted.
3449 */
3450int swap_duplicate(swp_entry_t entry)
3451{
3452        int err = 0;
3453
3454        while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
3455                err = add_swap_count_continuation(entry, GFP_ATOMIC);
3456        return err;
3457}
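
/*
 * A minimal sketch (illustrative, not taken from this file) of the retry
 * pattern described above, loosely modelled on the fork path: duplicate
 * under the page table lock, and on -ENOMEM drop the lock and retry the
 * continuation allocation with GFP_KERNEL.  "ptl" and the enclosing loop
 * are assumed context, not identifiers from this file.
 */
#if 0
        if (swap_duplicate(entry) < 0) {        /* only fails with -ENOMEM */
                spin_unlock(ptl);
                if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
                        return -ENOMEM;
                spin_lock(ptl);
                /* ... retry copying this pte ... */
        }
#endif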
3458
3459/*
3460 * @entry: swap entry for which we allocate swap cache.
3461 *
3462 * Called when allocating swap cache for an existing swap entry.
3463 * Returns 0 on success, or an error code otherwise;
3464 * -EEXIST means there is already a swap cache for this entry.
3465 * Note: the return codes differ from those of swap_duplicate().
3466 */
3467int swapcache_prepare(swp_entry_t entry)
3468{
3469        return __swap_duplicate(entry, SWAP_HAS_CACHE);
3470}
3471
3472struct swap_info_struct *swp_swap_info(swp_entry_t entry)
3473{
3474        return swap_info[swp_type(entry)];
3475}
3476
3477struct swap_info_struct *page_swap_info(struct page *page)
3478{
3479        swp_entry_t entry = { .val = page_private(page) };
3480        return swp_swap_info(entry);
3481}
3482
3483/*
3484 * out-of-line __page_file_ methods to avoid include hell.
3485 */
3486struct address_space *__page_file_mapping(struct page *page)
3487{
3488        return page_swap_info(page)->swap_file->f_mapping;
3489}
3490EXPORT_SYMBOL_GPL(__page_file_mapping);
3491
3492pgoff_t __page_file_index(struct page *page)
3493{
3494        swp_entry_t swap = { .val = page_private(page) };
3495        return swp_offset(swap);
3496}
3497EXPORT_SYMBOL_GPL(__page_file_index);
3498
3499/*
3500 * add_swap_count_continuation - called when a swap count is duplicated
3501 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
3502 * page of the original vmalloc'ed swap_map, to hold the continuation count
3503 * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
3504 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
3505 *
3506 * These continuation pages are seldom referenced: the common paths all work
3507 * on the original swap_map, only referring to a continuation page when the
3508 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
3509 *
3510 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
3511 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
3512 * can be called after dropping locks.
3513 */
3514int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
3515{
3516        struct swap_info_struct *si;
3517        struct swap_cluster_info *ci;
3518        struct page *head;
3519        struct page *page;
3520        struct page *list_page;
3521        pgoff_t offset;
3522        unsigned char count;
3523
3524        /*
3525         * When debugging, it's easier to use __GFP_ZERO here; but it's better
3526         * for latency not to zero a page while GFP_ATOMIC and holding locks.
3527         */
3528        page = alloc_page(gfp_mask | __GFP_HIGHMEM);
3529
3530        si = swap_info_get(entry);
3531        if (!si) {
3532                /*
3533                 * An acceptable race has occurred since the failing
3534                 * __swap_duplicate(): the swap entry has been freed,
3535                 * perhaps even the whole swap_map cleared for swapoff.
3536                 */
3537                goto outer;
3538        }
3539
3540        offset = swp_offset(entry);
3541
3542        ci = lock_cluster(si, offset);
3543
3544        count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
3545
3546        if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
3547                /*
3548                 * The higher the swap count, the more likely it is that tasks
3549                 * will race to add swap count continuation: we need to avoid
3550                 * over-provisioning.
3551                 */
3552                goto out;
3553        }
3554
3555        if (!page) {
3556                unlock_cluster(ci);
3557                spin_unlock(&si->lock);
3558                return -ENOMEM;
3559        }
3560
3561        /*
3562         * We are fortunate that although vmalloc_to_page uses pte_offset_map,
3563         * no architecture is using highmem pages for kernel page tables: so it
3564         * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
3565         */
3566        head = vmalloc_to_page(si->swap_map + offset);
3567        offset &= ~PAGE_MASK;
3568
3569        spin_lock(&si->cont_lock);
3570        /*
3571         * Page allocation does not initialize the page's lru field,
3572         * but it does always reset its private field.
3573         */
3574        if (!page_private(head)) {
3575                BUG_ON(count & COUNT_CONTINUED);
3576                INIT_LIST_HEAD(&head->lru);
3577                set_page_private(head, SWP_CONTINUED);
3578                si->flags |= SWP_CONTINUED;
3579        }
3580
3581        list_for_each_entry(list_page, &head->lru, lru) {
3582                unsigned char *map;
3583
3584                /*
3585                 * If the previous map said no continuation, but we've found
3586                 * a continuation page, free our allocation and use this one.
3587                 */
3588                if (!(count & COUNT_CONTINUED))
3589                        goto out_unlock_cont;
3590
3591                map = kmap_atomic(list_page) + offset;
3592                count = *map;
3593                kunmap_atomic(map);
3594
3595                /*
3596                 * If this continuation count now has some space in it,
3597                 * free our allocation and use this one.
3598                 */
3599                if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
3600                        goto out_unlock_cont;
3601        }
3602
3603        list_add_tail(&page->lru, &head->lru);
3604        page = NULL;                    /* now it's attached, don't free it */
3605out_unlock_cont:
3606        spin_unlock(&si->cont_lock);
3607out:
3608        unlock_cluster(ci);
3609        spin_unlock(&si->lock);
3610outer:
3611        if (page)
3612                __free_page(page);
3613        return 0;
3614}
3615
3616/*
3617 * swap_count_continued - when the original swap_map count is incremented
3618 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
3619 * into, carry if so, or else fail until a new continuation page is allocated;
3620 * when the original swap_map count is decremented from 0 with continuation,
3621 * borrow from the continuation and report whether it still holds more.
3622 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
3623 * lock.
3624 */
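/*
 * Worked example (assuming the usual <linux/swap.h> values): with
 * SWAP_MAP_MAX == 0x3e, the 63rd reference to an entry no longer fits in
 * swap_map[] alone.  swap_count_continued() then "carries": the first
 * continuation byte becomes 1 and the caller stores COUNT_CONTINUED (low
 * digit 0) back into swap_map[offset], much like 999 + 1 becoming 1000 in
 * the comments below.
 */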
3625static bool swap_count_continued(struct swap_info_struct *si,
3626                                 pgoff_t offset, unsigned char count)
3627{
3628        struct page *head;
3629        struct page *page;
3630        unsigned char *map;
3631        bool ret;
3632
3633        head = vmalloc_to_page(si->swap_map + offset);
3634        if (page_private(head) != SWP_CONTINUED) {
3635                BUG_ON(count & COUNT_CONTINUED);
3636                return false;           /* need to add count continuation */
3637        }
3638
3639        spin_lock(&si->cont_lock);
3640        offset &= ~PAGE_MASK;
3641        page = list_entry(head->lru.next, struct page, lru);
3642        map = kmap_atomic(page) + offset;
3643
3644        if (count == SWAP_MAP_MAX)      /* initial increment from swap_map */
3645                goto init_map;          /* jump over SWAP_CONT_MAX checks */
3646
3647        if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
3648                /*
3649                 * Think of how you add 1 to 999
3650                 */
3651                while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
3652                        kunmap_atomic(map);
3653                        page = list_entry(page->lru.next, struct page, lru);
3654                        BUG_ON(page == head);
3655                        map = kmap_atomic(page) + offset;
3656                }
3657                if (*map == SWAP_CONT_MAX) {
3658                        kunmap_atomic(map);
3659                        page = list_entry(page->lru.next, struct page, lru);
3660                        if (page == head) {
3661                                ret = false;    /* add count continuation */
3662                                goto out;
3663                        }
3664                        map = kmap_atomic(page) + offset;
3665init_map:               *map = 0;               /* we didn't zero the page */
3666                }
3667                *map += 1;
3668                kunmap_atomic(map);
3669                page = list_entry(page->lru.prev, struct page, lru);
3670                while (page != head) {
3671                        map = kmap_atomic(page) + offset;
3672                        *map = COUNT_CONTINUED;
3673                        kunmap_atomic(map);
3674                        page = list_entry(page->lru.prev, struct page, lru);
3675                }
3676                ret = true;                     /* incremented */
3677
3678        } else {                                /* decrementing */
3679                /*
3680                 * Think of how you subtract 1 from 1000
3681                 */
3682                BUG_ON(count != COUNT_CONTINUED);
3683                while (*map == COUNT_CONTINUED) {
3684                        kunmap_atomic(map);
3685                        page = list_entry(page->lru.next, struct page, lru);
3686                        BUG_ON(page == head);
3687                        map = kmap_atomic(page) + offset;
3688                }
3689                BUG_ON(*map == 0);
3690                *map -= 1;
3691                if (*map == 0)
3692                        count = 0;
3693                kunmap_atomic(map);
3694                page = list_entry(page->lru.prev, struct page, lru);
3695                while (page != head) {
3696                        map = kmap_atomic(page) + offset;
3697                        *map = SWAP_CONT_MAX | count;
3698                        count = COUNT_CONTINUED;
3699                        kunmap_atomic(map);
3700                        page = list_entry(page->lru.prev, struct page, lru);
3701                }
3702                ret = count == COUNT_CONTINUED;
3703        }
3704out:
3705        spin_unlock(&si->cont_lock);
3706        return ret;
3707}
3708
3709/*
3710 * free_swap_count_continuations - called by swapoff to free all continuation
3711 * pages appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
3712 */
3713static void free_swap_count_continuations(struct swap_info_struct *si)
3714{
3715        pgoff_t offset;
3716
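        /*
         * swap_map holds one byte per swap page, so stepping offset by
         * PAGE_SIZE visits each vmalloc'ed page backing swap_map exactly
         * once; any continuation pages are chained on that page's lru list.
         */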
3717        for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
3718                struct page *head;
3719                head = vmalloc_to_page(si->swap_map + offset);
3720                if (page_private(head)) {
3721                        struct page *page, *next;
3722
3723                        list_for_each_entry_safe(page, next, &head->lru, lru) {
3724                                list_del(&page->lru);
3725                                __free_page(page);
3726                        }
3727                }
3728        }
3729}
3730
3731#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
3732void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
3733                                  gfp_t gfp_mask)
3734{
3735        struct swap_info_struct *si, *next;
3736        if (!(gfp_mask & __GFP_IO) || !memcg)
3737                return;
3738
3739        if (!blk_cgroup_congested())
3740                return;
3741
3742        /*
3743         * We've already scheduled a throttle, avoid taking the global swap
3744         * lock.
3745         */
3746        if (current->throttle_queue)
3747                return;
3748
3749        spin_lock(&swap_avail_lock);
3750        plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
3751                                  avail_lists[node]) {
3752                if (si->bdev) {
3753                        blkcg_schedule_throttle(bdev_get_queue(si->bdev),
3754                                                true);
3755                        break;
3756                }
3757        }
3758        spin_unlock(&swap_avail_lock);
3759}
3760#endif
3761
3762static int __init swapfile_init(void)
3763{
3764        int nid;
3765
3766        swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
3767                                         GFP_KERNEL);
3768        if (!swap_avail_heads) {
3769                pr_emerg("Not enough memory for swap heads, swap is disabled\n");
3770                return -ENOMEM;
3771        }
3772
3773        for_each_node(nid)
3774                plist_head_init(&swap_avail_heads[nid]);
3775
3776        return 0;
3777}
3778subsys_initcall(swapfile_init);
3779