linux/mm/swapfile.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  linux/mm/swapfile.c
   4 *
   5 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   6 *  Swap reorganised 29.12.95, Stephen Tweedie
   7 */
   8
   9#include <linux/mm.h>
  10#include <linux/sched/mm.h>
  11#include <linux/sched/task.h>
  12#include <linux/hugetlb.h>
  13#include <linux/mman.h>
  14#include <linux/slab.h>
  15#include <linux/kernel_stat.h>
  16#include <linux/swap.h>
  17#include <linux/vmalloc.h>
  18#include <linux/pagemap.h>
  19#include <linux/namei.h>
  20#include <linux/shmem_fs.h>
  21#include <linux/blkdev.h>
  22#include <linux/random.h>
  23#include <linux/writeback.h>
  24#include <linux/proc_fs.h>
  25#include <linux/seq_file.h>
  26#include <linux/init.h>
  27#include <linux/ksm.h>
  28#include <linux/rmap.h>
  29#include <linux/security.h>
  30#include <linux/backing-dev.h>
  31#include <linux/mutex.h>
  32#include <linux/capability.h>
  33#include <linux/syscalls.h>
  34#include <linux/memcontrol.h>
  35#include <linux/poll.h>
  36#include <linux/oom.h>
  37#include <linux/frontswap.h>
  38#include <linux/swapfile.h>
  39#include <linux/export.h>
  40#include <linux/swap_slots.h>
  41#include <linux/sort.h>
  42
  43#include <asm/pgtable.h>
  44#include <asm/tlbflush.h>
  45#include <linux/swapops.h>
  46#include <linux/swap_cgroup.h>
  47
  48static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
  49                                 unsigned char);
  50static void free_swap_count_continuations(struct swap_info_struct *);
  51static sector_t map_swap_entry(swp_entry_t, struct block_device**);
  52
  53DEFINE_SPINLOCK(swap_lock);
  54static unsigned int nr_swapfiles;
  55atomic_long_t nr_swap_pages;
  56/*
  57 * Some modules use swappable objects and may try to swap them out under
  58 * memory pressure (via the shrinker). Before doing so, they may wish to
  59 * check to see if any swap space is available.
  60 */
  61EXPORT_SYMBOL_GPL(nr_swap_pages);
  62/* protected with swap_lock; reading it in vm_swap_full() doesn't need the lock */
  63long total_swap_pages;
  64static int least_priority = -1;
  65
  66static const char Bad_file[] = "Bad swap file entry ";
  67static const char Unused_file[] = "Unused swap file entry ";
  68static const char Bad_offset[] = "Bad swap offset entry ";
  69static const char Unused_offset[] = "Unused swap offset entry ";
  70
  71/*
  72 * all active swap_info_structs
  73 * protected with swap_lock, and ordered by priority.
  74 */
  75PLIST_HEAD(swap_active_head);
  76
  77/*
  78 * all available (active, not full) swap_info_structs
  79 * protected with swap_avail_lock, ordered by priority.
  80 * This is used by get_swap_page() instead of swap_active_head
  81 * because swap_active_head includes all swap_info_structs,
  82 * but get_swap_page() doesn't need to look at full ones.
  83 * This uses its own lock instead of swap_lock because when a
  84 * swap_info_struct changes between not-full/full, it needs to
  85 * add/remove itself to/from this list, but the swap_info_struct->lock
  86 * is held and the locking order requires swap_lock to be taken
  87 * before any swap_info_struct->lock.
  88 */
  89static struct plist_head *swap_avail_heads;
  90static DEFINE_SPINLOCK(swap_avail_lock);
  91
  92struct swap_info_struct *swap_info[MAX_SWAPFILES];
  93
  94static DEFINE_MUTEX(swapon_mutex);
  95
  96static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
  97/* Activity counter to indicate that a swapon or swapoff has occurred */
  98static atomic_t proc_poll_event = ATOMIC_INIT(0);
  99
 100atomic_t nr_rotate_swap = ATOMIC_INIT(0);
 101
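     /*
      * Return the swap_info_struct for swap @type, or NULL if @type is out of
      * range.  The read barrier below pairs with the write barrier in
      * alloc_swap_info(), so the structure is seen fully initialised.
      */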
 102static struct swap_info_struct *swap_type_to_swap_info(int type)
 103{
 104        if (type >= READ_ONCE(nr_swapfiles))
 105                return NULL;
 106
 107        smp_rmb();      /* Pairs with smp_wmb in alloc_swap_info. */
 108        return READ_ONCE(swap_info[type]);
 109}
 110
 111static inline unsigned char swap_count(unsigned char ent)
 112{
 113        return ent & ~SWAP_HAS_CACHE;   /* may include COUNT_CONTINUED flag */
 114}
 115
 116/* Reclaim the swap entry anyway if possible */
 117#define TTRS_ANYWAY             0x1
 118/*
 119 * Reclaim the swap entry if there are no more mappings of the
 120 * corresponding page
 121 */
 122#define TTRS_UNMAPPED           0x2
  123/* Reclaim the swap entry if swap is getting full */
 124#define TTRS_FULL               0x4
 125
 126/* returns 1 if swap entry is freed */
 127static int __try_to_reclaim_swap(struct swap_info_struct *si,
 128                                 unsigned long offset, unsigned long flags)
 129{
 130        swp_entry_t entry = swp_entry(si->type, offset);
 131        struct page *page;
 132        int ret = 0;
 133
 134        page = find_get_page(swap_address_space(entry), offset);
 135        if (!page)
 136                return 0;
  137        /*
  138         * This function is called from scan_swap_map_slots(), which is
  139         * reached from vmscan.c while reclaiming pages, so the caller may
  140         * already hold a page lock.  We have to use trylock here to avoid
  141         * deadlock.  This is a special case; in usual operations, use
  142         * try_to_free_swap() with an explicit lock_page() instead.
  143         */
 144        if (trylock_page(page)) {
 145                if ((flags & TTRS_ANYWAY) ||
 146                    ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
 147                    ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
 148                        ret = try_to_free_swap(page);
 149                unlock_page(page);
 150        }
 151        put_page(page);
 152        return ret;
 153}
 154
 155/*
  156 * swapon tells the device that all the old swap contents can be discarded,
 157 * to allow the swap device to optimize its wear-levelling.
 158 */
 159static int discard_swap(struct swap_info_struct *si)
 160{
 161        struct swap_extent *se;
 162        sector_t start_block;
 163        sector_t nr_blocks;
 164        int err = 0;
 165
 166        /* Do not discard the swap header page! */
 167        se = &si->first_swap_extent;
 168        start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
 169        nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
 170        if (nr_blocks) {
 171                err = blkdev_issue_discard(si->bdev, start_block,
 172                                nr_blocks, GFP_KERNEL, 0);
 173                if (err)
 174                        return err;
 175                cond_resched();
 176        }
 177
 178        list_for_each_entry(se, &si->first_swap_extent.list, list) {
 179                start_block = se->start_block << (PAGE_SHIFT - 9);
 180                nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 181
 182                err = blkdev_issue_discard(si->bdev, start_block,
 183                                nr_blocks, GFP_KERNEL, 0);
 184                if (err)
 185                        break;
 186
 187                cond_resched();
 188        }
 189        return err;             /* That will often be -EOPNOTSUPP */
 190}
 191
 192/*
  193 * swap allocation tells the device that a cluster of swap can now be discarded,
 194 * to allow the swap device to optimize its wear-levelling.
 195 */
 196static void discard_swap_cluster(struct swap_info_struct *si,
 197                                 pgoff_t start_page, pgoff_t nr_pages)
 198{
 199        struct swap_extent *se = si->curr_swap_extent;
 200        int found_extent = 0;
 201
 202        while (nr_pages) {
 203                if (se->start_page <= start_page &&
 204                    start_page < se->start_page + se->nr_pages) {
 205                        pgoff_t offset = start_page - se->start_page;
 206                        sector_t start_block = se->start_block + offset;
 207                        sector_t nr_blocks = se->nr_pages - offset;
 208
 209                        if (nr_blocks > nr_pages)
 210                                nr_blocks = nr_pages;
 211                        start_page += nr_blocks;
 212                        nr_pages -= nr_blocks;
 213
 214                        if (!found_extent++)
 215                                si->curr_swap_extent = se;
 216
 217                        start_block <<= PAGE_SHIFT - 9;
 218                        nr_blocks <<= PAGE_SHIFT - 9;
 219                        if (blkdev_issue_discard(si->bdev, start_block,
 220                                    nr_blocks, GFP_NOIO, 0))
 221                                break;
 222                }
 223
 224                se = list_next_entry(se, list);
 225        }
 226}
 227
 228#ifdef CONFIG_THP_SWAP
 229#define SWAPFILE_CLUSTER        HPAGE_PMD_NR
 230
 231#define swap_entry_size(size)   (size)
 232#else
 233#define SWAPFILE_CLUSTER        256
 234
 235/*
  236 * Define swap_entry_size() as a constant to let the compiler optimize
 237 * out some code if !CONFIG_THP_SWAP
 238 */
 239#define swap_entry_size(size)   1
 240#endif
 241#define LATENCY_LIMIT           256
 242
 243static inline void cluster_set_flag(struct swap_cluster_info *info,
 244        unsigned int flag)
 245{
 246        info->flags = flag;
 247}
 248
 249static inline unsigned int cluster_count(struct swap_cluster_info *info)
 250{
 251        return info->data;
 252}
 253
 254static inline void cluster_set_count(struct swap_cluster_info *info,
 255                                     unsigned int c)
 256{
 257        info->data = c;
 258}
 259
 260static inline void cluster_set_count_flag(struct swap_cluster_info *info,
 261                                         unsigned int c, unsigned int f)
 262{
 263        info->flags = f;
 264        info->data = c;
 265}
 266
 267static inline unsigned int cluster_next(struct swap_cluster_info *info)
 268{
 269        return info->data;
 270}
 271
 272static inline void cluster_set_next(struct swap_cluster_info *info,
 273                                    unsigned int n)
 274{
 275        info->data = n;
 276}
 277
 278static inline void cluster_set_next_flag(struct swap_cluster_info *info,
 279                                         unsigned int n, unsigned int f)
 280{
 281        info->flags = f;
 282        info->data = n;
 283}
 284
 285static inline bool cluster_is_free(struct swap_cluster_info *info)
 286{
 287        return info->flags & CLUSTER_FLAG_FREE;
 288}
 289
 290static inline bool cluster_is_null(struct swap_cluster_info *info)
 291{
 292        return info->flags & CLUSTER_FLAG_NEXT_NULL;
 293}
 294
 295static inline void cluster_set_null(struct swap_cluster_info *info)
 296{
 297        info->flags = CLUSTER_FLAG_NEXT_NULL;
 298        info->data = 0;
 299}
 300
 301static inline bool cluster_is_huge(struct swap_cluster_info *info)
 302{
 303        if (IS_ENABLED(CONFIG_THP_SWAP))
 304                return info->flags & CLUSTER_FLAG_HUGE;
 305        return false;
 306}
 307
 308static inline void cluster_clear_huge(struct swap_cluster_info *info)
 309{
 310        info->flags &= ~CLUSTER_FLAG_HUGE;
 311}
 312
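     /*
      * Lock the swap_cluster_info covering @offset and return it, or NULL if
      * the device has no cluster info (non-SSD swap), in which case the
      * caller must rely on si->lock instead.
      */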
 313static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
 314                                                     unsigned long offset)
 315{
 316        struct swap_cluster_info *ci;
 317
 318        ci = si->cluster_info;
 319        if (ci) {
 320                ci += offset / SWAPFILE_CLUSTER;
 321                spin_lock(&ci->lock);
 322        }
 323        return ci;
 324}
 325
 326static inline void unlock_cluster(struct swap_cluster_info *ci)
 327{
 328        if (ci)
 329                spin_unlock(&ci->lock);
 330}
 331
 332/*
 333 * Determine the locking method in use for this device.  Return
 334 * swap_cluster_info if SSD-style cluster-based locking is in place.
 335 */
 336static inline struct swap_cluster_info *lock_cluster_or_swap_info(
 337                struct swap_info_struct *si, unsigned long offset)
 338{
 339        struct swap_cluster_info *ci;
 340
 341        /* Try to use fine-grained SSD-style locking if available: */
 342        ci = lock_cluster(si, offset);
 343        /* Otherwise, fall back to traditional, coarse locking: */
 344        if (!ci)
 345                spin_lock(&si->lock);
 346
 347        return ci;
 348}
 349
 350static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
 351                                               struct swap_cluster_info *ci)
 352{
 353        if (ci)
 354                unlock_cluster(ci);
 355        else
 356                spin_unlock(&si->lock);
 357}
 358
 359static inline bool cluster_list_empty(struct swap_cluster_list *list)
 360{
 361        return cluster_is_null(&list->head);
 362}
 363
 364static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
 365{
 366        return cluster_next(&list->head);
 367}
 368
 369static void cluster_list_init(struct swap_cluster_list *list)
 370{
 371        cluster_set_null(&list->head);
 372        cluster_set_null(&list->tail);
 373}
 374
 375static void cluster_list_add_tail(struct swap_cluster_list *list,
 376                                  struct swap_cluster_info *ci,
 377                                  unsigned int idx)
 378{
 379        if (cluster_list_empty(list)) {
 380                cluster_set_next_flag(&list->head, idx, 0);
 381                cluster_set_next_flag(&list->tail, idx, 0);
 382        } else {
 383                struct swap_cluster_info *ci_tail;
 384                unsigned int tail = cluster_next(&list->tail);
 385
 386                /*
 387                 * Nested cluster lock, but both cluster locks are
  388                 * only acquired while holding swap_info_struct->lock
 389                 */
 390                ci_tail = ci + tail;
 391                spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
 392                cluster_set_next(ci_tail, idx);
 393                spin_unlock(&ci_tail->lock);
 394                cluster_set_next_flag(&list->tail, idx, 0);
 395        }
 396}
 397
 398static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
 399                                           struct swap_cluster_info *ci)
 400{
 401        unsigned int idx;
 402
 403        idx = cluster_next(&list->head);
 404        if (cluster_next(&list->tail) == idx) {
 405                cluster_set_null(&list->head);
 406                cluster_set_null(&list->tail);
 407        } else
 408                cluster_set_next_flag(&list->head,
 409                                      cluster_next(&ci[idx]), 0);
 410
 411        return idx;
 412}
 413
 414/* Add a cluster to discard list and schedule it to do discard */
 415static void swap_cluster_schedule_discard(struct swap_info_struct *si,
 416                unsigned int idx)
 417{
 418        /*
  419         * If scan_swap_map() can't find a free cluster, it will check
  420         * si->swap_map directly.  To make sure a cluster being discarded is
  421         * not taken by scan_swap_map(), mark its swap entries bad (occupied).
  422         * The marking is cleared after the discard completes.
 423         */
 424        memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 425                        SWAP_MAP_BAD, SWAPFILE_CLUSTER);
 426
 427        cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
 428
 429        schedule_work(&si->discard_work);
 430}
 431
 432static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
 433{
 434        struct swap_cluster_info *ci = si->cluster_info;
 435
 436        cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
 437        cluster_list_add_tail(&si->free_clusters, ci, idx);
 438}
 439
  440/*
  441 * Do the actual discard work.  After a cluster discard is finished, the
  442 * cluster will be added to the free cluster list.  Caller should hold si->lock.
  443 */
 444static void swap_do_scheduled_discard(struct swap_info_struct *si)
 445{
 446        struct swap_cluster_info *info, *ci;
 447        unsigned int idx;
 448
 449        info = si->cluster_info;
 450
 451        while (!cluster_list_empty(&si->discard_clusters)) {
 452                idx = cluster_list_del_first(&si->discard_clusters, info);
 453                spin_unlock(&si->lock);
 454
 455                discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
 456                                SWAPFILE_CLUSTER);
 457
 458                spin_lock(&si->lock);
 459                ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
 460                __free_cluster(si, idx);
 461                memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 462                                0, SWAPFILE_CLUSTER);
 463                unlock_cluster(ci);
 464        }
 465}
 466
 467static void swap_discard_work(struct work_struct *work)
 468{
 469        struct swap_info_struct *si;
 470
 471        si = container_of(work, struct swap_info_struct, discard_work);
 472
 473        spin_lock(&si->lock);
 474        swap_do_scheduled_discard(si);
 475        spin_unlock(&si->lock);
 476}
 477
 478static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
 479{
 480        struct swap_cluster_info *ci = si->cluster_info;
 481
 482        VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
 483        cluster_list_del_first(&si->free_clusters, ci);
 484        cluster_set_count_flag(ci + idx, 0, 0);
 485}
 486
 487static void free_cluster(struct swap_info_struct *si, unsigned long idx)
 488{
 489        struct swap_cluster_info *ci = si->cluster_info + idx;
 490
 491        VM_BUG_ON(cluster_count(ci) != 0);
 492        /*
  493         * If the swap is discardable, schedule a discard of the cluster
  494         * instead of freeing it immediately.  The cluster will be freed
  495         * after the discard completes.
 496         */
 497        if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
 498            (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
 499                swap_cluster_schedule_discard(si, idx);
 500                return;
 501        }
 502
 503        __free_cluster(si, idx);
 504}
 505
 506/*
 507 * The cluster corresponding to page_nr will be used. The cluster will be
  508 * removed from the free cluster list and its usage counter will be increased.
 509 */
 510static void inc_cluster_info_page(struct swap_info_struct *p,
 511        struct swap_cluster_info *cluster_info, unsigned long page_nr)
 512{
 513        unsigned long idx = page_nr / SWAPFILE_CLUSTER;
 514
 515        if (!cluster_info)
 516                return;
 517        if (cluster_is_free(&cluster_info[idx]))
 518                alloc_cluster(p, idx);
 519
 520        VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
 521        cluster_set_count(&cluster_info[idx],
 522                cluster_count(&cluster_info[idx]) + 1);
 523}
 524
 525/*
  526 * The cluster corresponding to page_nr has its usage count decreased by one.
  527 * If the usage counter becomes 0, meaning no page in the cluster is in use,
  528 * we can optionally discard the cluster and add it to the free cluster list.
 529 */
 530static void dec_cluster_info_page(struct swap_info_struct *p,
 531        struct swap_cluster_info *cluster_info, unsigned long page_nr)
 532{
 533        unsigned long idx = page_nr / SWAPFILE_CLUSTER;
 534
 535        if (!cluster_info)
 536                return;
 537
 538        VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
 539        cluster_set_count(&cluster_info[idx],
 540                cluster_count(&cluster_info[idx]) - 1);
 541
 542        if (cluster_count(&cluster_info[idx]) == 0)
 543                free_cluster(p, idx);
 544}
 545
 546/*
  547 * It's possible that scan_swap_map() uses a free cluster in the middle of the
  548 * free cluster list.  Avoid such abuse to prevent list corruption.
 549 */
 550static bool
 551scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
 552        unsigned long offset)
 553{
 554        struct percpu_cluster *percpu_cluster;
 555        bool conflict;
 556
 557        offset /= SWAPFILE_CLUSTER;
 558        conflict = !cluster_list_empty(&si->free_clusters) &&
 559                offset != cluster_list_first(&si->free_clusters) &&
 560                cluster_is_free(&si->cluster_info[offset]);
 561
 562        if (!conflict)
 563                return false;
 564
 565        percpu_cluster = this_cpu_ptr(si->percpu_cluster);
 566        cluster_set_null(&percpu_cluster->index);
 567        return true;
 568}
 569
 570/*
  571 * Try to get a swap entry from the current cpu's swap entry pool (a cluster).
  572 * This might involve allocating a new cluster for the current CPU too.
 573 */
 574static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 575        unsigned long *offset, unsigned long *scan_base)
 576{
 577        struct percpu_cluster *cluster;
 578        struct swap_cluster_info *ci;
 579        bool found_free;
 580        unsigned long tmp, max;
 581
 582new_cluster:
 583        cluster = this_cpu_ptr(si->percpu_cluster);
 584        if (cluster_is_null(&cluster->index)) {
 585                if (!cluster_list_empty(&si->free_clusters)) {
 586                        cluster->index = si->free_clusters.head;
 587                        cluster->next = cluster_next(&cluster->index) *
 588                                        SWAPFILE_CLUSTER;
 589                } else if (!cluster_list_empty(&si->discard_clusters)) {
 590                        /*
  591                         * We have no free cluster, but some clusters are
  592                         * queued for discard; do the discard now and reclaim them.
 593                         */
 594                        swap_do_scheduled_discard(si);
 595                        *scan_base = *offset = si->cluster_next;
 596                        goto new_cluster;
 597                } else
 598                        return false;
 599        }
 600
 601        found_free = false;
 602
 603        /*
  604         * Other CPUs can use our cluster if they can't find a free cluster;
  605         * check whether there is still a free entry in this cluster.
 606         */
 607        tmp = cluster->next;
 608        max = min_t(unsigned long, si->max,
 609                    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
 610        if (tmp >= max) {
 611                cluster_set_null(&cluster->index);
 612                goto new_cluster;
 613        }
 614        ci = lock_cluster(si, tmp);
 615        while (tmp < max) {
 616                if (!si->swap_map[tmp]) {
 617                        found_free = true;
 618                        break;
 619                }
 620                tmp++;
 621        }
 622        unlock_cluster(ci);
 623        if (!found_free) {
 624                cluster_set_null(&cluster->index);
 625                goto new_cluster;
 626        }
 627        cluster->next = tmp + 1;
 628        *offset = tmp;
 629        *scan_base = tmp;
 630        return found_free;
 631}
 632
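     /*
      * Remove @p from the per-node lists of available (not full) swap devices.
      * Caller must hold swap_avail_lock; del_from_avail_list() below takes it.
      */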
 633static void __del_from_avail_list(struct swap_info_struct *p)
 634{
 635        int nid;
 636
 637        for_each_node(nid)
 638                plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
 639}
 640
 641static void del_from_avail_list(struct swap_info_struct *p)
 642{
 643        spin_lock(&swap_avail_lock);
 644        __del_from_avail_list(p);
 645        spin_unlock(&swap_avail_lock);
 646}
 647
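     /*
      * Account a newly allocated range of swap slots: adjust the lowest_bit/
      * highest_bit window, bump inuse_pages, and drop the device from the
      * avail lists once it becomes full.  Caller holds si->lock.
      */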
 648static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
 649                             unsigned int nr_entries)
 650{
 651        unsigned int end = offset + nr_entries - 1;
 652
 653        if (offset == si->lowest_bit)
 654                si->lowest_bit += nr_entries;
 655        if (end == si->highest_bit)
 656                si->highest_bit -= nr_entries;
 657        si->inuse_pages += nr_entries;
 658        if (si->inuse_pages == si->pages) {
 659                si->lowest_bit = si->max;
 660                si->highest_bit = 0;
 661                del_from_avail_list(si);
 662        }
 663}
 664
 665static void add_to_avail_list(struct swap_info_struct *p)
 666{
 667        int nid;
 668
 669        spin_lock(&swap_avail_lock);
 670        for_each_node(nid) {
 671                WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
 672                plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
 673        }
 674        spin_unlock(&swap_avail_lock);
 675}
 676
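     /*
      * Account a freed range of swap slots: widen the lowest_bit/highest_bit
      * window, put the device back on the avail lists if it was full, update
      * nr_swap_pages and inuse_pages, and invalidate frontswap and notify the
      * block driver for each freed slot.  Caller holds si->lock.
      */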
 677static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 678                            unsigned int nr_entries)
 679{
 680        unsigned long end = offset + nr_entries - 1;
 681        void (*swap_slot_free_notify)(struct block_device *, unsigned long);
 682
 683        if (offset < si->lowest_bit)
 684                si->lowest_bit = offset;
 685        if (end > si->highest_bit) {
 686                bool was_full = !si->highest_bit;
 687
 688                si->highest_bit = end;
 689                if (was_full && (si->flags & SWP_WRITEOK))
 690                        add_to_avail_list(si);
 691        }
 692        atomic_long_add(nr_entries, &nr_swap_pages);
 693        si->inuse_pages -= nr_entries;
 694        if (si->flags & SWP_BLKDEV)
 695                swap_slot_free_notify =
 696                        si->bdev->bd_disk->fops->swap_slot_free_notify;
 697        else
 698                swap_slot_free_notify = NULL;
 699        while (offset <= end) {
 700                frontswap_invalidate_page(si->type, offset);
 701                if (swap_slot_free_notify)
 702                        swap_slot_free_notify(si->bdev, offset);
 703                offset++;
 704        }
 705}
 706
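     /*
      * Scan the swap map for up to @nr free slots, mark each allocated slot
      * with @usage and store the resulting entries in @slots[].  Called with
      * si->lock held; the lock may be dropped and retaken while scanning.
      * Returns the number of slots allocated.
      */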
 707static int scan_swap_map_slots(struct swap_info_struct *si,
 708                               unsigned char usage, int nr,
 709                               swp_entry_t slots[])
 710{
 711        struct swap_cluster_info *ci;
 712        unsigned long offset;
 713        unsigned long scan_base;
 714        unsigned long last_in_cluster = 0;
 715        int latency_ration = LATENCY_LIMIT;
 716        int n_ret = 0;
 717
 718        if (nr > SWAP_BATCH)
 719                nr = SWAP_BATCH;
 720
 721        /*
 722         * We try to cluster swap pages by allocating them sequentially
 723         * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
 724         * way, however, we resort to first-free allocation, starting
 725         * a new cluster.  This prevents us from scattering swap pages
 726         * all over the entire swap partition, so that we reduce
 727         * overall disk seek times between swap pages.  -- sct
 728         * But we do now try to find an empty cluster.  -Andrea
 729         * And we let swap pages go all over an SSD partition.  Hugh
 730         */
 731
 732        si->flags += SWP_SCANNING;
 733        scan_base = offset = si->cluster_next;
 734
 735        /* SSD algorithm */
 736        if (si->cluster_info) {
 737                if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
 738                        goto checks;
 739                else
 740                        goto scan;
 741        }
 742
 743        if (unlikely(!si->cluster_nr--)) {
 744                if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
 745                        si->cluster_nr = SWAPFILE_CLUSTER - 1;
 746                        goto checks;
 747                }
 748
 749                spin_unlock(&si->lock);
 750
 751                /*
 752                 * If seek is expensive, start searching for new cluster from
 753                 * start of partition, to minimize the span of allocated swap.
 754                 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
 755                 * case, just handled by scan_swap_map_try_ssd_cluster() above.
 756                 */
 757                scan_base = offset = si->lowest_bit;
 758                last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
 759
 760                /* Locate the first empty (unaligned) cluster */
 761                for (; last_in_cluster <= si->highest_bit; offset++) {
 762                        if (si->swap_map[offset])
 763                                last_in_cluster = offset + SWAPFILE_CLUSTER;
 764                        else if (offset == last_in_cluster) {
 765                                spin_lock(&si->lock);
 766                                offset -= SWAPFILE_CLUSTER - 1;
 767                                si->cluster_next = offset;
 768                                si->cluster_nr = SWAPFILE_CLUSTER - 1;
 769                                goto checks;
 770                        }
 771                        if (unlikely(--latency_ration < 0)) {
 772                                cond_resched();
 773                                latency_ration = LATENCY_LIMIT;
 774                        }
 775                }
 776
 777                offset = scan_base;
 778                spin_lock(&si->lock);
 779                si->cluster_nr = SWAPFILE_CLUSTER - 1;
 780        }
 781
 782checks:
 783        if (si->cluster_info) {
 784                while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
  785                        /* take a break if we already got some slots */
 786                        if (n_ret)
 787                                goto done;
 788                        if (!scan_swap_map_try_ssd_cluster(si, &offset,
 789                                                        &scan_base))
 790                                goto scan;
 791                }
 792        }
 793        if (!(si->flags & SWP_WRITEOK))
 794                goto no_page;
 795        if (!si->highest_bit)
 796                goto no_page;
 797        if (offset > si->highest_bit)
 798                scan_base = offset = si->lowest_bit;
 799
 800        ci = lock_cluster(si, offset);
 801        /* reuse swap entry of cache-only swap if not busy. */
 802        if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
 803                int swap_was_freed;
 804                unlock_cluster(ci);
 805                spin_unlock(&si->lock);
 806                swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
 807                spin_lock(&si->lock);
 808                /* entry was freed successfully, try to use this again */
 809                if (swap_was_freed)
 810                        goto checks;
 811                goto scan; /* check next one */
 812        }
 813
 814        if (si->swap_map[offset]) {
 815                unlock_cluster(ci);
 816                if (!n_ret)
 817                        goto scan;
 818                else
 819                        goto done;
 820        }
 821        si->swap_map[offset] = usage;
 822        inc_cluster_info_page(si, si->cluster_info, offset);
 823        unlock_cluster(ci);
 824
 825        swap_range_alloc(si, offset, 1);
 826        si->cluster_next = offset + 1;
 827        slots[n_ret++] = swp_entry(si->type, offset);
 828
  829        /* got enough slots or reached max slots? */
 830        if ((n_ret == nr) || (offset >= si->highest_bit))
 831                goto done;
 832
 833        /* search for next available slot */
 834
 835        /* time to take a break? */
 836        if (unlikely(--latency_ration < 0)) {
 837                if (n_ret)
 838                        goto done;
 839                spin_unlock(&si->lock);
 840                cond_resched();
 841                spin_lock(&si->lock);
 842                latency_ration = LATENCY_LIMIT;
 843        }
 844
 845        /* try to get more slots in cluster */
 846        if (si->cluster_info) {
 847                if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
 848                        goto checks;
 849                else
 850                        goto done;
 851        }
 852        /* non-ssd case */
 853        ++offset;
 854
 855        /* non-ssd case, still more slots in cluster? */
 856        if (si->cluster_nr && !si->swap_map[offset]) {
 857                --si->cluster_nr;
 858                goto checks;
 859        }
 860
 861done:
 862        si->flags -= SWP_SCANNING;
 863        return n_ret;
 864
 865scan:
 866        spin_unlock(&si->lock);
 867        while (++offset <= si->highest_bit) {
 868                if (!si->swap_map[offset]) {
 869                        spin_lock(&si->lock);
 870                        goto checks;
 871                }
 872                if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
 873                        spin_lock(&si->lock);
 874                        goto checks;
 875                }
 876                if (unlikely(--latency_ration < 0)) {
 877                        cond_resched();
 878                        latency_ration = LATENCY_LIMIT;
 879                }
 880        }
 881        offset = si->lowest_bit;
 882        while (offset < scan_base) {
 883                if (!si->swap_map[offset]) {
 884                        spin_lock(&si->lock);
 885                        goto checks;
 886                }
 887                if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
 888                        spin_lock(&si->lock);
 889                        goto checks;
 890                }
 891                if (unlikely(--latency_ration < 0)) {
 892                        cond_resched();
 893                        latency_ration = LATENCY_LIMIT;
 894                }
 895                offset++;
 896        }
 897        spin_lock(&si->lock);
 898
 899no_page:
 900        si->flags -= SWP_SCANNING;
 901        return n_ret;
 902}
 903
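     /*
      * Allocate an entire free cluster as one huge (PMD-sized) swap entry,
      * marking every slot in it SWAP_HAS_CACHE.  Returns 1 on success, 0 if
      * no free cluster is available or THP swap is disabled.
      */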
 904static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
 905{
 906        unsigned long idx;
 907        struct swap_cluster_info *ci;
 908        unsigned long offset, i;
 909        unsigned char *map;
 910
 911        /*
 912         * Should not even be attempting cluster allocations when huge
 913         * page swap is disabled.  Warn and fail the allocation.
 914         */
 915        if (!IS_ENABLED(CONFIG_THP_SWAP)) {
 916                VM_WARN_ON_ONCE(1);
 917                return 0;
 918        }
 919
 920        if (cluster_list_empty(&si->free_clusters))
 921                return 0;
 922
 923        idx = cluster_list_first(&si->free_clusters);
 924        offset = idx * SWAPFILE_CLUSTER;
 925        ci = lock_cluster(si, offset);
 926        alloc_cluster(si, idx);
 927        cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
 928
 929        map = si->swap_map + offset;
 930        for (i = 0; i < SWAPFILE_CLUSTER; i++)
 931                map[i] = SWAP_HAS_CACHE;
 932        unlock_cluster(ci);
 933        swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
 934        *slot = swp_entry(si->type, offset);
 935
 936        return 1;
 937}
 938
 939static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
 940{
 941        unsigned long offset = idx * SWAPFILE_CLUSTER;
 942        struct swap_cluster_info *ci;
 943
 944        ci = lock_cluster(si, offset);
 945        memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
 946        cluster_set_count_flag(ci, 0, 0);
 947        free_cluster(si, idx);
 948        unlock_cluster(ci);
 949        swap_range_free(si, offset, SWAPFILE_CLUSTER);
 950}
 951
 952static unsigned long scan_swap_map(struct swap_info_struct *si,
 953                                   unsigned char usage)
 954{
 955        swp_entry_t entry;
 956        int n_ret;
 957
 958        n_ret = scan_swap_map_slots(si, usage, 1, &entry);
 959
 960        if (n_ret)
 961                return swp_offset(entry);
 962        else
 963                return 0;
 964
 965}
 966
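     /*
      * Allocate up to @n_goal swap entries of @entry_size pages each, trying
      * the available devices for this node in priority order.  Returns the
      * number of entries stored in @swp_entries[], which may be less than
      * requested.
      */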
 967int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 968{
 969        unsigned long size = swap_entry_size(entry_size);
 970        struct swap_info_struct *si, *next;
 971        long avail_pgs;
 972        int n_ret = 0;
 973        int node;
 974
 975        /* Only single cluster request supported */
 976        WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
 977
 978        avail_pgs = atomic_long_read(&nr_swap_pages) / size;
 979        if (avail_pgs <= 0)
 980                goto noswap;
 981
 982        if (n_goal > SWAP_BATCH)
 983                n_goal = SWAP_BATCH;
 984
 985        if (n_goal > avail_pgs)
 986                n_goal = avail_pgs;
 987
 988        atomic_long_sub(n_goal * size, &nr_swap_pages);
 989
 990        spin_lock(&swap_avail_lock);
 991
 992start_over:
 993        node = numa_node_id();
 994        plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
 995                /* requeue si to after same-priority siblings */
 996                plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
 997                spin_unlock(&swap_avail_lock);
 998                spin_lock(&si->lock);
 999                if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
1000                        spin_lock(&swap_avail_lock);
1001                        if (plist_node_empty(&si->avail_lists[node])) {
1002                                spin_unlock(&si->lock);
1003                                goto nextsi;
1004                        }
1005                        WARN(!si->highest_bit,
1006                             "swap_info %d in list but !highest_bit\n",
1007                             si->type);
1008                        WARN(!(si->flags & SWP_WRITEOK),
1009                             "swap_info %d in list but !SWP_WRITEOK\n",
1010                             si->type);
1011                        __del_from_avail_list(si);
1012                        spin_unlock(&si->lock);
1013                        goto nextsi;
1014                }
1015                if (size == SWAPFILE_CLUSTER) {
1016                        if (!(si->flags & SWP_FS))
1017                                n_ret = swap_alloc_cluster(si, swp_entries);
1018                } else
1019                        n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
1020                                                    n_goal, swp_entries);
1021                spin_unlock(&si->lock);
1022                if (n_ret || size == SWAPFILE_CLUSTER)
1023                        goto check_out;
1024                pr_debug("scan_swap_map of si %d failed to find offset\n",
1025                        si->type);
1026
1027                spin_lock(&swap_avail_lock);
1028nextsi:
1029                /*
1030                 * if we got here, it's likely that si was almost full before,
1031                 * and since scan_swap_map() can drop the si->lock, multiple
1032                 * callers probably all tried to get a page from the same si
1033                 * and it filled up before we could get one; or, the si filled
1034                 * up between us dropping swap_avail_lock and taking si->lock.
1035                 * Since we dropped the swap_avail_lock, the swap_avail_head
1036                 * list may have been modified; so if next is still in the
1037                 * swap_avail_head list then try it, otherwise start over
1038                 * if we have not gotten any slots.
1039                 */
1040                if (plist_node_empty(&next->avail_lists[node]))
1041                        goto start_over;
1042        }
1043
1044        spin_unlock(&swap_avail_lock);
1045
1046check_out:
1047        if (n_ret < n_goal)
1048                atomic_long_add((long)(n_goal - n_ret) * size,
1049                                &nr_swap_pages);
1050noswap:
1051        return n_ret;
1052}
1053
 1054/* The only caller of this function is now the suspend routine */
1055swp_entry_t get_swap_page_of_type(int type)
1056{
1057        struct swap_info_struct *si = swap_type_to_swap_info(type);
1058        pgoff_t offset;
1059
1060        if (!si)
1061                goto fail;
1062
1063        spin_lock(&si->lock);
1064        if (si->flags & SWP_WRITEOK) {
1065                atomic_long_dec(&nr_swap_pages);
 1066                /* This is called for allocating a swap entry, not swap cache */
1067                offset = scan_swap_map(si, 1);
1068                if (offset) {
1069                        spin_unlock(&si->lock);
1070                        return swp_entry(type, offset);
1071                }
1072                atomic_long_inc(&nr_swap_pages);
1073        }
1074        spin_unlock(&si->lock);
1075fail:
1076        return (swp_entry_t) {0};
1077}
1078
1079static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
1080{
1081        struct swap_info_struct *p;
1082        unsigned long offset, type;
1083
1084        if (!entry.val)
1085                goto out;
1086        type = swp_type(entry);
1087        p = swap_type_to_swap_info(type);
1088        if (!p)
1089                goto bad_nofile;
1090        if (!(p->flags & SWP_USED))
1091                goto bad_device;
1092        offset = swp_offset(entry);
1093        if (offset >= p->max)
1094                goto bad_offset;
1095        return p;
1096
1097bad_offset:
1098        pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
1099        goto out;
1100bad_device:
1101        pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
1102        goto out;
1103bad_nofile:
1104        pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
1105out:
1106        return NULL;
1107}
1108
1109static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
1110{
1111        struct swap_info_struct *p;
1112
1113        p = __swap_info_get(entry);
1114        if (!p)
1115                goto out;
1116        if (!p->swap_map[swp_offset(entry)])
1117                goto bad_free;
1118        return p;
1119
1120bad_free:
1121        pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
1122        goto out;
1123out:
1124        return NULL;
1125}
1126
1127static struct swap_info_struct *swap_info_get(swp_entry_t entry)
1128{
1129        struct swap_info_struct *p;
1130
1131        p = _swap_info_get(entry);
1132        if (p)
1133                spin_lock(&p->lock);
1134        return p;
1135}
1136
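     /*
      * Like swap_info_get(), but reuse the lock already held on @q when the
      * new entry belongs to the same device; used when freeing batches of
      * entries.
      */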
1137static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
1138                                        struct swap_info_struct *q)
1139{
1140        struct swap_info_struct *p;
1141
1142        p = _swap_info_get(entry);
1143
1144        if (p != q) {
1145                if (q != NULL)
1146                        spin_unlock(&q->lock);
1147                if (p != NULL)
1148                        spin_lock(&p->lock);
1149        }
1150        return p;
1151}
1152
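     /*
      * Drop one reference of kind @usage from the swap_map entry at @offset,
      * with the cluster lock or si->lock held.  Returns the remaining count
      * and cache bits; 0 means the caller should pass the entry to
      * free_swap_slot() (the slot stays pinned with SWAP_HAS_CACHE until it
      * is finally freed).
      */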
1153static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
1154                                              unsigned long offset,
1155                                              unsigned char usage)
1156{
1157        unsigned char count;
1158        unsigned char has_cache;
1159
1160        count = p->swap_map[offset];
1161
1162        has_cache = count & SWAP_HAS_CACHE;
1163        count &= ~SWAP_HAS_CACHE;
1164
1165        if (usage == SWAP_HAS_CACHE) {
1166                VM_BUG_ON(!has_cache);
1167                has_cache = 0;
1168        } else if (count == SWAP_MAP_SHMEM) {
1169                /*
1170                 * Or we could insist on shmem.c using a special
1171                 * swap_shmem_free() and free_shmem_swap_and_cache()...
1172                 */
1173                count = 0;
1174        } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
1175                if (count == COUNT_CONTINUED) {
1176                        if (swap_count_continued(p, offset, count))
1177                                count = SWAP_MAP_MAX | COUNT_CONTINUED;
1178                        else
1179                                count = SWAP_MAP_MAX;
1180                } else
1181                        count--;
1182        }
1183
1184        usage = count | has_cache;
1185        p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
1186
1187        return usage;
1188}
1189
1190static unsigned char __swap_entry_free(struct swap_info_struct *p,
1191                                       swp_entry_t entry, unsigned char usage)
1192{
1193        struct swap_cluster_info *ci;
1194        unsigned long offset = swp_offset(entry);
1195
1196        ci = lock_cluster_or_swap_info(p, offset);
1197        usage = __swap_entry_free_locked(p, offset, usage);
1198        unlock_cluster_or_swap_info(p, ci);
1199        if (!usage)
1200                free_swap_slot(entry);
1201
1202        return usage;
1203}
1204
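     /*
      * Finally release a swap entry whose only remaining state is
      * SWAP_HAS_CACHE: clear its swap_map slot, update the cluster
      * accounting, uncharge the memcg and return the slot to the free range.
      * Caller holds p->lock.
      */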
1205static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
1206{
1207        struct swap_cluster_info *ci;
1208        unsigned long offset = swp_offset(entry);
1209        unsigned char count;
1210
1211        ci = lock_cluster(p, offset);
1212        count = p->swap_map[offset];
1213        VM_BUG_ON(count != SWAP_HAS_CACHE);
1214        p->swap_map[offset] = 0;
1215        dec_cluster_info_page(p, p->cluster_info, offset);
1216        unlock_cluster(ci);
1217
1218        mem_cgroup_uncharge_swap(entry, 1);
1219        swap_range_free(p, offset, 1);
1220}
1221
1222/*
1223 * Caller has made sure that the swap device corresponding to entry
1224 * is still around or has not been recycled.
1225 */
1226void swap_free(swp_entry_t entry)
1227{
1228        struct swap_info_struct *p;
1229
1230        p = _swap_info_get(entry);
1231        if (p)
1232                __swap_entry_free(p, entry, 1);
1233}
1234
1235/*
 1236 * Called after dropping swapcache to decrease the refcount of swap entries.
1237 */
1238void put_swap_page(struct page *page, swp_entry_t entry)
1239{
1240        unsigned long offset = swp_offset(entry);
1241        unsigned long idx = offset / SWAPFILE_CLUSTER;
1242        struct swap_cluster_info *ci;
1243        struct swap_info_struct *si;
1244        unsigned char *map;
1245        unsigned int i, free_entries = 0;
1246        unsigned char val;
1247        int size = swap_entry_size(hpage_nr_pages(page));
1248
1249        si = _swap_info_get(entry);
1250        if (!si)
1251                return;
1252
1253        ci = lock_cluster_or_swap_info(si, offset);
1254        if (size == SWAPFILE_CLUSTER) {
1255                VM_BUG_ON(!cluster_is_huge(ci));
1256                map = si->swap_map + offset;
1257                for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1258                        val = map[i];
1259                        VM_BUG_ON(!(val & SWAP_HAS_CACHE));
1260                        if (val == SWAP_HAS_CACHE)
1261                                free_entries++;
1262                }
1263                cluster_clear_huge(ci);
1264                if (free_entries == SWAPFILE_CLUSTER) {
1265                        unlock_cluster_or_swap_info(si, ci);
1266                        spin_lock(&si->lock);
1267                        mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
1268                        swap_free_cluster(si, idx);
1269                        spin_unlock(&si->lock);
1270                        return;
1271                }
1272        }
1273        for (i = 0; i < size; i++, entry.val++) {
1274                if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
1275                        unlock_cluster_or_swap_info(si, ci);
1276                        free_swap_slot(entry);
1277                        if (i == size - 1)
1278                                return;
1279                        lock_cluster_or_swap_info(si, offset);
1280                }
1281        }
1282        unlock_cluster_or_swap_info(si, ci);
1283}
1284
1285#ifdef CONFIG_THP_SWAP
1286int split_swap_cluster(swp_entry_t entry)
1287{
1288        struct swap_info_struct *si;
1289        struct swap_cluster_info *ci;
1290        unsigned long offset = swp_offset(entry);
1291
1292        si = _swap_info_get(entry);
1293        if (!si)
1294                return -EBUSY;
1295        ci = lock_cluster(si, offset);
1296        cluster_clear_huge(ci);
1297        unlock_cluster(ci);
1298        return 0;
1299}
1300#endif
1301
1302static int swp_entry_cmp(const void *ent1, const void *ent2)
1303{
1304        const swp_entry_t *e1 = ent1, *e2 = ent2;
1305
1306        return (int)swp_type(*e1) - (int)swp_type(*e2);
1307}
1308
1309void swapcache_free_entries(swp_entry_t *entries, int n)
1310{
1311        struct swap_info_struct *p, *prev;
1312        int i;
1313
1314        if (n <= 0)
1315                return;
1316
1317        prev = NULL;
1318        p = NULL;
1319
1320        /*
1321         * Sort swap entries by swap device, so each lock is only taken once.
1322         * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
1323         * so low that it isn't necessary to optimize further.
1324         */
1325        if (nr_swapfiles > 1)
1326                sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
1327        for (i = 0; i < n; ++i) {
1328                p = swap_info_get_cont(entries[i], prev);
1329                if (p)
1330                        swap_entry_free(p, entries[i]);
1331                prev = p;
1332        }
1333        if (p)
1334                spin_unlock(&p->lock);
1335}
1336
1337/*
1338 * How many references to page are currently swapped out?
1339 * This does not give an exact answer when swap count is continued,
1340 * but does include the high COUNT_CONTINUED flag to allow for that.
1341 */
1342int page_swapcount(struct page *page)
1343{
1344        int count = 0;
1345        struct swap_info_struct *p;
1346        struct swap_cluster_info *ci;
1347        swp_entry_t entry;
1348        unsigned long offset;
1349
1350        entry.val = page_private(page);
1351        p = _swap_info_get(entry);
1352        if (p) {
1353                offset = swp_offset(entry);
1354                ci = lock_cluster_or_swap_info(p, offset);
1355                count = swap_count(p->swap_map[offset]);
1356                unlock_cluster_or_swap_info(p, ci);
1357        }
1358        return count;
1359}
1360
1361int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
1362{
1363        pgoff_t offset = swp_offset(entry);
1364
1365        return swap_count(si->swap_map[offset]);
1366}
1367
1368static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
1369{
1370        int count = 0;
1371        pgoff_t offset = swp_offset(entry);
1372        struct swap_cluster_info *ci;
1373
1374        ci = lock_cluster_or_swap_info(si, offset);
1375        count = swap_count(si->swap_map[offset]);
1376        unlock_cluster_or_swap_info(si, ci);
1377        return count;
1378}
1379
1380/*
1381 * How many references to @entry are currently swapped out?
1382 * This does not give an exact answer when swap count is continued,
1383 * but does include the high COUNT_CONTINUED flag to allow for that.
1384 */
1385int __swp_swapcount(swp_entry_t entry)
1386{
1387        int count = 0;
1388        struct swap_info_struct *si;
1389
1390        si = __swap_info_get(entry);
1391        if (si)
1392                count = swap_swapcount(si, entry);
1393        return count;
1394}
1395
1396/*
1397 * How many references to @entry are currently swapped out?
1398 * This considers COUNT_CONTINUED so it returns exact answer.
1399 */
1400int swp_swapcount(swp_entry_t entry)
1401{
1402        int count, tmp_count, n;
1403        struct swap_info_struct *p;
1404        struct swap_cluster_info *ci;
1405        struct page *page;
1406        pgoff_t offset;
1407        unsigned char *map;
1408
1409        p = _swap_info_get(entry);
1410        if (!p)
1411                return 0;
1412
1413        offset = swp_offset(entry);
1414
1415        ci = lock_cluster_or_swap_info(p, offset);
1416
1417        count = swap_count(p->swap_map[offset]);
1418        if (!(count & COUNT_CONTINUED))
1419                goto out;
1420
1421        count &= ~COUNT_CONTINUED;
1422        n = SWAP_MAP_MAX + 1;
1423
1424        page = vmalloc_to_page(p->swap_map + offset);
1425        offset &= ~PAGE_MASK;
1426        VM_BUG_ON(page_private(page) != SWP_CONTINUED);
1427
1428        do {
1429                page = list_next_entry(page, lru);
1430                map = kmap_atomic(page);
1431                tmp_count = map[offset];
1432                kunmap_atomic(map);
1433
1434                count += (tmp_count & ~COUNT_CONTINUED) * n;
1435                n *= (SWAP_CONT_MAX + 1);
1436        } while (tmp_count & COUNT_CONTINUED);
1437out:
1438        unlock_cluster_or_swap_info(p, ci);
1439        return count;
1440}
1441
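     /*
      * Return true if the swap entry is still in use (swap count non-zero).
      * For a huge swap cluster, every entry in the cluster is checked;
      * otherwise only the entry itself.
      */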
1442static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
1443                                         swp_entry_t entry)
1444{
1445        struct swap_cluster_info *ci;
1446        unsigned char *map = si->swap_map;
1447        unsigned long roffset = swp_offset(entry);
1448        unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
1449        int i;
1450        bool ret = false;
1451
1452        ci = lock_cluster_or_swap_info(si, offset);
1453        if (!ci || !cluster_is_huge(ci)) {
1454                if (swap_count(map[roffset]))
1455                        ret = true;
1456                goto unlock_out;
1457        }
1458        for (i = 0; i < SWAPFILE_CLUSTER; i++) {
1459                if (swap_count(map[offset + i])) {
1460                        ret = true;
1461                        break;
1462                }
1463        }
1464unlock_out:
1465        unlock_cluster_or_swap_info(si, ci);
1466        return ret;
1467}
1468
1469static bool page_swapped(struct page *page)
1470{
1471        swp_entry_t entry;
1472        struct swap_info_struct *si;
1473
1474        if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
1475                return page_swapcount(page) != 0;
1476
1477        page = compound_head(page);
1478        entry.val = page_private(page);
1479        si = _swap_info_get(entry);
1480        if (si)
1481                return swap_page_trans_huge_swapped(si, entry);
1482        return false;
1483}
1484
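     /*
      * Compute the maximum of mapcount + swapcount over all sub-pages of a
      * (possibly transparent huge) page, optionally returning the totals.
      * Used by reuse_swap_page() to decide whether the page has a single user.
      */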
1485static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
1486                                         int *total_swapcount)
1487{
1488        int i, map_swapcount, _total_mapcount, _total_swapcount;
1489        unsigned long offset = 0;
1490        struct swap_info_struct *si;
1491        struct swap_cluster_info *ci = NULL;
1492        unsigned char *map = NULL;
1493        int mapcount, swapcount = 0;
1494
1495        /* hugetlbfs shouldn't call it */
1496        VM_BUG_ON_PAGE(PageHuge(page), page);
1497
1498        if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
1499                mapcount = page_trans_huge_mapcount(page, total_mapcount);
1500                if (PageSwapCache(page))
1501                        swapcount = page_swapcount(page);
1502                if (total_swapcount)
1503                        *total_swapcount = swapcount;
1504                return mapcount + swapcount;
1505        }
1506
1507        page = compound_head(page);
1508
1509        _total_mapcount = _total_swapcount = map_swapcount = 0;
1510        if (PageSwapCache(page)) {
1511                swp_entry_t entry;
1512
1513                entry.val = page_private(page);
1514                si = _swap_info_get(entry);
1515                if (si) {
1516                        map = si->swap_map;
1517                        offset = swp_offset(entry);
1518                }
1519        }
1520        if (map)
1521                ci = lock_cluster(si, offset);
1522        for (i = 0; i < HPAGE_PMD_NR; i++) {
1523                mapcount = atomic_read(&page[i]._mapcount) + 1;
1524                _total_mapcount += mapcount;
1525                if (map) {
1526                        swapcount = swap_count(map[offset + i]);
1527                        _total_swapcount += swapcount;
1528                }
1529                map_swapcount = max(map_swapcount, mapcount + swapcount);
1530        }
1531        unlock_cluster(ci);
1532        if (PageDoubleMap(page)) {
1533                map_swapcount -= 1;
1534                _total_mapcount -= HPAGE_PMD_NR;
1535        }
1536        mapcount = compound_mapcount(page);
1537        map_swapcount += mapcount;
1538        _total_mapcount += mapcount;
1539        if (total_mapcount)
1540                *total_mapcount = _total_mapcount;
1541        if (total_swapcount)
1542                *total_swapcount = _total_swapcount;
1543
1544        return map_swapcount;
1545}
1546
1547/*
1548 * We can write to an anon page without COW if there are no other references
1549 * to it.  And as a side-effect, free up its swap: because the old content
1550 * on disk will never be read, and seeking back there to write new content
1551 * later would only waste time away from clustering.
1552 *
1553 * NOTE: total_map_swapcount should not be relied upon by the caller if
1554 * reuse_swap_page() returns false, but it may always be overwritten
1555 * (see the other implementation for CONFIG_SWAP=n).
1556 */
1557bool reuse_swap_page(struct page *page, int *total_map_swapcount)
1558{
1559        int count, total_mapcount, total_swapcount;
1560
1561        VM_BUG_ON_PAGE(!PageLocked(page), page);
1562        if (unlikely(PageKsm(page)))
1563                return false;
1564        count = page_trans_huge_map_swapcount(page, &total_mapcount,
1565                                              &total_swapcount);
1566        if (total_map_swapcount)
1567                *total_map_swapcount = total_mapcount + total_swapcount;
1568        if (count == 1 && PageSwapCache(page) &&
1569            (likely(!PageTransCompound(page)) ||
1570             /* The remaining swap count will be freed soon */
1571             total_swapcount == page_swapcount(page))) {
1572                if (!PageWriteback(page)) {
1573                        page = compound_head(page);
1574                        delete_from_swap_cache(page);
1575                        SetPageDirty(page);
1576                } else {
1577                        swp_entry_t entry;
1578                        struct swap_info_struct *p;
1579
1580                        entry.val = page_private(page);
1581                        p = swap_info_get(entry);
1582                        if (p->flags & SWP_STABLE_WRITES) {
1583                                spin_unlock(&p->lock);
1584                                return false;
1585                        }
1586                        spin_unlock(&p->lock);
1587                }
1588        }
1589
1590        return count <= 1;
1591}
1592
1593/*
1594 * If swap is getting full, or if there are no more mappings of this page,
1595 * then try_to_free_swap is called to free its swap space.
1596 */
1597int try_to_free_swap(struct page *page)
1598{
1599        VM_BUG_ON_PAGE(!PageLocked(page), page);
1600
1601        if (!PageSwapCache(page))
1602                return 0;
1603        if (PageWriteback(page))
1604                return 0;
1605        if (page_swapped(page))
1606                return 0;
1607
1608        /*
1609         * Once hibernation has begun to create its image of memory,
1610         * there's a danger that one of the calls to try_to_free_swap()
1611         * - most probably a call from __try_to_reclaim_swap() while
1612         * hibernation is allocating its own swap pages for the image,
1613         * but conceivably even a call from memory reclaim - will free
1614         * the swap from a page which has already been recorded in the
1615         * image as a clean swapcache page, and then reuse its swap for
1616         * another page of the image.  On waking from hibernation, the
1617         * original page might be freed under memory pressure, then
1618         * later read back in from swap, now with the wrong data.
1619         *
1620         * Hibernation suspends storage while it is writing the image
1621         * to disk so check that here.
1622         */
1623        if (pm_suspended_storage())
1624                return 0;
1625
1626        page = compound_head(page);
1627        delete_from_swap_cache(page);
1628        SetPageDirty(page);
1629        return 1;
1630}
1631
1632/*
1633 * Free the swap entry like above, but also try to
1634 * reclaim the swap cache page if this was the last swap reference.
1635 */
1636int free_swap_and_cache(swp_entry_t entry)
1637{
1638        struct swap_info_struct *p;
1639        unsigned char count;
1640
1641        if (non_swap_entry(entry))
1642                return 1;
1643
1644        p = _swap_info_get(entry);
1645        if (p) {
1646                count = __swap_entry_free(p, entry, 1);
1647                if (count == SWAP_HAS_CACHE &&
1648                    !swap_page_trans_huge_swapped(p, entry))
1649                        __try_to_reclaim_swap(p, swp_offset(entry),
1650                                              TTRS_UNMAPPED | TTRS_FULL);
1651        }
1652        return p != NULL;
1653}
1654
1655#ifdef CONFIG_HIBERNATION
1656/*
1657 * Find the swap type that corresponds to the given device (if any).
1658 *
1659 * @offset - number of the PAGE_SIZE-sized block of the device, starting
1660 * from 0, in which the swap header is expected to be located.
1661 *
1662 * This is needed for the suspend to disk (aka swsusp).
1663 */
1664int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
1665{
1666        struct block_device *bdev = NULL;
1667        int type;
1668
1669        if (device)
1670                bdev = bdget(device);
1671
1672        spin_lock(&swap_lock);
1673        for (type = 0; type < nr_swapfiles; type++) {
1674                struct swap_info_struct *sis = swap_info[type];
1675
1676                if (!(sis->flags & SWP_WRITEOK))
1677                        continue;
1678
1679                if (!bdev) {
1680                        if (bdev_p)
1681                                *bdev_p = bdgrab(sis->bdev);
1682
1683                        spin_unlock(&swap_lock);
1684                        return type;
1685                }
1686                if (bdev == sis->bdev) {
1687                        struct swap_extent *se = &sis->first_swap_extent;
1688
1689                        if (se->start_block == offset) {
1690                                if (bdev_p)
1691                                        *bdev_p = bdgrab(sis->bdev);
1692
1693                                spin_unlock(&swap_lock);
1694                                bdput(bdev);
1695                                return type;
1696                        }
1697                }
1698        }
1699        spin_unlock(&swap_lock);
1700        if (bdev)
1701                bdput(bdev);
1702
1703        return -ENODEV;
1704}
1705
1706/*
1707 * Get the (PAGE_SIZE) block corresponding to the given offset on the
1708 * swap device identified by the given swap_info index (swap type).
1709 */
1710sector_t swapdev_block(int type, pgoff_t offset)
1711{
1712        struct block_device *bdev;
1713        struct swap_info_struct *si = swap_type_to_swap_info(type);
1714
1715        if (!si || !(si->flags & SWP_WRITEOK))
1716                return 0;
1717        return map_swap_entry(swp_entry(type, offset), &bdev);
1718}
1719
1720/*
1721 * Return either the total number of swap pages of the given type, or the
1722 * number of free pages of that type (depending on @free).
1723 *
1724 * This is needed for software suspend
1725 */
1726unsigned int count_swap_pages(int type, int free)
1727{
1728        unsigned int n = 0;
1729
1730        spin_lock(&swap_lock);
1731        if ((unsigned int)type < nr_swapfiles) {
1732                struct swap_info_struct *sis = swap_info[type];
1733
1734                spin_lock(&sis->lock);
1735                if (sis->flags & SWP_WRITEOK) {
1736                        n = sis->pages;
1737                        if (free)
1738                                n -= sis->inuse_pages;
1739                }
1740                spin_unlock(&sis->lock);
1741        }
1742        spin_unlock(&swap_lock);
1743        return n;
1744}
1745#endif /* CONFIG_HIBERNATION */
1746
1747static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
1748{
1749        return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
1750}
1751
1752/*
1753 * No need to decide whether this PTE shares the swap entry with others,
1754 * just let do_wp_page work it out if a write is requested later - to
1755 * force COW, vm_page_prot omits write permission from any private vma.
1756 */
1757static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
1758                unsigned long addr, swp_entry_t entry, struct page *page)
1759{
1760        struct page *swapcache;
1761        struct mem_cgroup *memcg;
1762        spinlock_t *ptl;
1763        pte_t *pte;
1764        int ret = 1;
1765
1766        swapcache = page;
1767        page = ksm_might_need_to_copy(page, vma, addr);
1768        if (unlikely(!page))
1769                return -ENOMEM;
1770
1771        if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
1772                                &memcg, false)) {
1773                ret = -ENOMEM;
1774                goto out_nolock;
1775        }
1776
1777        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1778        if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
1779                mem_cgroup_cancel_charge(page, memcg, false);
1780                ret = 0;
1781                goto out;
1782        }
1783
1784        dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
1785        inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
1786        get_page(page);
1787        set_pte_at(vma->vm_mm, addr, pte,
1788                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
1789        if (page == swapcache) {
1790                page_add_anon_rmap(page, vma, addr, false);
1791                mem_cgroup_commit_charge(page, memcg, true, false);
1792        } else { /* ksm created a completely new copy */
1793                page_add_new_anon_rmap(page, vma, addr, false);
1794                mem_cgroup_commit_charge(page, memcg, false, false);
1795                lru_cache_add_active_or_unevictable(page, vma);
1796        }
1797        swap_free(entry);
1798        /*
1799         * Move the page to the active list so it is not
1800         * immediately swapped out again after swapon.
1801         */
1802        activate_page(page);
1803out:
1804        pte_unmap_unlock(pte, ptl);
1805out_nolock:
1806        if (page != swapcache) {
1807                unlock_page(page);
1808                put_page(page);
1809        }
1810        return ret;
1811}
1812
1813static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1814                        unsigned long addr, unsigned long end,
1815                        unsigned int type, bool frontswap,
1816                        unsigned long *fs_pages_to_unuse)
1817{
1818        struct page *page;
1819        swp_entry_t entry;
1820        pte_t *pte;
1821        struct swap_info_struct *si;
1822        unsigned long offset;
1823        int ret = 0;
1824        volatile unsigned char *swap_map;
1825
1826        si = swap_info[type];
1827        pte = pte_offset_map(pmd, addr);
1828        do {
1829                struct vm_fault vmf;
1830
1831                if (!is_swap_pte(*pte))
1832                        continue;
1833
1834                entry = pte_to_swp_entry(*pte);
1835                if (swp_type(entry) != type)
1836                        continue;
1837
1838                offset = swp_offset(entry);
1839                if (frontswap && !frontswap_test(si, offset))
1840                        continue;
1841
1842                pte_unmap(pte);
1843                swap_map = &si->swap_map[offset];
1844                vmf.vma = vma;
1845                vmf.address = addr;
1846                vmf.pmd = pmd;
1847                page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
1848                if (!page) {
1849                        if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
1850                                goto try_next;
1851                        return -ENOMEM;
1852                }
1853
1854                lock_page(page);
1855                wait_on_page_writeback(page);
1856                ret = unuse_pte(vma, pmd, addr, entry, page);
1857                if (ret < 0) {
1858                        unlock_page(page);
1859                        put_page(page);
1860                        goto out;
1861                }
1862
1863                try_to_free_swap(page);
1864                unlock_page(page);
1865                put_page(page);
1866
1867                if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
1868                        ret = FRONTSWAP_PAGES_UNUSED;
1869                        goto out;
1870                }
1871try_next:
1872                pte = pte_offset_map(pmd, addr);
1873        } while (pte++, addr += PAGE_SIZE, addr != end);
1874        pte_unmap(pte - 1);
1875
1876        ret = 0;
1877out:
1878        return ret;
1879}
1880
1881static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
1882                                unsigned long addr, unsigned long end,
1883                                unsigned int type, bool frontswap,
1884                                unsigned long *fs_pages_to_unuse)
1885{
1886        pmd_t *pmd;
1887        unsigned long next;
1888        int ret;
1889
1890        pmd = pmd_offset(pud, addr);
1891        do {
1892                cond_resched();
1893                next = pmd_addr_end(addr, end);
1894                if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1895                        continue;
1896                ret = unuse_pte_range(vma, pmd, addr, next, type,
1897                                      frontswap, fs_pages_to_unuse);
1898                if (ret)
1899                        return ret;
1900        } while (pmd++, addr = next, addr != end);
1901        return 0;
1902}
1903
1904static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
1905                                unsigned long addr, unsigned long end,
1906                                unsigned int type, bool frontswap,
1907                                unsigned long *fs_pages_to_unuse)
1908{
1909        pud_t *pud;
1910        unsigned long next;
1911        int ret;
1912
1913        pud = pud_offset(p4d, addr);
1914        do {
1915                next = pud_addr_end(addr, end);
1916                if (pud_none_or_clear_bad(pud))
1917                        continue;
1918                ret = unuse_pmd_range(vma, pud, addr, next, type,
1919                                      frontswap, fs_pages_to_unuse);
1920                if (ret)
1921                        return ret;
1922        } while (pud++, addr = next, addr != end);
1923        return 0;
1924}
1925
1926static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
1927                                unsigned long addr, unsigned long end,
1928                                unsigned int type, bool frontswap,
1929                                unsigned long *fs_pages_to_unuse)
1930{
1931        p4d_t *p4d;
1932        unsigned long next;
1933        int ret;
1934
1935        p4d = p4d_offset(pgd, addr);
1936        do {
1937                next = p4d_addr_end(addr, end);
1938                if (p4d_none_or_clear_bad(p4d))
1939                        continue;
1940                ret = unuse_pud_range(vma, p4d, addr, next, type,
1941                                      frontswap, fs_pages_to_unuse);
1942                if (ret)
1943                        return ret;
1944        } while (p4d++, addr = next, addr != end);
1945        return 0;
1946}
1947
1948static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
1949                     bool frontswap, unsigned long *fs_pages_to_unuse)
1950{
1951        pgd_t *pgd;
1952        unsigned long addr, end, next;
1953        int ret;
1954
1955        addr = vma->vm_start;
1956        end = vma->vm_end;
1957
1958        pgd = pgd_offset(vma->vm_mm, addr);
1959        do {
1960                next = pgd_addr_end(addr, end);
1961                if (pgd_none_or_clear_bad(pgd))
1962                        continue;
1963                ret = unuse_p4d_range(vma, pgd, addr, next, type,
1964                                      frontswap, fs_pages_to_unuse);
1965                if (ret)
1966                        return ret;
1967        } while (pgd++, addr = next, addr != end);
1968        return 0;
1969}
1970
1971static int unuse_mm(struct mm_struct *mm, unsigned int type,
1972                    bool frontswap, unsigned long *fs_pages_to_unuse)
1973{
1974        struct vm_area_struct *vma;
1975        int ret = 0;
1976
1977        down_read(&mm->mmap_sem);
1978        for (vma = mm->mmap; vma; vma = vma->vm_next) {
1979                if (vma->anon_vma) {
1980                        ret = unuse_vma(vma, type, frontswap,
1981                                        fs_pages_to_unuse);
1982                        if (ret)
1983                                break;
1984                }
1985                cond_resched();
1986        }
1987        up_read(&mm->mmap_sem);
1988        return ret;
1989}
1990
1991/*
1992 * Scan swap_map (or frontswap_map if frontswap parameter is true)
1993 * from the current position to the next entry still in use. Return 0
1994 * if there are no in-use entries after prev until the end of the map.
1995 */
1996static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1997                                        unsigned int prev, bool frontswap)
1998{
1999        unsigned int i;
2000        unsigned char count;
2001
2002        /*
2003         * No need for swap_lock here: we're just looking
2004         * for whether an entry is in use, not modifying it; false
2005         * hits are okay, and sys_swapoff() has already prevented new
2006         * allocations from this area (while holding swap_lock).
2007         */
2008        for (i = prev + 1; i < si->max; i++) {
2009                count = READ_ONCE(si->swap_map[i]);
2010                if (count && swap_count(count) != SWAP_MAP_BAD)
2011                        if (!frontswap || frontswap_test(si, i))
2012                                break;
2013                if ((i % LATENCY_LIMIT) == 0)
2014                        cond_resched();
2015        }
2016
2017        if (i == si->max)
2018                i = 0;
2019
2020        return i;
2021}
2022
2023/*
2024 * If the boolean frontswap is true, only unuse pages_to_unuse pages;
2025 * pages_to_unuse==0 means all pages; ignored if frontswap is false
2026 */
2027int try_to_unuse(unsigned int type, bool frontswap,
2028                 unsigned long pages_to_unuse)
2029{
2030        struct mm_struct *prev_mm;
2031        struct mm_struct *mm;
2032        struct list_head *p;
2033        int retval = 0;
2034        struct swap_info_struct *si = swap_info[type];
2035        struct page *page;
2036        swp_entry_t entry;
2037        unsigned int i;
2038
2039        if (!si->inuse_pages)
2040                return 0;
2041
2042        if (!frontswap)
2043                pages_to_unuse = 0;
2044
2045retry:
2046        retval = shmem_unuse(type, frontswap, &pages_to_unuse);
2047        if (retval)
2048                goto out;
2049
2050        prev_mm = &init_mm;
2051        mmget(prev_mm);
2052
2053        spin_lock(&mmlist_lock);
2054        p = &init_mm.mmlist;
2055        while (si->inuse_pages &&
2056               !signal_pending(current) &&
2057               (p = p->next) != &init_mm.mmlist) {
2058
2059                mm = list_entry(p, struct mm_struct, mmlist);
2060                if (!mmget_not_zero(mm))
2061                        continue;
2062                spin_unlock(&mmlist_lock);
2063                mmput(prev_mm);
2064                prev_mm = mm;
2065                retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
2066
2067                if (retval) {
2068                        mmput(prev_mm);
2069                        goto out;
2070                }
2071
2072                /*
2073                 * Make sure that we aren't completely killing
2074                 * interactive performance.
2075                 */
2076                cond_resched();
2077                spin_lock(&mmlist_lock);
2078        }
2079        spin_unlock(&mmlist_lock);
2080
2081        mmput(prev_mm);
2082
2083        i = 0;
2084        while (si->inuse_pages &&
2085               !signal_pending(current) &&
2086               (i = find_next_to_unuse(si, i, frontswap)) != 0) {
2087
2088                entry = swp_entry(type, i);
2089                page = find_get_page(swap_address_space(entry), i);
2090                if (!page)
2091                        continue;
2092
2093                /*
2094                 * It is conceivable that a racing task removed this page from
2095                 * swap cache just before we acquired the page lock. The page
2096                 * might even be back in swap cache on another swap area. But
2097                 * that is okay, try_to_free_swap() only removes stale pages.
2098                 */
2099                lock_page(page);
2100                wait_on_page_writeback(page);
2101                try_to_free_swap(page);
2102                unlock_page(page);
2103                put_page(page);
2104
2105                /*
2106                 * For frontswap, we just need to unuse pages_to_unuse, if
2107                 * it was specified. Need not check frontswap again here as
2108                 * we already zeroed out pages_to_unuse if not frontswap.
2109                 */
2110                if (pages_to_unuse && --pages_to_unuse == 0)
2111                        goto out;
2112        }
2113
2114        /*
2115         * Let's check again to see if there are still swap entries in the map.
2116         * If so, we need to retry the unuse logic.
2117         * Under global memory pressure, swap entries can be reinserted back
2118         * into process space after the mmlist loop above passes over them.
2119         *
2120         * Limit the number of retries? No: when mmget_not_zero() above fails,
2121         * that mm is likely to be freeing swap from exit_mmap(), which proceeds
2122         * at its own independent pace; and even shmem_writepage() could have
2123         * been preempted after get_swap_page(), temporarily hiding that swap.
2124         * It's easy and robust (though cpu-intensive) just to keep retrying.
2125         */
2126        if (si->inuse_pages) {
2127                if (!signal_pending(current))
2128                        goto retry;
2129                retval = -EINTR;
2130        }
2131out:
2132        return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
2133}
2134
2135/*
2136 * After a successful try_to_unuse, if no swap is now in use, we know
2137 * we can empty the mmlist.  swap_lock must be held on entry and exit.
2138 * Note that mmlist_lock nests inside swap_lock, and an mm must be
2139 * added to the mmlist just after swap_duplicate - before would be racy.
2140 */
2141static void drain_mmlist(void)
2142{
2143        struct list_head *p, *next;
2144        unsigned int type;
2145
2146        for (type = 0; type < nr_swapfiles; type++)
2147                if (swap_info[type]->inuse_pages)
2148                        return;
2149        spin_lock(&mmlist_lock);
2150        list_for_each_safe(p, next, &init_mm.mmlist)
2151                list_del_init(p);
2152        spin_unlock(&mmlist_lock);
2153}
2154
2155/*
2156 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
2157 * corresponds to page offset for the specified swap entry.
2158 * Note that the return type of this function is sector_t, but it returns the
2159 * page offset into the bdev, not a sector offset.
2160 */
2161static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
2162{
2163        struct swap_info_struct *sis;
2164        struct swap_extent *start_se;
2165        struct swap_extent *se;
2166        pgoff_t offset;
2167
2168        sis = swp_swap_info(entry);
2169        *bdev = sis->bdev;
2170
2171        offset = swp_offset(entry);
2172        start_se = sis->curr_swap_extent;
2173        se = start_se;
2174
2175        for ( ; ; ) {
2176                if (se->start_page <= offset &&
2177                                offset < (se->start_page + se->nr_pages)) {
2178                        return se->start_block + (offset - se->start_page);
2179                }
2180                se = list_next_entry(se, list);
2181                sis->curr_swap_extent = se;
2182                BUG_ON(se == start_se);         /* It *must* be present */
2183        }
2184}
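
/*
 * Illustrative note (not part of the original file): the page offset
 * returned by map_swap_entry()/map_swap_page() still has to be scaled to
 * 512-byte block layer sectors before it can be used for I/O.  A minimal
 * sketch of that conversion, as done when building a swap bio:
 *
 *	struct block_device *bdev;
 *	sector_t sector;
 *
 *	sector = map_swap_page(page, &bdev);
 *	sector <<= PAGE_SHIFT - 9;		// pages -> 512-byte sectors
 *	bio->bi_iter.bi_sector = sector;
 */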
2185
2186/*
2187 * Returns the page offset into bdev for the specified page's swap entry.
2188 */
2189sector_t map_swap_page(struct page *page, struct block_device **bdev)
2190{
2191        swp_entry_t entry;
2192        entry.val = page_private(page);
2193        return map_swap_entry(entry, bdev);
2194}
2195
2196/*
2197 * Free all of a swapdev's extent information
2198 */
2199static void destroy_swap_extents(struct swap_info_struct *sis)
2200{
2201        while (!list_empty(&sis->first_swap_extent.list)) {
2202                struct swap_extent *se;
2203
2204                se = list_first_entry(&sis->first_swap_extent.list,
2205                                struct swap_extent, list);
2206                list_del(&se->list);
2207                kfree(se);
2208        }
2209
2210        if (sis->flags & SWP_ACTIVATED) {
2211                struct file *swap_file = sis->swap_file;
2212                struct address_space *mapping = swap_file->f_mapping;
2213
2214                sis->flags &= ~SWP_ACTIVATED;
2215                if (mapping->a_ops->swap_deactivate)
2216                        mapping->a_ops->swap_deactivate(swap_file);
2217        }
2218}
2219
2220/*
2221 * Add a block range (and the corresponding page range) into this swapdev's
2222 * extent list.  The extent list is kept sorted in page order.
2223 *
2224 * This function rather assumes that it is called in ascending page order.
2225 */
2226int
2227add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
2228                unsigned long nr_pages, sector_t start_block)
2229{
2230        struct swap_extent *se;
2231        struct swap_extent *new_se;
2232        struct list_head *lh;
2233
2234        if (start_page == 0) {
2235                se = &sis->first_swap_extent;
2236                sis->curr_swap_extent = se;
2237                se->start_page = 0;
2238                se->nr_pages = nr_pages;
2239                se->start_block = start_block;
2240                return 1;
2241        } else {
2242                lh = sis->first_swap_extent.list.prev;  /* Highest extent */
2243                se = list_entry(lh, struct swap_extent, list);
2244                BUG_ON(se->start_page + se->nr_pages != start_page);
2245                if (se->start_block + se->nr_pages == start_block) {
2246                        /* Merge it */
2247                        se->nr_pages += nr_pages;
2248                        return 0;
2249                }
2250        }
2251
2252        /*
2253         * No merge.  Insert a new extent, preserving ordering.
2254         */
2255        new_se = kmalloc(sizeof(*se), GFP_KERNEL);
2256        if (new_se == NULL)
2257                return -ENOMEM;
2258        new_se->start_page = start_page;
2259        new_se->nr_pages = nr_pages;
2260        new_se->start_block = start_block;
2261
2262        list_add_tail(&new_se->list, &sis->first_swap_extent.list);
2263        return 1;
2264}
2265EXPORT_SYMBOL_GPL(add_swap_extent);
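
/*
 * Worked example for add_swap_extent() (illustrative values): if the current
 * highest extent is { start_page = 0, nr_pages = 256, start_block = 1000 },
 * then add_swap_extent(sis, 256, 128, 1256) merges into it (nr_pages becomes
 * 384) because 1000 + 256 == 1256, whereas add_swap_extent(sis, 256, 128,
 * 2000) allocates and appends a new extent instead.
 */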
2266
2267/*
2268 * A `swap extent' is a simple thing which maps a contiguous range of pages
2269 * onto a contiguous range of disk blocks.  An ordered list of swap extents
2270 * is built at swapon time and is then used at swap_writepage/swap_readpage
2271 * time for locating where on disk a page belongs.
2272 *
2273 * If the swapfile is an S_ISBLK block device, a single extent is installed.
2274 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
2275 * swap files identically.
2276 *
2277 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
2278 * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
2279 * swapfiles are handled *identically* after swapon time.
2280 *
2281 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
2282 * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
2283 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
2284 * requirements, they are simply tossed out - we will never use those blocks
2285 * for swapping.
2286 *
2287 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon.  This
2288 * prevents root from shooting her foot off by ftruncating an in-use swapfile,
2289 * which will scribble on the fs.
2290 *
2291 * The amount of disk space which a single swap extent represents varies.
2292 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
2293 * extents in the list.  To avoid much list walking, we cache the previous
2294 * search location in `curr_swap_extent', and start new searches from there.
2295 * This is extremely effective.  The average number of iterations in
2296 * map_swap_page() has been measured at about 0.3 per page.  - akpm.
2297 */
2298static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
2299{
2300        struct file *swap_file = sis->swap_file;
2301        struct address_space *mapping = swap_file->f_mapping;
2302        struct inode *inode = mapping->host;
2303        int ret;
2304
2305        if (S_ISBLK(inode->i_mode)) {
2306                ret = add_swap_extent(sis, 0, sis->max, 0);
2307                *span = sis->pages;
2308                return ret;
2309        }
2310
2311        if (mapping->a_ops->swap_activate) {
2312                ret = mapping->a_ops->swap_activate(sis, swap_file, span);
2313                if (ret >= 0)
2314                        sis->flags |= SWP_ACTIVATED;
2315                if (!ret) {
2316                        sis->flags |= SWP_FS;
2317                        ret = add_swap_extent(sis, 0, sis->max, 0);
2318                        *span = sis->pages;
2319                }
2320                return ret;
2321        }
2322
2323        return generic_swapfile_activate(sis, swap_file, span);
2324}
2325
2326static int swap_node(struct swap_info_struct *p)
2327{
2328        struct block_device *bdev;
2329
2330        if (p->bdev)
2331                bdev = p->bdev;
2332        else
2333                bdev = p->swap_file->f_inode->i_sb->s_bdev;
2334
2335        return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
2336}
2337
2338static void _enable_swap_info(struct swap_info_struct *p, int prio,
2339                                unsigned char *swap_map,
2340                                struct swap_cluster_info *cluster_info)
2341{
2342        int i;
2343
2344        if (prio >= 0)
2345                p->prio = prio;
2346        else
2347                p->prio = --least_priority;
2348        /*
2349         * the plist prio is negated because plist ordering is
2350         * low-to-high, while swap ordering is high-to-low
2351         */
2352        p->list.prio = -p->prio;
2353        for_each_node(i) {
2354                if (p->prio >= 0)
2355                        p->avail_lists[i].prio = -p->prio;
2356                else {
2357                        if (swap_node(p) == i)
2358                                p->avail_lists[i].prio = 1;
2359                        else
2360                                p->avail_lists[i].prio = -p->prio;
2361                }
2362        }
2363        p->swap_map = swap_map;
2364        p->cluster_info = cluster_info;
2365        p->flags |= SWP_WRITEOK;
2366        atomic_long_add(p->pages, &nr_swap_pages);
2367        total_swap_pages += p->pages;
2368
2369        assert_spin_locked(&swap_lock);
2370        /*
2371         * both lists are plists, and thus priority ordered.
2372         * swap_active_head needs to be priority ordered for swapoff(),
2373         * which on removal of any swap_info_struct with an auto-assigned
2374         * (i.e. negative) priority increments the auto-assigned priority
2375         * of any lower-priority swap_info_structs.
2376         * swap_avail_head needs to be priority ordered for get_swap_page(),
2377         * which allocates swap pages from the highest available priority
2378         * swap_info_struct.
2379         */
2380        plist_add(&p->list, &swap_active_head);
2381        add_to_avail_list(p);
2382}
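
/*
 * Worked example of the priority negation above (illustrative): swap devices
 * with user priorities 10 and 5 get list.prio == -10 and -5, while an
 * auto-assigned device with prio -1 gets list.prio == 1.  Since plists sort
 * low-to-high, the walk order is -10, -5, 1: highest swap priority first,
 * with auto-assigned devices always after user-prioritized ones, in swapon
 * order.
 */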
2383
2384static void enable_swap_info(struct swap_info_struct *p, int prio,
2385                                unsigned char *swap_map,
2386                                struct swap_cluster_info *cluster_info,
2387                                unsigned long *frontswap_map)
2388{
2389        frontswap_init(p->type, frontswap_map);
2390        spin_lock(&swap_lock);
2391        spin_lock(&p->lock);
2392        _enable_swap_info(p, prio, swap_map, cluster_info);
2393        spin_unlock(&p->lock);
2394        spin_unlock(&swap_lock);
2395}
2396
2397static void reinsert_swap_info(struct swap_info_struct *p)
2398{
2399        spin_lock(&swap_lock);
2400        spin_lock(&p->lock);
2401        _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
2402        spin_unlock(&p->lock);
2403        spin_unlock(&swap_lock);
2404}
2405
2406bool has_usable_swap(void)
2407{
2408        bool ret = true;
2409
2410        spin_lock(&swap_lock);
2411        if (plist_head_empty(&swap_active_head))
2412                ret = false;
2413        spin_unlock(&swap_lock);
2414        return ret;
2415}
2416
2417SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2418{
2419        struct swap_info_struct *p = NULL;
2420        unsigned char *swap_map;
2421        struct swap_cluster_info *cluster_info;
2422        unsigned long *frontswap_map;
2423        struct file *swap_file, *victim;
2424        struct address_space *mapping;
2425        struct inode *inode;
2426        struct filename *pathname;
2427        int err, found = 0;
2428        unsigned int old_block_size;
2429
2430        if (!capable(CAP_SYS_ADMIN))
2431                return -EPERM;
2432
2433        BUG_ON(!current->mm);
2434
2435        pathname = getname(specialfile);
2436        if (IS_ERR(pathname))
2437                return PTR_ERR(pathname);
2438
2439        victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
2440        err = PTR_ERR(victim);
2441        if (IS_ERR(victim))
2442                goto out;
2443
2444        mapping = victim->f_mapping;
2445        spin_lock(&swap_lock);
2446        plist_for_each_entry(p, &swap_active_head, list) {
2447                if (p->flags & SWP_WRITEOK) {
2448                        if (p->swap_file->f_mapping == mapping) {
2449                                found = 1;
2450                                break;
2451                        }
2452                }
2453        }
2454        if (!found) {
2455                err = -EINVAL;
2456                spin_unlock(&swap_lock);
2457                goto out_dput;
2458        }
2459        if (!security_vm_enough_memory_mm(current->mm, p->pages))
2460                vm_unacct_memory(p->pages);
2461        else {
2462                err = -ENOMEM;
2463                spin_unlock(&swap_lock);
2464                goto out_dput;
2465        }
2466        del_from_avail_list(p);
2467        spin_lock(&p->lock);
2468        if (p->prio < 0) {
2469                struct swap_info_struct *si = p;
2470                int nid;
2471
2472                plist_for_each_entry_continue(si, &swap_active_head, list) {
2473                        si->prio++;
2474                        si->list.prio--;
2475                        for_each_node(nid) {
2476                                if (si->avail_lists[nid].prio != 1)
2477                                        si->avail_lists[nid].prio--;
2478                        }
2479                }
2480                least_priority++;
2481        }
2482        plist_del(&p->list, &swap_active_head);
2483        atomic_long_sub(p->pages, &nr_swap_pages);
2484        total_swap_pages -= p->pages;
2485        p->flags &= ~SWP_WRITEOK;
2486        spin_unlock(&p->lock);
2487        spin_unlock(&swap_lock);
2488
2489        disable_swap_slots_cache_lock();
2490
2491        set_current_oom_origin();
2492        err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
2493        clear_current_oom_origin();
2494
2495        if (err) {
2496                /* re-insert swap space into swap_active_head */
2497                reinsert_swap_info(p);
2498                reenable_swap_slots_cache_unlock();
2499                goto out_dput;
2500        }
2501
2502        reenable_swap_slots_cache_unlock();
2503
2504        flush_work(&p->discard_work);
2505
2506        destroy_swap_extents(p);
2507        if (p->flags & SWP_CONTINUED)
2508                free_swap_count_continuations(p);
2509
2510        if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
2511                atomic_dec(&nr_rotate_swap);
2512
2513        mutex_lock(&swapon_mutex);
2514        spin_lock(&swap_lock);
2515        spin_lock(&p->lock);
2516        drain_mmlist();
2517
2518        /* wait for anyone still in scan_swap_map */
2519        p->highest_bit = 0;             /* cuts scans short */
2520        while (p->flags >= SWP_SCANNING) {
2521                spin_unlock(&p->lock);
2522                spin_unlock(&swap_lock);
2523                schedule_timeout_uninterruptible(1);
2524                spin_lock(&swap_lock);
2525                spin_lock(&p->lock);
2526        }
2527
2528        swap_file = p->swap_file;
2529        old_block_size = p->old_block_size;
2530        p->swap_file = NULL;
2531        p->max = 0;
2532        swap_map = p->swap_map;
2533        p->swap_map = NULL;
2534        cluster_info = p->cluster_info;
2535        p->cluster_info = NULL;
2536        frontswap_map = frontswap_map_get(p);
2537        spin_unlock(&p->lock);
2538        spin_unlock(&swap_lock);
2539        frontswap_invalidate_area(p->type);
2540        frontswap_map_set(p, NULL);
2541        mutex_unlock(&swapon_mutex);
2542        free_percpu(p->percpu_cluster);
2543        p->percpu_cluster = NULL;
2544        vfree(swap_map);
2545        kvfree(cluster_info);
2546        kvfree(frontswap_map);
2547        /* Destroy swap account information */
2548        swap_cgroup_swapoff(p->type);
2549        exit_swap_address_space(p->type);
2550
2551        inode = mapping->host;
2552        if (S_ISBLK(inode->i_mode)) {
2553                struct block_device *bdev = I_BDEV(inode);
2554                set_blocksize(bdev, old_block_size);
2555                blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2556        } else {
2557                inode_lock(inode);
2558                inode->i_flags &= ~S_SWAPFILE;
2559                inode_unlock(inode);
2560        }
2561        filp_close(swap_file, NULL);
2562
2563        /*
2564         * Clear the SWP_USED flag after all resources are freed so that swapon
2565         * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
2566         * not hold p->lock after we cleared its SWP_WRITEOK.
2567         */
2568        spin_lock(&swap_lock);
2569        p->flags = 0;
2570        spin_unlock(&swap_lock);
2571
2572        err = 0;
2573        atomic_inc(&proc_poll_event);
2574        wake_up_interruptible(&proc_poll_wait);
2575
2576out_dput:
2577        filp_close(victim, NULL);
2578out:
2579        putname(pathname);
2580        return err;
2581}
2582
2583#ifdef CONFIG_PROC_FS
2584static __poll_t swaps_poll(struct file *file, poll_table *wait)
2585{
2586        struct seq_file *seq = file->private_data;
2587
2588        poll_wait(file, &proc_poll_wait, wait);
2589
2590        if (seq->poll_event != atomic_read(&proc_poll_event)) {
2591                seq->poll_event = atomic_read(&proc_poll_event);
2592                return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
2593        }
2594
2595        return EPOLLIN | EPOLLRDNORM;
2596}
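
/*
 * Illustrative user-space sketch (standard procfs event-file pattern, not
 * part of the original file): a monitor can wait for swapon/swapoff activity
 * by polling /proc/swaps for an exceptional condition, which swaps_poll()
 * reports as EPOLLERR | EPOLLPRI once proc_poll_event has changed:
 *
 *	int fd = open("/proc/swaps", O_RDONLY);
 *	struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *
 *	for (;;) {
 *		poll(&pfd, 1, -1);
 *		if (pfd.revents & (POLLERR | POLLPRI)) {
 *			// a swapon(2)/swapoff(2) happened: re-read the table
 *			lseek(fd, 0, SEEK_SET);
 *			// ... read(fd, ...) and parse the new contents ...
 *		}
 *	}
 */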
2597
2598/* iterator */
2599static void *swap_start(struct seq_file *swap, loff_t *pos)
2600{
2601        struct swap_info_struct *si;
2602        int type;
2603        loff_t l = *pos;
2604
2605        mutex_lock(&swapon_mutex);
2606
2607        if (!l)
2608                return SEQ_START_TOKEN;
2609
2610        for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
2611                if (!(si->flags & SWP_USED) || !si->swap_map)
2612                        continue;
2613                if (!--l)
2614                        return si;
2615        }
2616
2617        return NULL;
2618}
2619
2620static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
2621{
2622        struct swap_info_struct *si = v;
2623        int type;
2624
2625        if (v == SEQ_START_TOKEN)
2626                type = 0;
2627        else
2628                type = si->type + 1;
2629
2630        for (; (si = swap_type_to_swap_info(type)); type++) {
2631                if (!(si->flags & SWP_USED) || !si->swap_map)
2632                        continue;
2633                ++*pos;
2634                return si;
2635        }
2636
2637        return NULL;
2638}
2639
2640static void swap_stop(struct seq_file *swap, void *v)
2641{
2642        mutex_unlock(&swapon_mutex);
2643}
2644
2645static int swap_show(struct seq_file *swap, void *v)
2646{
2647        struct swap_info_struct *si = v;
2648        struct file *file;
2649        int len;
2650
2651        if (si == SEQ_START_TOKEN) {
2652                seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
2653                return 0;
2654        }
2655
2656        file = si->swap_file;
2657        len = seq_file_path(swap, file, " \t\n\\");
2658        seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
2659                        len < 40 ? 40 - len : 1, " ",
2660                        S_ISBLK(file_inode(file)->i_mode) ?
2661                                "partition" : "file\t",
2662                        si->pages << (PAGE_SHIFT - 10),
2663                        si->inuse_pages << (PAGE_SHIFT - 10),
2664                        si->prio);
2665        return 0;
2666}
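
/*
 * Example of the resulting /proc/swaps layout (illustrative values; Size and
 * Used are in KiB, i.e. pages << (PAGE_SHIFT - 10)):
 *
 *	Filename                                Type            Size    Used    Priority
 *	/dev/sda2                               partition       8388604 0       -2
 *	/swapfile                               file            2097148 1024    -3
 */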
2667
2668static const struct seq_operations swaps_op = {
2669        .start =        swap_start,
2670        .next =         swap_next,
2671        .stop =         swap_stop,
2672        .show =         swap_show
2673};
2674
2675static int swaps_open(struct inode *inode, struct file *file)
2676{
2677        struct seq_file *seq;
2678        int ret;
2679
2680        ret = seq_open(file, &swaps_op);
2681        if (ret)
2682                return ret;
2683
2684        seq = file->private_data;
2685        seq->poll_event = atomic_read(&proc_poll_event);
2686        return 0;
2687}
2688
2689static const struct file_operations proc_swaps_operations = {
2690        .open           = swaps_open,
2691        .read           = seq_read,
2692        .llseek         = seq_lseek,
2693        .release        = seq_release,
2694        .poll           = swaps_poll,
2695};
2696
2697static int __init procswaps_init(void)
2698{
2699        proc_create("swaps", 0, NULL, &proc_swaps_operations);
2700        return 0;
2701}
2702__initcall(procswaps_init);
2703#endif /* CONFIG_PROC_FS */
2704
2705#ifdef MAX_SWAPFILES_CHECK
2706static int __init max_swapfiles_check(void)
2707{
2708        MAX_SWAPFILES_CHECK();
2709        return 0;
2710}
2711late_initcall(max_swapfiles_check);
2712#endif
2713
2714static struct swap_info_struct *alloc_swap_info(void)
2715{
2716        struct swap_info_struct *p;
2717        unsigned int type;
2718        int i;
2719
2720        p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
2721        if (!p)
2722                return ERR_PTR(-ENOMEM);
2723
2724        spin_lock(&swap_lock);
2725        for (type = 0; type < nr_swapfiles; type++) {
2726                if (!(swap_info[type]->flags & SWP_USED))
2727                        break;
2728        }
2729        if (type >= MAX_SWAPFILES) {
2730                spin_unlock(&swap_lock);
2731                kvfree(p);
2732                return ERR_PTR(-EPERM);
2733        }
2734        if (type >= nr_swapfiles) {
2735                p->type = type;
2736                WRITE_ONCE(swap_info[type], p);
2737                /*
2738                 * Write swap_info[type] before nr_swapfiles, in case a
2739                 * racing procfs swap_start() or swap_next() is reading them.
2740                 * (We never shrink nr_swapfiles, we never free this entry.)
2741                 */
2742                smp_wmb();
2743                WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
2744        } else {
2745                kvfree(p);
2746                p = swap_info[type];
2747                /*
2748                 * Do not memset this entry: a racing procfs swap_next()
2749                 * would be relying on p->type to remain valid.
2750                 */
2751        }
2752        INIT_LIST_HEAD(&p->first_swap_extent.list);
2753        plist_node_init(&p->list, 0);
2754        for_each_node(i)
2755                plist_node_init(&p->avail_lists[i], 0);
2756        p->flags = SWP_USED;
2757        spin_unlock(&swap_lock);
2758        spin_lock_init(&p->lock);
2759        spin_lock_init(&p->cont_lock);
2760
2761        return p;
2762}
2763
2764static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
2765{
2766        int error;
2767
2768        if (S_ISBLK(inode->i_mode)) {
2769                p->bdev = bdgrab(I_BDEV(inode));
2770                error = blkdev_get(p->bdev,
2771                                   FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
2772                if (error < 0) {
2773                        p->bdev = NULL;
2774                        return error;
2775                }
2776                p->old_block_size = block_size(p->bdev);
2777                error = set_blocksize(p->bdev, PAGE_SIZE);
2778                if (error < 0)
2779                        return error;
2780                p->flags |= SWP_BLKDEV;
2781        } else if (S_ISREG(inode->i_mode)) {
2782                p->bdev = inode->i_sb->s_bdev;
2783                inode_lock(inode);
2784                if (IS_SWAPFILE(inode))
2785                        return -EBUSY;
2786        } else
2787                return -EINVAL;
2788
2789        return 0;
2790}
2791
2792
2793/*
2794 * Find out how many pages are allowed for a single swap device. There
2795 * are two limiting factors:
2796 * 1) the number of bits for the swap offset in the swp_entry_t type, and
2797 * 2) the number of bits in the swap pte, as defined by the different
2798 * architectures.
2799 *
2800 * In order to find the largest possible bit mask, a swap entry with
2801 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
2802 * decoded to a swp_entry_t again, and finally the swap offset is
2803 * extracted.
2804 *
2805 * This will mask all the bits from the initial ~0UL mask that can't
2806 * be encoded in either the swp_entry_t or the architecture definition
2807 * of a swap pte.
2808 */
2809unsigned long generic_max_swapfile_size(void)
2810{
2811        return swp_offset(pte_to_swp_entry(
2812                        swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
2813}
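
/*
 * Worked example (illustrative): if the narrower of the two encodings above
 * leaves, say, 50 usable offset bits, the round-trip yields 2^50 - 1 and the
 * function returns 2^50, i.e. a per-device limit of 2^50 pages
 * (2^50 * PAGE_SIZE bytes of swap).
 */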
2814
2815/* Can be overridden by an architecture for additional checks. */
2816__weak unsigned long max_swapfile_size(void)
2817{
2818        return generic_max_swapfile_size();
2819}
2820
2821static unsigned long read_swap_header(struct swap_info_struct *p,
2822                                        union swap_header *swap_header,
2823                                        struct inode *inode)
2824{
2825        int i;
2826        unsigned long maxpages;
2827        unsigned long swapfilepages;
2828        unsigned long last_page;
2829
2830        if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
2831                pr_err("Unable to find swap-space signature\n");
2832                return 0;
2833        }
2834
2835        /* swap partition endianness hack... */
2836        if (swab32(swap_header->info.version) == 1) {
2837                swab32s(&swap_header->info.version);
2838                swab32s(&swap_header->info.last_page);
2839                swab32s(&swap_header->info.nr_badpages);
2840                if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2841                        return 0;
2842                for (i = 0; i < swap_header->info.nr_badpages; i++)
2843                        swab32s(&swap_header->info.badpages[i]);
2844        }
2845        /* Check the swap header's sub-version */
2846        if (swap_header->info.version != 1) {
2847                pr_warn("Unable to handle swap header version %d\n",
2848                        swap_header->info.version);
2849                return 0;
2850        }
2851
2852        p->lowest_bit  = 1;
2853        p->cluster_next = 1;
2854        p->cluster_nr = 0;
2855
2856        maxpages = max_swapfile_size();
2857        last_page = swap_header->info.last_page;
2858        if (!last_page) {
2859                pr_warn("Empty swap-file\n");
2860                return 0;
2861        }
2862        if (last_page > maxpages) {
2863                pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
2864                        maxpages << (PAGE_SHIFT - 10),
2865                        last_page << (PAGE_SHIFT - 10));
2866        }
2867        if (maxpages > last_page) {
2868                maxpages = last_page + 1;
2869                /* p->max is an unsigned int: don't overflow it */
2870                if ((unsigned int)maxpages == 0)
2871                        maxpages = UINT_MAX;
2872        }
2873        p->highest_bit = maxpages - 1;
2874
2875        if (!maxpages)
2876                return 0;
2877        swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
2878        if (swapfilepages && maxpages > swapfilepages) {
2879                pr_warn("Swap area shorter than signature indicates\n");
2880                return 0;
2881        }
2882        if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
2883                return 0;
2884        if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2885                return 0;
2886
2887        return maxpages;
2888}
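
/*
 * Illustrative user-space sketch (assumptions: 4KiB pages, native byte order,
 * and the layout of union swap_header from include/linux/swap.h: 1024 bytes
 * of bootbits followed by version/last_page/nr_badpages, with the
 * "SWAPSPACE2" magic in the last 10 bytes of the page).  It mirrors the
 * checks read_swap_header() performs on the first page; check_swap_header()
 * and SWAP_HDR_PAGE are illustrative names, not kernel interfaces:
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	#define SWAP_HDR_PAGE 4096
 *
 *	static int check_swap_header(const unsigned char *page)
 *	{
 *		uint32_t version, last_page, nr_badpages;
 *
 *		if (memcmp(page + SWAP_HDR_PAGE - 10, "SWAPSPACE2", 10))
 *			return -1;	// no swap signature
 *		memcpy(&version, page + 1024, sizeof(version));
 *		memcpy(&last_page, page + 1028, sizeof(last_page));
 *		memcpy(&nr_badpages, page + 1032, sizeof(nr_badpages));
 *		if (version != 1)
 *			return -1;	// unknown sub-version
 *		printf("last_page=%u nr_badpages=%u\n", last_page, nr_badpages);
 *		return 0;
 *	}
 */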
2889
2890#define SWAP_CLUSTER_INFO_COLS                                          \
2891        DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
2892#define SWAP_CLUSTER_SPACE_COLS                                         \
2893        DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
2894#define SWAP_CLUSTER_COLS                                               \
2895        max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
2896
2897static int setup_swap_map_and_extents(struct swap_info_struct *p,
2898                                        union swap_header *swap_header,
2899                                        unsigned char *swap_map,
2900                                        struct swap_cluster_info *cluster_info,
2901                                        unsigned long maxpages,
2902                                        sector_t *span)
2903{
2904        unsigned int j, k;
2905        unsigned int nr_good_pages;
2906        int nr_extents;
2907        unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
2908        unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
2909        unsigned long i, idx;
2910
2911        nr_good_pages = maxpages - 1;   /* omit header page */
2912
2913        cluster_list_init(&p->free_clusters);
2914        cluster_list_init(&p->discard_clusters);
2915
2916        for (i = 0; i < swap_header->info.nr_badpages; i++) {
2917                unsigned int page_nr = swap_header->info.badpages[i];
2918                if (page_nr == 0 || page_nr > swap_header->info.last_page)
2919                        return -EINVAL;
2920                if (page_nr < maxpages) {
2921                        swap_map[page_nr] = SWAP_MAP_BAD;
2922                        nr_good_pages--;
2923                        /*
2924                         * Haven't marked the cluster free yet, no list
2925                         * operation involved
2926                         */
2927                        inc_cluster_info_page(p, cluster_info, page_nr);
2928                }
2929        }
2930
2931        /* Haven't marked the cluster free yet, no list operation involved */
2932        for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
2933                inc_cluster_info_page(p, cluster_info, i);
2934
2935        if (nr_good_pages) {
2936                swap_map[0] = SWAP_MAP_BAD;
2937                /*
2938                 * Haven't marked the cluster free yet, no list
2939                 * operation involved
2940                 */
2941                inc_cluster_info_page(p, cluster_info, 0);
2942                p->max = maxpages;
2943                p->pages = nr_good_pages;
2944                nr_extents = setup_swap_extents(p, span);
2945                if (nr_extents < 0)
2946                        return nr_extents;
2947                nr_good_pages = p->pages;
2948        }
2949        if (!nr_good_pages) {
2950                pr_warn("Empty swap-file\n");
2951                return -EINVAL;
2952        }
2953
2954        if (!cluster_info)
2955                return nr_extents;
2956
2957
2958        /*
2959         * Reduce false cache line sharing between cluster_info entries,
2960         * and between clusters sharing the same swap address space.
2961         */
2962        for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
2963                j = (k + col) % SWAP_CLUSTER_COLS;
2964                for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
2965                        idx = i * SWAP_CLUSTER_COLS + j;
2966                        if (idx >= nr_clusters)
2967                                continue;
2968                        if (cluster_count(&cluster_info[idx]))
2969                                continue;
2970                        cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
2971                        cluster_list_add_tail(&p->free_clusters, cluster_info,
2972                                              idx);
2973                }
2974        }
2975        return nr_extents;
2976}
2977
2978/*
2979 * Helper for sys_swapon() to determine whether a given swap
2980 * backing device's queue supports DISCARD operations.
2981 */
2982static bool swap_discardable(struct swap_info_struct *si)
2983{
2984        struct request_queue *q = bdev_get_queue(si->bdev);
2985
2986        if (!q || !blk_queue_discard(q))
2987                return false;
2988
2989        return true;
2990}
2991
2992SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2993{
2994        struct swap_info_struct *p;
2995        struct filename *name;
2996        struct file *swap_file = NULL;
2997        struct address_space *mapping;
2998        int prio;
2999        int error;
3000        union swap_header *swap_header;
3001        int nr_extents;
3002        sector_t span;
3003        unsigned long maxpages;
3004        unsigned char *swap_map = NULL;
3005        struct swap_cluster_info *cluster_info = NULL;
3006        unsigned long *frontswap_map = NULL;
3007        struct page *page = NULL;
3008        struct inode *inode = NULL;
3009        bool inced_nr_rotate_swap = false;
3010
3011        if (swap_flags & ~SWAP_FLAGS_VALID)
3012                return -EINVAL;
3013
3014        if (!capable(CAP_SYS_ADMIN))
3015                return -EPERM;
3016
3017        if (!swap_avail_heads)
3018                return -ENOMEM;
3019
3020        p = alloc_swap_info();
3021        if (IS_ERR(p))
3022                return PTR_ERR(p);
3023
3024        INIT_WORK(&p->discard_work, swap_discard_work);
3025
3026        name = getname(specialfile);
3027        if (IS_ERR(name)) {
3028                error = PTR_ERR(name);
3029                name = NULL;
3030                goto bad_swap;
3031        }
3032        swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
3033        if (IS_ERR(swap_file)) {
3034                error = PTR_ERR(swap_file);
3035                swap_file = NULL;
3036                goto bad_swap;
3037        }
3038
3039        p->swap_file = swap_file;
3040        mapping = swap_file->f_mapping;
3041        inode = mapping->host;
3042
3043        /* If S_ISREG(inode->i_mode), claim_swapfile() will do inode_lock(inode) */
3044        error = claim_swapfile(p, inode);
3045        if (unlikely(error))
3046                goto bad_swap;
3047
3048        /*
3049         * Read the swap header.
3050         */
3051        if (!mapping->a_ops->readpage) {
3052                error = -EINVAL;
3053                goto bad_swap;
3054        }
3055        page = read_mapping_page(mapping, 0, swap_file);
3056        if (IS_ERR(page)) {
3057                error = PTR_ERR(page);
3058                goto bad_swap;
3059        }
3060        swap_header = kmap(page);
3061
3062        maxpages = read_swap_header(p, swap_header, inode);
3063        if (unlikely(!maxpages)) {
3064                error = -EINVAL;
3065                goto bad_swap;
3066        }
3067
3068        /* OK, set up the swap map and apply the bad block list */
3069        swap_map = vzalloc(maxpages);
3070        if (!swap_map) {
3071                error = -ENOMEM;
3072                goto bad_swap;
3073        }
3074
3075        if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
3076                p->flags |= SWP_STABLE_WRITES;
3077
3078        if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
3079                p->flags |= SWP_SYNCHRONOUS_IO;
3080
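        /*
         * A sketch of the SSD path below, assuming the usual SWAPFILE_CLUSTER
         * of 256 pages: the area is split into
         * DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER) clusters (e.g. a 1 GiB
         * device with 4 KiB pages has 262144 pages, i.e. 1024 clusters),
         * each cluster gets its own spinlock, and each CPU caches a current
         * cluster in p->percpu_cluster so concurrent allocations tend to
         * land in different clusters.
         */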
3081        if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
3082                int cpu;
3083                unsigned long ci, nr_cluster;
3084
3085                p->flags |= SWP_SOLIDSTATE;
3086                /*
3087                 * select a random position to start with to help SSD
3088                 * wear leveling
3089                 */
3090                p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
3091                nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
3092
3093                cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
3094                                        GFP_KERNEL);
3095                if (!cluster_info) {
3096                        error = -ENOMEM;
3097                        goto bad_swap;
3098                }
3099
3100                for (ci = 0; ci < nr_cluster; ci++)
3101                        spin_lock_init(&((cluster_info + ci)->lock));
3102
3103                p->percpu_cluster = alloc_percpu(struct percpu_cluster);
3104                if (!p->percpu_cluster) {
3105                        error = -ENOMEM;
3106                        goto bad_swap;
3107                }
3108                for_each_possible_cpu(cpu) {
3109                        struct percpu_cluster *cluster;
3110                        cluster = per_cpu_ptr(p->percpu_cluster, cpu);
3111                        cluster_set_null(&cluster->index);
3112                }
3113        } else {
3114                atomic_inc(&nr_rotate_swap);
3115                inced_nr_rotate_swap = true;
3116        }
3117
3118        error = swap_cgroup_swapon(p->type, maxpages);
3119        if (error)
3120                goto bad_swap;
3121
3122        nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
3123                cluster_info, maxpages, &span);
3124        if (unlikely(nr_extents < 0)) {
3125                error = nr_extents;
3126                goto bad_swap;
3127        }
3128        /* frontswap enabled? set up bit-per-page map for frontswap */
3129        if (IS_ENABLED(CONFIG_FRONTSWAP))
3130                frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
3131                                         sizeof(long),
3132                                         GFP_KERNEL);
3133
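        /*
         * The discard handling below roughly corresponds to swapon(8)'s
         * --discard[=once|pages] options:
         *      SWAP_FLAG_DISCARD                           -> area + page discard
         *      SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE  -> area discard only
         *      SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_PAGES -> page discard only
         */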
3134        if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
3135                /*
3136                 * When discard is enabled for swap with no particular
3137                 * policy flagged, we set all swap discard flags here in
3138                 * order to sustain backward compatibility with older
3139                 * swapon(8) releases.
3140                 */
3141                p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
3142                             SWP_PAGE_DISCARD);
3143
3144                /*
3145                 * By flagging sys_swapon, a sysadmin can tell us to
3146                 * either do single-time area discards only, or to just
3147                 * perform discards for released swap page-clusters.
3148                 * Now it's time to adjust the p->flags accordingly.
3149                 */
3150                if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
3151                        p->flags &= ~SWP_PAGE_DISCARD;
3152                else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
3153                        p->flags &= ~SWP_AREA_DISCARD;
3154
3155                /* issue a swapon-time discard if it's still required */
3156                if (p->flags & SWP_AREA_DISCARD) {
3157                        int err = discard_swap(p);
3158                        if (unlikely(err))
3159                                pr_err("swapon: discard_swap(%p): %d\n",
3160                                        p, err);
3161                }
3162        }
3163
3164        error = init_swap_address_space(p->type, maxpages);
3165        if (error)
3166                goto bad_swap;
3167
3168        mutex_lock(&swapon_mutex);
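        /*
         * With the usual flag layout (SWAP_FLAG_PREFER 0x8000,
         * SWAP_FLAG_PRIO_MASK 0x7fff, SWAP_FLAG_PRIO_SHIFT 0), e.g.
         * "swapon -p 5" passes 0x8005 in swap_flags, so prio below becomes 5;
         * without SWAP_FLAG_PREFER it stays -1 and the device later gets the
         * next lowest (negative) priority.
         */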
3169        prio = -1;
3170        if (swap_flags & SWAP_FLAG_PREFER)
3171                prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >>
3172                       SWAP_FLAG_PRIO_SHIFT;
3173        enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
3174
3175        pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
3176                p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
3177                nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
3178                (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
3179                (p->flags & SWP_DISCARDABLE) ? "D" : "",
3180                (p->flags & SWP_AREA_DISCARD) ? "s" : "",
3181                (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
3182                (frontswap_map) ? "FS" : "");
3183
3184        mutex_unlock(&swapon_mutex);
3185        atomic_inc(&proc_poll_event);
3186        wake_up_interruptible(&proc_poll_wait);
3187
3188        if (S_ISREG(inode->i_mode))
3189                inode->i_flags |= S_SWAPFILE;
3190        error = 0;
3191        goto out;
3192bad_swap:
3193        free_percpu(p->percpu_cluster);
3194        p->percpu_cluster = NULL;
3195        if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
3196                set_blocksize(p->bdev, p->old_block_size);
3197                blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3198        }
3199        destroy_swap_extents(p);
3200        swap_cgroup_swapoff(p->type);
3201        spin_lock(&swap_lock);
3202        p->swap_file = NULL;
3203        p->flags = 0;
3204        spin_unlock(&swap_lock);
3205        vfree(swap_map);
3206        kvfree(cluster_info);
3207        kvfree(frontswap_map);
3208        if (inced_nr_rotate_swap)
3209                atomic_dec(&nr_rotate_swap);
3210        if (swap_file) {
3211                if (inode && S_ISREG(inode->i_mode)) {
3212                        inode_unlock(inode);
3213                        inode = NULL;
3214                }
3215                filp_close(swap_file, NULL);
3216        }
3217out:
3218        if (page && !IS_ERR(page)) {
3219                kunmap(page);
3220                put_page(page);
3221        }
3222        if (name)
3223                putname(name);
3224        if (inode && S_ISREG(inode->i_mode))
3225                inode_unlock(inode);
3226        if (!error)
3227                enable_swap_slots_cache();
3228        return error;
3229}
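
/*
 * A userspace sketch (not part of this file): swapon(8) ultimately issues
 * this syscall through the wrapper declared in <sys/swap.h>, e.g.
 *
 *      #include <sys/swap.h>
 *
 *      if (swapon("/dev/sdb2", SWAP_FLAG_PREFER | 5))  // priority 5
 *              perror("swapon");
 *      if (swapon("/swapfile", SWAP_FLAG_DISCARD))     // with discard
 *              perror("swapon");
 */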
3230
3231void si_swapinfo(struct sysinfo *val)
3232{
3233        unsigned int type;
3234        unsigned long nr_to_be_unused = 0;
3235
3236        spin_lock(&swap_lock);
3237        for (type = 0; type < nr_swapfiles; type++) {
3238                struct swap_info_struct *si = swap_info[type];
3239
3240                if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
3241                        nr_to_be_unused += si->inuse_pages;
3242        }
3243        val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
3244        val->totalswap = total_swap_pages + nr_to_be_unused;
3245        spin_unlock(&swap_lock);
3246}
3247
3248/*
3249 * Verify that a swap entry is valid and increment its swap map count.
3250 *
3251 * Return values:
3252 * - 0 on success
3253 * - EINVAL if swp_entry is invalid or is a migration entry
3254 * - EEXIST if a swap-cache reference is requested but one already exists
3255 * - ENOENT if a swap-cache reference is requested but the entry is unused
3256 * - ENOMEM if a swap-mapped reference is requested but the count needs a
3257 *   swap count continuation page which is not yet allocated
3258 */
3259static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
3260{
3261        struct swap_info_struct *p;
3262        struct swap_cluster_info *ci;
3263        unsigned long offset;
3264        unsigned char count;
3265        unsigned char has_cache;
3266        int err = -EINVAL;
3267
3268        if (non_swap_entry(entry))
3269                goto out;
3270
3271        p = swp_swap_info(entry);
3272        if (!p)
3273                goto bad_file;
3274
3275        offset = swp_offset(entry);
3276        if (unlikely(offset >= p->max))
3277                goto out;
3278
3279        ci = lock_cluster_or_swap_info(p, offset);
3280
3281        count = p->swap_map[offset];
3282
3283        /*
3284         * swapin_readahead() doesn't check if a swap entry is valid, so the
3285         * swap entry could be SWAP_MAP_BAD. Check here with lock held.
3286         */
3287        if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
3288                err = -ENOENT;
3289                goto unlock_out;
3290        }
3291
3292        has_cache = count & SWAP_HAS_CACHE;
3293        count &= ~SWAP_HAS_CACHE;
3294        err = 0;
3295
3296        if (usage == SWAP_HAS_CACHE) {
3297
3298                /* set SWAP_HAS_CACHE if there is no cache and entry is used */
3299                if (!has_cache && count)
3300                        has_cache = SWAP_HAS_CACHE;
3301                else if (has_cache)             /* someone else added cache */
3302                        err = -EEXIST;
3303                else                            /* no users remaining */
3304                        err = -ENOENT;
3305
3306        } else if (count || has_cache) {
3307
3308                if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
3309                        count += usage;
3310                else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
3311                        err = -EINVAL;
3312                else if (swap_count_continued(p, offset, count))
3313                        count = COUNT_CONTINUED;
3314                else
3315                        err = -ENOMEM;
3316        } else
3317                err = -ENOENT;                  /* unused swap entry */
3318
3319        p->swap_map[offset] = count | has_cache;
3320
3321unlock_out:
3322        unlock_cluster_or_swap_info(p, ci);
3323out:
3324        return err;
3325
3326bad_file:
3327        pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
3328        goto out;
3329}
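
/*
 * A note on the swap_map encoding used above, assuming the constants from
 * <linux/swap.h> (SWAP_MAP_MAX 0x3e, SWAP_HAS_CACHE 0x40, COUNT_CONTINUED
 * 0x80): the low bits hold the map count, 0x40 records a swap-cache
 * reference, and 0x80 means the count overflows into a continuation page.
 * A swap_map byte of 0x42, for example, means two page-table references
 * plus the swap cache.
 */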
3330
3331/*
3332 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
3333 * (in which case its reference count is never incremented).
3334 */
3335void swap_shmem_alloc(swp_entry_t entry)
3336{
3337        __swap_duplicate(entry, SWAP_MAP_SHMEM);
3338}
3339
3340/*
3341 * Increase reference count of swap entry by 1.
3342 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
3343 * but could not be atomically allocated.  Returns 0, just as if it succeeded,
3344 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
3345 * might occur if a page table entry has got corrupted.
3346 */
3347int swap_duplicate(swp_entry_t entry)
3348{
3349        int err = 0;
3350
3351        while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
3352                err = add_swap_count_continuation(entry, GFP_ATOMIC);
3353        return err;
3354}
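
/*
 * A sketch of the usual caller pattern (see e.g. the fork path in
 * mm/memory.c): swap_duplicate() is called with the page table lock held;
 * if it returns -ENOMEM, the caller drops the lock and calls
 * add_swap_count_continuation(entry, GFP_KERNEL) before retrying, so the
 * continuation page can be allocated with a stronger gfp mask.
 */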
3355
3356/*
3357 * @entry: swap entry for which we allocate swap cache.
3358 *
3359 * Called when allocating swap cache for an existing swap entry.
3360 * This can return error codes. Returns 0 on success.
3361 * -EEXIST means there is already a swap cache for the entry.
3362 * Note: return code is different from swap_duplicate().
3363 */
3364int swapcache_prepare(swp_entry_t entry)
3365{
3366        return __swap_duplicate(entry, SWAP_HAS_CACHE);
3367}
3368
3369struct swap_info_struct *swp_swap_info(swp_entry_t entry)
3370{
3371        return swap_type_to_swap_info(swp_type(entry));
3372}
3373
3374struct swap_info_struct *page_swap_info(struct page *page)
3375{
3376        swp_entry_t entry = { .val = page_private(page) };
3377        return swp_swap_info(entry);
3378}
3379
3380/*
3381 * out-of-line __page_file_ methods to avoid include hell.
3382 */
3383struct address_space *__page_file_mapping(struct page *page)
3384{
3385        return page_swap_info(page)->swap_file->f_mapping;
3386}
3387EXPORT_SYMBOL_GPL(__page_file_mapping);
3388
3389pgoff_t __page_file_index(struct page *page)
3390{
3391        swp_entry_t swap = { .val = page_private(page) };
3392        return swp_offset(swap);
3393}
3394EXPORT_SYMBOL_GPL(__page_file_index);
3395
3396/*
3397 * add_swap_count_continuation - called when a swap count is duplicated
3398 * beyond SWAP_MAP_MAX: it allocates a new page and links that to the entry's
3399 * page of the original vmalloc'ed swap_map, to hold the continuation count
3400 * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
3401 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
3402 *
3403 * These continuation pages are seldom referenced: the common paths all work
3404 * on the original swap_map, only referring to a continuation page when the
3405 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
3406 *
3407 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
3408 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
3409 * can be called after dropping locks.
3410 */
3411int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
3412{
3413        struct swap_info_struct *si;
3414        struct swap_cluster_info *ci;
3415        struct page *head;
3416        struct page *page;
3417        struct page *list_page;
3418        pgoff_t offset;
3419        unsigned char count;
3420
3421        /*
3422         * When debugging, it's easier to use __GFP_ZERO here; but it's better
3423         * for latency not to zero a page while GFP_ATOMIC and holding locks.
3424         */
3425        page = alloc_page(gfp_mask | __GFP_HIGHMEM);
3426
3427        si = swap_info_get(entry);
3428        if (!si) {
3429                /*
3430                 * An acceptable race has occurred since the failing
3431                 * __swap_duplicate(): the swap entry has been freed,
3432                 * perhaps even the whole swap_map cleared for swapoff.
3433                 */
3434                goto outer;
3435        }
3436
3437        offset = swp_offset(entry);
3438
3439        ci = lock_cluster(si, offset);
3440
3441        count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
3442
3443        if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
3444                /*
3445                 * The higher the swap count, the more likely it is that tasks
3446                 * will race to add swap count continuation: we need to avoid
3447                 * over-provisioning.
3448                 */
3449                goto out;
3450        }
3451
3452        if (!page) {
3453                unlock_cluster(ci);
3454                spin_unlock(&si->lock);
3455                return -ENOMEM;
3456        }
3457
3458        /*
3459         * We are fortunate that although vmalloc_to_page uses pte_offset_map,
3460         * no architecture is using highmem pages for kernel page tables: so it
3461         * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
3462         */
3463        head = vmalloc_to_page(si->swap_map + offset);
3464        offset &= ~PAGE_MASK;
3465
3466        spin_lock(&si->cont_lock);
3467        /*
3468         * Page allocation does not initialize the page's lru field,
3469         * but it does always reset its private field.
3470         */
3471        if (!page_private(head)) {
3472                BUG_ON(count & COUNT_CONTINUED);
3473                INIT_LIST_HEAD(&head->lru);
3474                set_page_private(head, SWP_CONTINUED);
3475                si->flags |= SWP_CONTINUED;
3476        }
3477
3478        list_for_each_entry(list_page, &head->lru, lru) {
3479                unsigned char *map;
3480
3481                /*
3482                 * If the previous map said no continuation, but we've found
3483                 * a continuation page, free our allocation and use this one.
3484                 */
3485                if (!(count & COUNT_CONTINUED))
3486                        goto out_unlock_cont;
3487
3488                map = kmap_atomic(list_page) + offset;
3489                count = *map;
3490                kunmap_atomic(map);
3491
3492                /*
3493                 * If this continuation count now has some space in it,
3494                 * free our allocation and use this one.
3495                 */
3496                if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
3497                        goto out_unlock_cont;
3498        }
3499
3500        list_add_tail(&page->lru, &head->lru);
3501        page = NULL;                    /* now it's attached, don't free it */
3502out_unlock_cont:
3503        spin_unlock(&si->cont_lock);
3504out:
3505        unlock_cluster(ci);
3506        spin_unlock(&si->lock);
3507outer:
3508        if (page)
3509                __free_page(page);
3510        return 0;
3511}
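
/*
 * A worked example of the "digit" arithmetic, assuming SWAP_MAP_MAX == 0x3e
 * and SWAP_CONT_MAX == 0x7f from <linux/swap.h>: the first 62 references are
 * counted directly in swap_map; on the 63rd, the low digit wraps to 0,
 * COUNT_CONTINUED is set, and the continuation byte becomes 1, one unit
 * being worth SWAP_MAP_MAX + 1 = 63.  A total count of 100 is therefore
 * stored as 37 in swap_map (with COUNT_CONTINUED set) and 1 in the
 * continuation byte: 1 * 63 + 37 == 100.
 */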
3512
3513/*
3514 * swap_count_continued - when the original swap_map count is incremented
3515 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
3516 * into, carry if so, or else fail until a new continuation page is allocated;
3517 * when the original swap_map count is decremented from 0 with continuation,
3518 * borrow from the continuation and report whether it still holds more.
3519 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
3520 * lock.
3521 */
3522static bool swap_count_continued(struct swap_info_struct *si,
3523                                 pgoff_t offset, unsigned char count)
3524{
3525        struct page *head;
3526        struct page *page;
3527        unsigned char *map;
3528        bool ret;
3529
3530        head = vmalloc_to_page(si->swap_map + offset);
3531        if (page_private(head) != SWP_CONTINUED) {
3532                BUG_ON(count & COUNT_CONTINUED);
3533                return false;           /* need to add count continuation */
3534        }
3535
3536        spin_lock(&si->cont_lock);
3537        offset &= ~PAGE_MASK;
3538        page = list_entry(head->lru.next, struct page, lru);
3539        map = kmap_atomic(page) + offset;
3540
3541        if (count == SWAP_MAP_MAX)      /* initial increment from swap_map */
3542                goto init_map;          /* jump over SWAP_CONT_MAX checks */
3543
3544        if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
3545                /*
3546                 * Think of how you add 1 to 999
3547                 */
3548                while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
3549                        kunmap_atomic(map);
3550                        page = list_entry(page->lru.next, struct page, lru);
3551                        BUG_ON(page == head);
3552                        map = kmap_atomic(page) + offset;
3553                }
3554                if (*map == SWAP_CONT_MAX) {
3555                        kunmap_atomic(map);
3556                        page = list_entry(page->lru.next, struct page, lru);
3557                        if (page == head) {
3558                                ret = false;    /* add count continuation */
3559                                goto out;
3560                        }
3561                        map = kmap_atomic(page) + offset;
3562init_map:               *map = 0;               /* we didn't zero the page */
3563                }
3564                *map += 1;
3565                kunmap_atomic(map);
3566                page = list_entry(page->lru.prev, struct page, lru);
3567                while (page != head) {
3568                        map = kmap_atomic(page) + offset;
3569                        *map = COUNT_CONTINUED;
3570                        kunmap_atomic(map);
3571                        page = list_entry(page->lru.prev, struct page, lru);
3572                }
3573                ret = true;                     /* incremented */
3574
3575        } else {                                /* decrementing */
3576                /*
3577                 * Think of how you subtract 1 from 1000
3578                 */
3579                BUG_ON(count != COUNT_CONTINUED);
3580                while (*map == COUNT_CONTINUED) {
3581                        kunmap_atomic(map);
3582                        page = list_entry(page->lru.next, struct page, lru);
3583                        BUG_ON(page == head);
3584                        map = kmap_atomic(page) + offset;
3585                }
3586                BUG_ON(*map == 0);
3587                *map -= 1;
3588                if (*map == 0)
3589                        count = 0;
3590                kunmap_atomic(map);
3591                page = list_entry(page->lru.prev, struct page, lru);
3592                while (page != head) {
3593                        map = kmap_atomic(page) + offset;
3594                        *map = SWAP_CONT_MAX | count;
3595                        count = COUNT_CONTINUED;
3596                        kunmap_atomic(map);
3597                        page = list_entry(page->lru.prev, struct page, lru);
3598                }
3599                ret = count == COUNT_CONTINUED;
3600        }
3601out:
3602        spin_unlock(&si->cont_lock);
3603        return ret;
3604}
3605
3606/*
3607 * free_swap_count_continuations - called by swapoff to free the continuation
3608 * pages appended to the swap_map, after it is quiesced and before vfree'ing it.
3609 */
3610static void free_swap_count_continuations(struct swap_info_struct *si)
3611{
3612        pgoff_t offset;
3613
3614        for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
3615                struct page *head;
3616                head = vmalloc_to_page(si->swap_map + offset);
3617                if (page_private(head)) {
3618                        struct page *page, *next;
3619
3620                        list_for_each_entry_safe(page, next, &head->lru, lru) {
3621                                list_del(&page->lru);
3622                                __free_page(page);
3623                        }
3624                }
3625        }
3626}
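
/*
 * The loop above steps by PAGE_SIZE because each vmalloc'ed page of the
 * swap_map covers PAGE_SIZE one-byte counts, and any continuation pages for
 * those entries are chained off that page's lru list (see
 * add_swap_count_continuation()).
 */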
3627
3628#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
3629void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
3630                                  gfp_t gfp_mask)
3631{
3632        struct swap_info_struct *si, *next;
3633        if (!(gfp_mask & __GFP_IO) || !memcg)
3634                return;
3635
3636        if (!blk_cgroup_congested())
3637                return;
3638
3639        /*
3640         * We've already scheduled a throttle, avoid taking the global swap
3641         * lock.
3642         */
3643        if (current->throttle_queue)
3644                return;
3645
3646        spin_lock(&swap_avail_lock);
3647        plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
3648                                  avail_lists[node]) {
3649                if (si->bdev) {
3650                        blkcg_schedule_throttle(bdev_get_queue(si->bdev),
3651                                                true);
3652                        break;
3653                }
3654        }
3655        spin_unlock(&swap_avail_lock);
3656}
3657#endif
3658
3659static int __init swapfile_init(void)
3660{
3661        int nid;
3662
3663        swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
3664                                         GFP_KERNEL);
3665        if (!swap_avail_heads) {
3666                pr_emerg("Not enough memory for swap heads, swap is disabled\n");
3667                return -ENOMEM;
3668        }
3669
3670        for_each_node(nid)
3671                plist_head_init(&swap_avail_heads[nid]);
3672
3673        return 0;
3674}
3675subsys_initcall(swapfile_init);
3676