linux/mm/swapfile.c
   1/*
   2 *  linux/mm/swapfile.c
   3 *
   4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5 *  Swap reorganised 29.12.95, Stephen Tweedie
   6 */
   7
   8#include <linux/mm.h>
   9#include <linux/hugetlb.h>
  10#include <linux/mman.h>
  11#include <linux/slab.h>
  12#include <linux/kernel_stat.h>
  13#include <linux/swap.h>
  14#include <linux/vmalloc.h>
  15#include <linux/pagemap.h>
  16#include <linux/namei.h>
  17#include <linux/shmem_fs.h>
  18#include <linux/blkdev.h>
  19#include <linux/random.h>
  20#include <linux/writeback.h>
  21#include <linux/proc_fs.h>
  22#include <linux/seq_file.h>
  23#include <linux/init.h>
  24#include <linux/ksm.h>
  25#include <linux/rmap.h>
  26#include <linux/security.h>
  27#include <linux/backing-dev.h>
  28#include <linux/mutex.h>
  29#include <linux/capability.h>
  30#include <linux/syscalls.h>
  31#include <linux/memcontrol.h>
  32#include <linux/poll.h>
  33#include <linux/oom.h>
  34#include <linux/frontswap.h>
  35#include <linux/swapfile.h>
  36#include <linux/export.h>
  37
  38#include <asm/pgtable.h>
  39#include <asm/tlbflush.h>
  40#include <linux/swapops.h>
  41#include <linux/swap_cgroup.h>
  42
  43static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
  44                                 unsigned char);
  45static void free_swap_count_continuations(struct swap_info_struct *);
  46static sector_t map_swap_entry(swp_entry_t, struct block_device**);
  47
  48DEFINE_SPINLOCK(swap_lock);
  49static unsigned int nr_swapfiles;
  50atomic_long_t nr_swap_pages;
  51/*
  52 * Some modules use swappable objects and may try to swap them out under
  53 * memory pressure (via the shrinker). Before doing so, they may wish to
  54 * check to see if any swap space is available.
  55 */
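/*
 * For example, such a shrinker might bail out early with something like
 * "if (get_nr_swap_pages() <= 0) return SHRINK_STOP;" -- get_nr_swap_pages()
 * in <linux/swap.h> wraps a read of this counter.
 */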
  56EXPORT_SYMBOL_GPL(nr_swap_pages);
   57/* protected with swap_lock; reading it in vm_swap_full() doesn't need the lock */
  58long total_swap_pages;
  59static int least_priority;
  60
  61static const char Bad_file[] = "Bad swap file entry ";
  62static const char Unused_file[] = "Unused swap file entry ";
  63static const char Bad_offset[] = "Bad swap offset entry ";
  64static const char Unused_offset[] = "Unused swap offset entry ";
  65
  66/*
  67 * all active swap_info_structs
  68 * protected with swap_lock, and ordered by priority.
  69 */
  70PLIST_HEAD(swap_active_head);
  71
  72/*
  73 * all available (active, not full) swap_info_structs
  74 * protected with swap_avail_lock, ordered by priority.
  75 * This is used by get_swap_page() instead of swap_active_head
  76 * because swap_active_head includes all swap_info_structs,
  77 * but get_swap_page() doesn't need to look at full ones.
  78 * This uses its own lock instead of swap_lock because when a
  79 * swap_info_struct changes between not-full/full, it needs to
  80 * add/remove itself to/from this list, but the swap_info_struct->lock
  81 * is held and the locking order requires swap_lock to be taken
  82 * before any swap_info_struct->lock.
  83 */
  84static PLIST_HEAD(swap_avail_head);
  85static DEFINE_SPINLOCK(swap_avail_lock);
  86
  87struct swap_info_struct *swap_info[MAX_SWAPFILES];
  88
  89static DEFINE_MUTEX(swapon_mutex);
  90
  91static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
  92/* Activity counter to indicate that a swapon or swapoff has occurred */
  93static atomic_t proc_poll_event = ATOMIC_INIT(0);
  94
  95static inline unsigned char swap_count(unsigned char ent)
  96{
   97        return ent & ~SWAP_HAS_CACHE;   /* may include COUNT_CONTINUED flag */
  98}
  99
 100/* returns 1 if swap entry is freed */
 101static int
 102__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
 103{
 104        swp_entry_t entry = swp_entry(si->type, offset);
 105        struct page *page;
 106        int ret = 0;
 107
 108        page = find_get_page(swap_address_space(entry), entry.val);
 109        if (!page)
 110                return 0;
 111        /*
  112         * This function is called from scan_swap_map(), which is called
  113         * by vmscan.c while reclaiming pages, so a page lock may already
  114         * be held here.  We have to use trylock to avoid deadlock.  This is
  115         * a special case; in usual operations use try_to_free_swap() with
  116         * an explicit lock_page().
 117         */
 118        if (trylock_page(page)) {
 119                ret = try_to_free_swap(page);
 120                unlock_page(page);
 121        }
 122        put_page(page);
 123        return ret;
 124}
 125
 126/*
  127 * swapon tells the device that all the old swap contents can be discarded,
 128 * to allow the swap device to optimize its wear-levelling.
 129 */
 130static int discard_swap(struct swap_info_struct *si)
 131{
 132        struct swap_extent *se;
 133        sector_t start_block;
 134        sector_t nr_blocks;
 135        int err = 0;
 136
 137        /* Do not discard the swap header page! */
 138        se = &si->first_swap_extent;
 139        start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
 140        nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
 141        if (nr_blocks) {
 142                err = blkdev_issue_discard(si->bdev, start_block,
 143                                nr_blocks, GFP_KERNEL, 0);
 144                if (err)
 145                        return err;
 146                cond_resched();
 147        }
 148
 149        list_for_each_entry(se, &si->first_swap_extent.list, list) {
 150                start_block = se->start_block << (PAGE_SHIFT - 9);
 151                nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 152
 153                err = blkdev_issue_discard(si->bdev, start_block,
 154                                nr_blocks, GFP_KERNEL, 0);
 155                if (err)
 156                        break;
 157
 158                cond_resched();
 159        }
 160        return err;             /* That will often be -EOPNOTSUPP */
 161}
 162
 163/*
  164 * swap allocation tells the device that a cluster of swap can now be discarded,
 165 * to allow the swap device to optimize its wear-levelling.
 166 */
 167static void discard_swap_cluster(struct swap_info_struct *si,
 168                                 pgoff_t start_page, pgoff_t nr_pages)
 169{
 170        struct swap_extent *se = si->curr_swap_extent;
 171        int found_extent = 0;
 172
 173        while (nr_pages) {
 174                if (se->start_page <= start_page &&
 175                    start_page < se->start_page + se->nr_pages) {
 176                        pgoff_t offset = start_page - se->start_page;
 177                        sector_t start_block = se->start_block + offset;
 178                        sector_t nr_blocks = se->nr_pages - offset;
 179
 180                        if (nr_blocks > nr_pages)
 181                                nr_blocks = nr_pages;
 182                        start_page += nr_blocks;
 183                        nr_pages -= nr_blocks;
 184
 185                        if (!found_extent++)
 186                                si->curr_swap_extent = se;
 187
 188                        start_block <<= PAGE_SHIFT - 9;
 189                        nr_blocks <<= PAGE_SHIFT - 9;
 190                        if (blkdev_issue_discard(si->bdev, start_block,
 191                                    nr_blocks, GFP_NOIO, 0))
 192                                break;
 193                }
 194
 195                se = list_next_entry(se, list);
 196        }
 197}
 198
 199#define SWAPFILE_CLUSTER        256
 200#define LATENCY_LIMIT           256
 201
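/*
 * The cluster_* helpers below wrap the swap_cluster_info encoding: each
 * entry has a flags field and a data field.  For a cluster that is in use,
 * data holds the count of allocated swap entries in it; for a cluster on
 * the free or discard list, data holds the index of the next cluster on
 * that list.  CLUSTER_FLAG_NEXT_NULL in a list head/tail means the list
 * is empty.
 */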
 202static inline void cluster_set_flag(struct swap_cluster_info *info,
 203        unsigned int flag)
 204{
 205        info->flags = flag;
 206}
 207
 208static inline unsigned int cluster_count(struct swap_cluster_info *info)
 209{
 210        return info->data;
 211}
 212
 213static inline void cluster_set_count(struct swap_cluster_info *info,
 214                                     unsigned int c)
 215{
 216        info->data = c;
 217}
 218
 219static inline void cluster_set_count_flag(struct swap_cluster_info *info,
 220                                         unsigned int c, unsigned int f)
 221{
 222        info->flags = f;
 223        info->data = c;
 224}
 225
 226static inline unsigned int cluster_next(struct swap_cluster_info *info)
 227{
 228        return info->data;
 229}
 230
 231static inline void cluster_set_next(struct swap_cluster_info *info,
 232                                    unsigned int n)
 233{
 234        info->data = n;
 235}
 236
 237static inline void cluster_set_next_flag(struct swap_cluster_info *info,
 238                                         unsigned int n, unsigned int f)
 239{
 240        info->flags = f;
 241        info->data = n;
 242}
 243
 244static inline bool cluster_is_free(struct swap_cluster_info *info)
 245{
 246        return info->flags & CLUSTER_FLAG_FREE;
 247}
 248
 249static inline bool cluster_is_null(struct swap_cluster_info *info)
 250{
 251        return info->flags & CLUSTER_FLAG_NEXT_NULL;
 252}
 253
 254static inline void cluster_set_null(struct swap_cluster_info *info)
 255{
 256        info->flags = CLUSTER_FLAG_NEXT_NULL;
 257        info->data = 0;
 258}
 259
  260/* Add a cluster to the discard list and schedule the discard work */
 261static void swap_cluster_schedule_discard(struct swap_info_struct *si,
 262                unsigned int idx)
 263{
 264        /*
 265         * If scan_swap_map() can't find a free cluster, it will check
 266         * si->swap_map directly. To make sure the discarding cluster isn't
  267         * taken by scan_swap_map(), mark the swap entries bad (occupied).
  268         * They will be cleared again after the discard completes.
 269         */
 270        memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 271                        SWAP_MAP_BAD, SWAPFILE_CLUSTER);
 272
 273        if (cluster_is_null(&si->discard_cluster_head)) {
 274                cluster_set_next_flag(&si->discard_cluster_head,
 275                                                idx, 0);
 276                cluster_set_next_flag(&si->discard_cluster_tail,
 277                                                idx, 0);
 278        } else {
 279                unsigned int tail = cluster_next(&si->discard_cluster_tail);
 280                cluster_set_next(&si->cluster_info[tail], idx);
 281                cluster_set_next_flag(&si->discard_cluster_tail,
 282                                                idx, 0);
 283        }
 284
 285        schedule_work(&si->discard_work);
 286}
 287
 288/*
  289 * Do the actual discard.  After a cluster discard is finished, the cluster
  290 * will be added to the free cluster list.  The caller should hold si->lock.
  291 */
 292static void swap_do_scheduled_discard(struct swap_info_struct *si)
 293{
 294        struct swap_cluster_info *info;
 295        unsigned int idx;
 296
 297        info = si->cluster_info;
 298
 299        while (!cluster_is_null(&si->discard_cluster_head)) {
 300                idx = cluster_next(&si->discard_cluster_head);
 301
 302                cluster_set_next_flag(&si->discard_cluster_head,
 303                                                cluster_next(&info[idx]), 0);
 304                if (cluster_next(&si->discard_cluster_tail) == idx) {
 305                        cluster_set_null(&si->discard_cluster_head);
 306                        cluster_set_null(&si->discard_cluster_tail);
 307                }
 308                spin_unlock(&si->lock);
 309
 310                discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
 311                                SWAPFILE_CLUSTER);
 312
 313                spin_lock(&si->lock);
 314                cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
 315                if (cluster_is_null(&si->free_cluster_head)) {
 316                        cluster_set_next_flag(&si->free_cluster_head,
 317                                                idx, 0);
 318                        cluster_set_next_flag(&si->free_cluster_tail,
 319                                                idx, 0);
 320                } else {
 321                        unsigned int tail;
 322
 323                        tail = cluster_next(&si->free_cluster_tail);
 324                        cluster_set_next(&info[tail], idx);
 325                        cluster_set_next_flag(&si->free_cluster_tail,
 326                                                idx, 0);
 327                }
 328                memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 329                                0, SWAPFILE_CLUSTER);
 330        }
 331}
 332
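/*
 * Work callback scheduled by swap_cluster_schedule_discard(): take si->lock
 * and issue the discards for all clusters queued on the discard list.
 */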
 333static void swap_discard_work(struct work_struct *work)
 334{
 335        struct swap_info_struct *si;
 336
 337        si = container_of(work, struct swap_info_struct, discard_work);
 338
 339        spin_lock(&si->lock);
 340        swap_do_scheduled_discard(si);
 341        spin_unlock(&si->lock);
 342}
 343
 344/*
 345 * The cluster corresponding to page_nr will be used. The cluster will be
  346 * removed from the free cluster list and its usage counter will be increased.
 347 */
 348static void inc_cluster_info_page(struct swap_info_struct *p,
 349        struct swap_cluster_info *cluster_info, unsigned long page_nr)
 350{
 351        unsigned long idx = page_nr / SWAPFILE_CLUSTER;
 352
 353        if (!cluster_info)
 354                return;
 355        if (cluster_is_free(&cluster_info[idx])) {
 356                VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx);
 357                cluster_set_next_flag(&p->free_cluster_head,
 358                        cluster_next(&cluster_info[idx]), 0);
 359                if (cluster_next(&p->free_cluster_tail) == idx) {
 360                        cluster_set_null(&p->free_cluster_tail);
 361                        cluster_set_null(&p->free_cluster_head);
 362                }
 363                cluster_set_count_flag(&cluster_info[idx], 0, 0);
 364        }
 365
 366        VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
 367        cluster_set_count(&cluster_info[idx],
 368                cluster_count(&cluster_info[idx]) + 1);
 369}
 370
 371/*
  372 * The cluster corresponding to page_nr has its usage count decreased by one.
  373 * If the usage counter becomes 0, meaning no page in the cluster is in use,
  374 * we can optionally discard the cluster and add it to the free cluster list.
 375 */
 376static void dec_cluster_info_page(struct swap_info_struct *p,
 377        struct swap_cluster_info *cluster_info, unsigned long page_nr)
 378{
 379        unsigned long idx = page_nr / SWAPFILE_CLUSTER;
 380
 381        if (!cluster_info)
 382                return;
 383
 384        VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
 385        cluster_set_count(&cluster_info[idx],
 386                cluster_count(&cluster_info[idx]) - 1);
 387
 388        if (cluster_count(&cluster_info[idx]) == 0) {
 389                /*
  390                 * If the swap is discardable, schedule a discard of the
  391                 * cluster instead of freeing it immediately.  The cluster
  392                 * will be freed after the discard completes.
 393                 */
 394                if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
 395                                 (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
 396                        swap_cluster_schedule_discard(p, idx);
 397                        return;
 398                }
 399
 400                cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
 401                if (cluster_is_null(&p->free_cluster_head)) {
 402                        cluster_set_next_flag(&p->free_cluster_head, idx, 0);
 403                        cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
 404                } else {
 405                        unsigned int tail = cluster_next(&p->free_cluster_tail);
 406                        cluster_set_next(&cluster_info[tail], idx);
 407                        cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
 408                }
 409        }
 410}
 411
 412/*
  413 * It's possible for scan_swap_map() to use a free cluster from the middle of
  414 * the free cluster list.  Avoid that, since it would corrupt the list.
 415 */
 416static bool
 417scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
 418        unsigned long offset)
 419{
 420        struct percpu_cluster *percpu_cluster;
 421        bool conflict;
 422
 423        offset /= SWAPFILE_CLUSTER;
 424        conflict = !cluster_is_null(&si->free_cluster_head) &&
 425                offset != cluster_next(&si->free_cluster_head) &&
 426                cluster_is_free(&si->cluster_info[offset]);
 427
 428        if (!conflict)
 429                return false;
 430
 431        percpu_cluster = this_cpu_ptr(si->percpu_cluster);
 432        cluster_set_null(&percpu_cluster->index);
 433        return true;
 434}
 435
 436/*
  437 * Try to get a swap entry from the current CPU's swap entry pool (a cluster).
  438 * This might also involve allocating a new cluster for the current CPU.
 439 */
 440static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 441        unsigned long *offset, unsigned long *scan_base)
 442{
 443        struct percpu_cluster *cluster;
 444        bool found_free;
 445        unsigned long tmp;
 446
 447new_cluster:
 448        cluster = this_cpu_ptr(si->percpu_cluster);
 449        if (cluster_is_null(&cluster->index)) {
 450                if (!cluster_is_null(&si->free_cluster_head)) {
 451                        cluster->index = si->free_cluster_head;
 452                        cluster->next = cluster_next(&cluster->index) *
 453                                        SWAPFILE_CLUSTER;
 454                } else if (!cluster_is_null(&si->discard_cluster_head)) {
 455                        /*
  456                         * we don't have a free cluster, but some clusters are
  457                         * queued for discard; do the discard now and reclaim them
 458                         */
 459                        swap_do_scheduled_discard(si);
 460                        *scan_base = *offset = si->cluster_next;
 461                        goto new_cluster;
 462                } else
 463                        return;
 464        }
 465
 466        found_free = false;
 467
 468        /*
  469         * Other CPUs can use our cluster if they can't find a free cluster;
  470         * check if there is still a free entry in the cluster
 471         */
 472        tmp = cluster->next;
 473        while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
 474               SWAPFILE_CLUSTER) {
 475                if (!si->swap_map[tmp]) {
 476                        found_free = true;
 477                        break;
 478                }
 479                tmp++;
 480        }
 481        if (!found_free) {
 482                cluster_set_null(&cluster->index);
 483                goto new_cluster;
 484        }
 485        cluster->next = tmp + 1;
 486        *offset = tmp;
 487        *scan_base = tmp;
 488}
 489
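/*
 * Find a free swap slot on this device and claim it with @usage
 * (SWAP_HAS_CACHE for ordinary swap-cache allocations, 1 for the
 * hibernation path).  Returns the page offset of the slot, or 0 if none
 * could be allocated.  Called with si->lock held; the lock may be dropped
 * and retaken while scanning.
 */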
 490static unsigned long scan_swap_map(struct swap_info_struct *si,
 491                                   unsigned char usage)
 492{
 493        unsigned long offset;
 494        unsigned long scan_base;
 495        unsigned long last_in_cluster = 0;
 496        int latency_ration = LATENCY_LIMIT;
 497
 498        /*
 499         * We try to cluster swap pages by allocating them sequentially
 500         * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
 501         * way, however, we resort to first-free allocation, starting
 502         * a new cluster.  This prevents us from scattering swap pages
 503         * all over the entire swap partition, so that we reduce
 504         * overall disk seek times between swap pages.  -- sct
 505         * But we do now try to find an empty cluster.  -Andrea
 506         * And we let swap pages go all over an SSD partition.  Hugh
 507         */
 508
 509        si->flags += SWP_SCANNING;
 510        scan_base = offset = si->cluster_next;
 511
 512        /* SSD algorithm */
 513        if (si->cluster_info) {
 514                scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
 515                goto checks;
 516        }
 517
 518        if (unlikely(!si->cluster_nr--)) {
 519                if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
 520                        si->cluster_nr = SWAPFILE_CLUSTER - 1;
 521                        goto checks;
 522                }
 523
 524                spin_unlock(&si->lock);
 525
 526                /*
 527                 * If seek is expensive, start searching for new cluster from
 528                 * start of partition, to minimize the span of allocated swap.
 529                 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
 530                 * case, just handled by scan_swap_map_try_ssd_cluster() above.
 531                 */
 532                scan_base = offset = si->lowest_bit;
 533                last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
 534
 535                /* Locate the first empty (unaligned) cluster */
 536                for (; last_in_cluster <= si->highest_bit; offset++) {
 537                        if (si->swap_map[offset])
 538                                last_in_cluster = offset + SWAPFILE_CLUSTER;
 539                        else if (offset == last_in_cluster) {
 540                                spin_lock(&si->lock);
 541                                offset -= SWAPFILE_CLUSTER - 1;
 542                                si->cluster_next = offset;
 543                                si->cluster_nr = SWAPFILE_CLUSTER - 1;
 544                                goto checks;
 545                        }
 546                        if (unlikely(--latency_ration < 0)) {
 547                                cond_resched();
 548                                latency_ration = LATENCY_LIMIT;
 549                        }
 550                }
 551
 552                offset = scan_base;
 553                spin_lock(&si->lock);
 554                si->cluster_nr = SWAPFILE_CLUSTER - 1;
 555        }
 556
 557checks:
 558        if (si->cluster_info) {
 559                while (scan_swap_map_ssd_cluster_conflict(si, offset))
 560                        scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
 561        }
 562        if (!(si->flags & SWP_WRITEOK))
 563                goto no_page;
 564        if (!si->highest_bit)
 565                goto no_page;
 566        if (offset > si->highest_bit)
 567                scan_base = offset = si->lowest_bit;
 568
 569        /* reuse swap entry of cache-only swap if not busy. */
 570        if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
 571                int swap_was_freed;
 572                spin_unlock(&si->lock);
 573                swap_was_freed = __try_to_reclaim_swap(si, offset);
 574                spin_lock(&si->lock);
 575                /* entry was freed successfully, try to use this again */
 576                if (swap_was_freed)
 577                        goto checks;
 578                goto scan; /* check next one */
 579        }
 580
 581        if (si->swap_map[offset])
 582                goto scan;
 583
 584        if (offset == si->lowest_bit)
 585                si->lowest_bit++;
 586        if (offset == si->highest_bit)
 587                si->highest_bit--;
 588        si->inuse_pages++;
 589        if (si->inuse_pages == si->pages) {
 590                si->lowest_bit = si->max;
 591                si->highest_bit = 0;
 592                spin_lock(&swap_avail_lock);
 593                plist_del(&si->avail_list, &swap_avail_head);
 594                spin_unlock(&swap_avail_lock);
 595        }
 596        si->swap_map[offset] = usage;
 597        inc_cluster_info_page(si, si->cluster_info, offset);
 598        si->cluster_next = offset + 1;
 599        si->flags -= SWP_SCANNING;
 600
 601        return offset;
 602
 603scan:
 604        spin_unlock(&si->lock);
 605        while (++offset <= si->highest_bit) {
 606                if (!si->swap_map[offset]) {
 607                        spin_lock(&si->lock);
 608                        goto checks;
 609                }
 610                if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
 611                        spin_lock(&si->lock);
 612                        goto checks;
 613                }
 614                if (unlikely(--latency_ration < 0)) {
 615                        cond_resched();
 616                        latency_ration = LATENCY_LIMIT;
 617                }
 618        }
 619        offset = si->lowest_bit;
 620        while (offset < scan_base) {
 621                if (!si->swap_map[offset]) {
 622                        spin_lock(&si->lock);
 623                        goto checks;
 624                }
 625                if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
 626                        spin_lock(&si->lock);
 627                        goto checks;
 628                }
 629                if (unlikely(--latency_ration < 0)) {
 630                        cond_resched();
 631                        latency_ration = LATENCY_LIMIT;
 632                }
 633                offset++;
 634        }
 635        spin_lock(&si->lock);
 636
 637no_page:
 638        si->flags -= SWP_SCANNING;
 639        return 0;
 640}
 641
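/*
 * Allocate one swap entry, preferring the highest-priority device that
 * still has free slots.  Devices of equal priority are used round-robin
 * via plist_requeue().  Returns a swp_entry_t of value 0 when no swap
 * space is available.
 */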
 642swp_entry_t get_swap_page(void)
 643{
 644        struct swap_info_struct *si, *next;
 645        pgoff_t offset;
 646
 647        if (atomic_long_read(&nr_swap_pages) <= 0)
 648                goto noswap;
 649        atomic_long_dec(&nr_swap_pages);
 650
 651        spin_lock(&swap_avail_lock);
 652
 653start_over:
 654        plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
 655                /* requeue si to after same-priority siblings */
 656                plist_requeue(&si->avail_list, &swap_avail_head);
 657                spin_unlock(&swap_avail_lock);
 658                spin_lock(&si->lock);
 659                if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
 660                        spin_lock(&swap_avail_lock);
 661                        if (plist_node_empty(&si->avail_list)) {
 662                                spin_unlock(&si->lock);
 663                                goto nextsi;
 664                        }
 665                        WARN(!si->highest_bit,
 666                             "swap_info %d in list but !highest_bit\n",
 667                             si->type);
 668                        WARN(!(si->flags & SWP_WRITEOK),
 669                             "swap_info %d in list but !SWP_WRITEOK\n",
 670                             si->type);
 671                        plist_del(&si->avail_list, &swap_avail_head);
 672                        spin_unlock(&si->lock);
 673                        goto nextsi;
 674                }
 675
  676                /* This is called to allocate a swap entry for the swap cache */
 677                offset = scan_swap_map(si, SWAP_HAS_CACHE);
 678                spin_unlock(&si->lock);
 679                if (offset)
 680                        return swp_entry(si->type, offset);
 681                pr_debug("scan_swap_map of si %d failed to find offset\n",
 682                       si->type);
 683                spin_lock(&swap_avail_lock);
 684nextsi:
 685                /*
 686                 * if we got here, it's likely that si was almost full before,
 687                 * and since scan_swap_map() can drop the si->lock, multiple
 688                 * callers probably all tried to get a page from the same si
 689                 * and it filled up before we could get one; or, the si filled
 690                 * up between us dropping swap_avail_lock and taking si->lock.
 691                 * Since we dropped the swap_avail_lock, the swap_avail_head
 692                 * list may have been modified; so if next is still in the
 693                 * swap_avail_head list then try it, otherwise start over.
 694                 */
 695                if (plist_node_empty(&next->avail_list))
 696                        goto start_over;
 697        }
 698
 699        spin_unlock(&swap_avail_lock);
 700
 701        atomic_long_inc(&nr_swap_pages);
 702noswap:
 703        return (swp_entry_t) {0};
 704}
 705
 706/* The only caller of this function is now suspend routine */
 707swp_entry_t get_swap_page_of_type(int type)
 708{
 709        struct swap_info_struct *si;
 710        pgoff_t offset;
 711
 712        si = swap_info[type];
 713        spin_lock(&si->lock);
 714        if (si && (si->flags & SWP_WRITEOK)) {
 715                atomic_long_dec(&nr_swap_pages);
  716                /* This is called to allocate a swap entry, not a cache slot */
 717                offset = scan_swap_map(si, 1);
 718                if (offset) {
 719                        spin_unlock(&si->lock);
 720                        return swp_entry(type, offset);
 721                }
 722                atomic_long_inc(&nr_swap_pages);
 723        }
 724        spin_unlock(&si->lock);
 725        return (swp_entry_t) {0};
 726}
 727
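/*
 * Look up and sanity-check the swap device backing @entry.  On success the
 * swap_info_struct is returned with its lock held; on any inconsistency a
 * message is logged and NULL is returned.
 */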
 728static struct swap_info_struct *swap_info_get(swp_entry_t entry)
 729{
 730        struct swap_info_struct *p;
 731        unsigned long offset, type;
 732
 733        if (!entry.val)
 734                goto out;
 735        type = swp_type(entry);
 736        if (type >= nr_swapfiles)
 737                goto bad_nofile;
 738        p = swap_info[type];
 739        if (!(p->flags & SWP_USED))
 740                goto bad_device;
 741        offset = swp_offset(entry);
 742        if (offset >= p->max)
 743                goto bad_offset;
 744        if (!p->swap_map[offset])
 745                goto bad_free;
 746        spin_lock(&p->lock);
 747        return p;
 748
 749bad_free:
 750        pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val);
 751        goto out;
 752bad_offset:
 753        pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val);
 754        goto out;
 755bad_device:
 756        pr_err("swap_free: %s%08lx\n", Unused_file, entry.val);
 757        goto out;
 758bad_nofile:
 759        pr_err("swap_free: %s%08lx\n", Bad_file, entry.val);
 760out:
 761        return NULL;
 762}
 763
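/*
 * Drop one reference of kind @usage (SWAP_HAS_CACHE, or one map count) from
 * the slot backing @entry, with p->lock held.  Returns the resulting
 * swap_map value; when it reaches zero the slot is freed and the device is
 * made available for allocation again.
 */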
 764static unsigned char swap_entry_free(struct swap_info_struct *p,
 765                                     swp_entry_t entry, unsigned char usage)
 766{
 767        unsigned long offset = swp_offset(entry);
 768        unsigned char count;
 769        unsigned char has_cache;
 770
 771        count = p->swap_map[offset];
 772        has_cache = count & SWAP_HAS_CACHE;
 773        count &= ~SWAP_HAS_CACHE;
 774
 775        if (usage == SWAP_HAS_CACHE) {
 776                VM_BUG_ON(!has_cache);
 777                has_cache = 0;
 778        } else if (count == SWAP_MAP_SHMEM) {
 779                /*
 780                 * Or we could insist on shmem.c using a special
 781                 * swap_shmem_free() and free_shmem_swap_and_cache()...
 782                 */
 783                count = 0;
 784        } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
 785                if (count == COUNT_CONTINUED) {
 786                        if (swap_count_continued(p, offset, count))
 787                                count = SWAP_MAP_MAX | COUNT_CONTINUED;
 788                        else
 789                                count = SWAP_MAP_MAX;
 790                } else
 791                        count--;
 792        }
 793
 794        usage = count | has_cache;
 795        p->swap_map[offset] = usage;
 796
 797        /* free if no reference */
 798        if (!usage) {
 799                mem_cgroup_uncharge_swap(entry);
 800                dec_cluster_info_page(p, p->cluster_info, offset);
 801                if (offset < p->lowest_bit)
 802                        p->lowest_bit = offset;
 803                if (offset > p->highest_bit) {
 804                        bool was_full = !p->highest_bit;
 805                        p->highest_bit = offset;
 806                        if (was_full && (p->flags & SWP_WRITEOK)) {
 807                                spin_lock(&swap_avail_lock);
 808                                WARN_ON(!plist_node_empty(&p->avail_list));
 809                                if (plist_node_empty(&p->avail_list))
 810                                        plist_add(&p->avail_list,
 811                                                  &swap_avail_head);
 812                                spin_unlock(&swap_avail_lock);
 813                        }
 814                }
 815                atomic_long_inc(&nr_swap_pages);
 816                p->inuse_pages--;
 817                frontswap_invalidate_page(p->type, offset);
 818                if (p->flags & SWP_BLKDEV) {
 819                        struct gendisk *disk = p->bdev->bd_disk;
 820                        if (disk->fops->swap_slot_free_notify)
 821                                disk->fops->swap_slot_free_notify(p->bdev,
 822                                                                  offset);
 823                }
 824        }
 825
 826        return usage;
 827}
 828
 829/*
 830 * Caller has made sure that the swap device corresponding to entry
 831 * is still around or has not been recycled.
 832 */
 833void swap_free(swp_entry_t entry)
 834{
 835        struct swap_info_struct *p;
 836
 837        p = swap_info_get(entry);
 838        if (p) {
 839                swap_entry_free(p, entry, 1);
 840                spin_unlock(&p->lock);
 841        }
 842}
 843
 844/*
  845 * Called after dropping swapcache to decrease the refcount of swap entries.
 846 */
 847void swapcache_free(swp_entry_t entry)
 848{
 849        struct swap_info_struct *p;
 850
 851        p = swap_info_get(entry);
 852        if (p) {
 853                swap_entry_free(p, entry, SWAP_HAS_CACHE);
 854                spin_unlock(&p->lock);
 855        }
 856}
 857
 858/*
 859 * How many references to page are currently swapped out?
 860 * This does not give an exact answer when swap count is continued,
 861 * but does include the high COUNT_CONTINUED flag to allow for that.
 862 */
 863int page_swapcount(struct page *page)
 864{
 865        int count = 0;
 866        struct swap_info_struct *p;
 867        swp_entry_t entry;
 868
 869        entry.val = page_private(page);
 870        p = swap_info_get(entry);
 871        if (p) {
 872                count = swap_count(p->swap_map[swp_offset(entry)]);
 873                spin_unlock(&p->lock);
 874        }
 875        return count;
 876}
 877
 878/*
 879 * How many references to @entry are currently swapped out?
 880 * This considers COUNT_CONTINUED so it returns exact answer.
 881 */
 882int swp_swapcount(swp_entry_t entry)
 883{
 884        int count, tmp_count, n;
 885        struct swap_info_struct *p;
 886        struct page *page;
 887        pgoff_t offset;
 888        unsigned char *map;
 889
 890        p = swap_info_get(entry);
 891        if (!p)
 892                return 0;
 893
 894        count = swap_count(p->swap_map[swp_offset(entry)]);
 895        if (!(count & COUNT_CONTINUED))
 896                goto out;
 897
 898        count &= ~COUNT_CONTINUED;
 899        n = SWAP_MAP_MAX + 1;
 900
 901        offset = swp_offset(entry);
 902        page = vmalloc_to_page(p->swap_map + offset);
 903        offset &= ~PAGE_MASK;
 904        VM_BUG_ON(page_private(page) != SWP_CONTINUED);
 905
 906        do {
 907                page = list_next_entry(page, lru);
 908                map = kmap_atomic(page);
 909                tmp_count = map[offset];
 910                kunmap_atomic(map);
 911
 912                count += (tmp_count & ~COUNT_CONTINUED) * n;
 913                n *= (SWAP_CONT_MAX + 1);
 914        } while (tmp_count & COUNT_CONTINUED);
 915out:
 916        spin_unlock(&p->lock);
 917        return count;
 918}
 919
 920/*
 921 * We can write to an anon page without COW if there are no other references
 922 * to it.  And as a side-effect, free up its swap: because the old content
 923 * on disk will never be read, and seeking back there to write new content
 924 * later would only waste time away from clustering.
 925 *
 926 * NOTE: total_mapcount should not be relied upon by the caller if
  927 * reuse_swap_page() returns false, but it may always be overwritten
 928 * (see the other implementation for CONFIG_SWAP=n).
 929 */
 930bool reuse_swap_page(struct page *page, int *total_mapcount)
 931{
 932        int count;
 933
 934        VM_BUG_ON_PAGE(!PageLocked(page), page);
 935        if (unlikely(PageKsm(page)))
 936                return false;
 937        count = page_trans_huge_mapcount(page, total_mapcount);
 938        if (count <= 1 && PageSwapCache(page)) {
 939                count += page_swapcount(page);
 940                if (count == 1 && !PageWriteback(page)) {
 941                        delete_from_swap_cache(page);
 942                        SetPageDirty(page);
 943                }
 944        }
 945        return count <= 1;
 946}
 947
 948/*
 949 * If swap is getting full, or if there are no more mappings of this page,
 950 * then try_to_free_swap is called to free its swap space.
 951 */
 952int try_to_free_swap(struct page *page)
 953{
 954        VM_BUG_ON_PAGE(!PageLocked(page), page);
 955
 956        if (!PageSwapCache(page))
 957                return 0;
 958        if (PageWriteback(page))
 959                return 0;
 960        if (page_swapcount(page))
 961                return 0;
 962
 963        /*
 964         * Once hibernation has begun to create its image of memory,
 965         * there's a danger that one of the calls to try_to_free_swap()
 966         * - most probably a call from __try_to_reclaim_swap() while
 967         * hibernation is allocating its own swap pages for the image,
 968         * but conceivably even a call from memory reclaim - will free
 969         * the swap from a page which has already been recorded in the
 970         * image as a clean swapcache page, and then reuse its swap for
 971         * another page of the image.  On waking from hibernation, the
 972         * original page might be freed under memory pressure, then
 973         * later read back in from swap, now with the wrong data.
 974         *
 975         * Hibernation suspends storage while it is writing the image
 976         * to disk so check that here.
 977         */
 978        if (pm_suspended_storage())
 979                return 0;
 980
 981        delete_from_swap_cache(page);
 982        SetPageDirty(page);
 983        return 1;
 984}
 985
 986/*
 987 * Free the swap entry like above, but also try to
 988 * free the page cache entry if it is the last user.
 989 */
 990int free_swap_and_cache(swp_entry_t entry)
 991{
 992        struct swap_info_struct *p;
 993        struct page *page = NULL;
 994
 995        if (non_swap_entry(entry))
 996                return 1;
 997
 998        p = swap_info_get(entry);
 999        if (p) {
1000                if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
1001                        page = find_get_page(swap_address_space(entry),
1002                                                entry.val);
1003                        if (page && !trylock_page(page)) {
1004                                put_page(page);
1005                                page = NULL;
1006                        }
1007                }
1008                spin_unlock(&p->lock);
1009        }
1010        if (page) {
1011                /*
1012                 * Not mapped elsewhere, or swap space full? Free it!
1013                 * Also recheck PageSwapCache now page is locked (above).
1014                 */
1015                if (PageSwapCache(page) && !PageWriteback(page) &&
1016                    (!page_mapped(page) || mem_cgroup_swap_full(page))) {
1017                        delete_from_swap_cache(page);
1018                        SetPageDirty(page);
1019                }
1020                unlock_page(page);
1021                put_page(page);
1022        }
1023        return p != NULL;
1024}
1025
1026#ifdef CONFIG_HIBERNATION
1027/*
1028 * Find the swap type that corresponds to given device (if any).
1029 *
1030 * @offset - number of the PAGE_SIZE-sized block of the device, starting
1031 * from 0, in which the swap header is expected to be located.
1032 *
1033 * This is needed for the suspend to disk (aka swsusp).
1034 */
1035int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
1036{
1037        struct block_device *bdev = NULL;
1038        int type;
1039
1040        if (device)
1041                bdev = bdget(device);
1042
1043        spin_lock(&swap_lock);
1044        for (type = 0; type < nr_swapfiles; type++) {
1045                struct swap_info_struct *sis = swap_info[type];
1046
1047                if (!(sis->flags & SWP_WRITEOK))
1048                        continue;
1049
1050                if (!bdev) {
1051                        if (bdev_p)
1052                                *bdev_p = bdgrab(sis->bdev);
1053
1054                        spin_unlock(&swap_lock);
1055                        return type;
1056                }
1057                if (bdev == sis->bdev) {
1058                        struct swap_extent *se = &sis->first_swap_extent;
1059
1060                        if (se->start_block == offset) {
1061                                if (bdev_p)
1062                                        *bdev_p = bdgrab(sis->bdev);
1063
1064                                spin_unlock(&swap_lock);
1065                                bdput(bdev);
1066                                return type;
1067                        }
1068                }
1069        }
1070        spin_unlock(&swap_lock);
1071        if (bdev)
1072                bdput(bdev);
1073
1074        return -ENODEV;
1075}
1076
1077/*
1078 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
1079 * corresponding to given index in swap_info (swap type).
1080 */
1081sector_t swapdev_block(int type, pgoff_t offset)
1082{
1083        struct block_device *bdev;
1084
1085        if ((unsigned int)type >= nr_swapfiles)
1086                return 0;
1087        if (!(swap_info[type]->flags & SWP_WRITEOK))
1088                return 0;
1089        return map_swap_entry(swp_entry(type, offset), &bdev);
1090}
1091
1092/*
1093 * Return either the total number of swap pages of given type, or the number
1094 * of free pages of that type (depending on @free)
1095 *
1096 * This is needed for software suspend
1097 */
1098unsigned int count_swap_pages(int type, int free)
1099{
1100        unsigned int n = 0;
1101
1102        spin_lock(&swap_lock);
1103        if ((unsigned int)type < nr_swapfiles) {
1104                struct swap_info_struct *sis = swap_info[type];
1105
1106                spin_lock(&sis->lock);
1107                if (sis->flags & SWP_WRITEOK) {
1108                        n = sis->pages;
1109                        if (free)
1110                                n -= sis->inuse_pages;
1111                }
1112                spin_unlock(&sis->lock);
1113        }
1114        spin_unlock(&swap_lock);
1115        return n;
1116}
1117#endif /* CONFIG_HIBERNATION */
1118
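/*
 * Compare a pte against a freshly constructed swap pte, ignoring the
 * soft-dirty bit that the stored swap pte may carry.
 */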
1119static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
1120{
1121        return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
1122}
1123
1124/*
1125 * No need to decide whether this PTE shares the swap entry with others,
1126 * just let do_wp_page work it out if a write is requested later - to
1127 * force COW, vm_page_prot omits write permission from any private vma.
1128 */
1129static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
1130                unsigned long addr, swp_entry_t entry, struct page *page)
1131{
1132        struct page *swapcache;
1133        struct mem_cgroup *memcg;
1134        spinlock_t *ptl;
1135        pte_t *pte;
1136        int ret = 1;
1137
1138        swapcache = page;
1139        page = ksm_might_need_to_copy(page, vma, addr);
1140        if (unlikely(!page))
1141                return -ENOMEM;
1142
1143        if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
1144                                &memcg, false)) {
1145                ret = -ENOMEM;
1146                goto out_nolock;
1147        }
1148
1149        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1150        if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
1151                mem_cgroup_cancel_charge(page, memcg, false);
1152                ret = 0;
1153                goto out;
1154        }
1155
1156        dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
1157        inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
1158        get_page(page);
1159        set_pte_at(vma->vm_mm, addr, pte,
1160                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
1161        if (page == swapcache) {
1162                page_add_anon_rmap(page, vma, addr, false);
1163                mem_cgroup_commit_charge(page, memcg, true, false);
1164        } else { /* ksm created a completely new copy */
1165                page_add_new_anon_rmap(page, vma, addr, false);
1166                mem_cgroup_commit_charge(page, memcg, false, false);
1167                lru_cache_add_active_or_unevictable(page, vma);
1168        }
1169        swap_free(entry);
1170        /*
1171         * Move the page to the active list so it is not
1172         * immediately swapped out again after swapon.
1173         */
1174        activate_page(page);
1175out:
1176        pte_unmap_unlock(pte, ptl);
1177out_nolock:
1178        if (page != swapcache) {
1179                unlock_page(page);
1180                put_page(page);
1181        }
1182        return ret;
1183}
1184
1185static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1186                                unsigned long addr, unsigned long end,
1187                                swp_entry_t entry, struct page *page)
1188{
1189        pte_t swp_pte = swp_entry_to_pte(entry);
1190        pte_t *pte;
1191        int ret = 0;
1192
1193        /*
1194         * We don't actually need pte lock while scanning for swp_pte: since
1195         * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
1196         * page table while we're scanning; though it could get zapped, and on
1197         * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
1198         * of unmatched parts which look like swp_pte, so unuse_pte must
1199         * recheck under pte lock.  Scanning without pte lock lets it be
1200         * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
1201         */
1202        pte = pte_offset_map(pmd, addr);
1203        do {
1204                /*
1205                 * swapoff spends a _lot_ of time in this loop!
1206                 * Test inline before going to call unuse_pte.
1207                 */
1208                if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
1209                        pte_unmap(pte);
1210                        ret = unuse_pte(vma, pmd, addr, entry, page);
1211                        if (ret)
1212                                goto out;
1213                        pte = pte_offset_map(pmd, addr);
1214                }
1215        } while (pte++, addr += PAGE_SIZE, addr != end);
1216        pte_unmap(pte - 1);
1217out:
1218        return ret;
1219}
1220
1221static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
1222                                unsigned long addr, unsigned long end,
1223                                swp_entry_t entry, struct page *page)
1224{
1225        pmd_t *pmd;
1226        unsigned long next;
1227        int ret;
1228
1229        pmd = pmd_offset(pud, addr);
1230        do {
1231                next = pmd_addr_end(addr, end);
1232                if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1233                        continue;
1234                ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
1235                if (ret)
1236                        return ret;
1237        } while (pmd++, addr = next, addr != end);
1238        return 0;
1239}
1240
1241static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
1242                                unsigned long addr, unsigned long end,
1243                                swp_entry_t entry, struct page *page)
1244{
1245        pud_t *pud;
1246        unsigned long next;
1247        int ret;
1248
1249        pud = pud_offset(pgd, addr);
1250        do {
1251                next = pud_addr_end(addr, end);
1252                if (pud_none_or_clear_bad(pud))
1253                        continue;
1254                ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
1255                if (ret)
1256                        return ret;
1257        } while (pud++, addr = next, addr != end);
1258        return 0;
1259}
1260
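/*
 * Search one vma for a pte holding @entry and, if found, replace it with a
 * reference to @page.  For an anon page only the single address where it
 * could live in this vma needs checking; otherwise the whole range is
 * walked.
 */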
1261static int unuse_vma(struct vm_area_struct *vma,
1262                                swp_entry_t entry, struct page *page)
1263{
1264        pgd_t *pgd;
1265        unsigned long addr, end, next;
1266        int ret;
1267
1268        if (page_anon_vma(page)) {
1269                addr = page_address_in_vma(page, vma);
1270                if (addr == -EFAULT)
1271                        return 0;
1272                else
1273                        end = addr + PAGE_SIZE;
1274        } else {
1275                addr = vma->vm_start;
1276                end = vma->vm_end;
1277        }
1278
1279        pgd = pgd_offset(vma->vm_mm, addr);
1280        do {
1281                next = pgd_addr_end(addr, end);
1282                if (pgd_none_or_clear_bad(pgd))
1283                        continue;
1284                ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
1285                if (ret)
1286                        return ret;
1287        } while (pgd++, addr = next, addr != end);
1288        return 0;
1289}
1290
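/*
 * Walk every vma of @mm looking for @entry.  If mmap_sem cannot be taken
 * without blocking, the page lock is dropped while waiting for it, so that
 * swapoff cannot deadlock against a fault holding mmap_sem and waiting on
 * the page lock.
 */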
1291static int unuse_mm(struct mm_struct *mm,
1292                                swp_entry_t entry, struct page *page)
1293{
1294        struct vm_area_struct *vma;
1295        int ret = 0;
1296
1297        if (!down_read_trylock(&mm->mmap_sem)) {
1298                /*
1299                 * Activate page so shrink_inactive_list is unlikely to unmap
1300                 * its ptes while lock is dropped, so swapoff can make progress.
1301                 */
1302                activate_page(page);
1303                unlock_page(page);
1304                down_read(&mm->mmap_sem);
1305                lock_page(page);
1306        }
1307        for (vma = mm->mmap; vma; vma = vma->vm_next) {
1308                if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1309                        break;
1310        }
1311        up_read(&mm->mmap_sem);
1312        return (ret < 0)? ret: 0;
1313}
1314
1315/*
1316 * Scan swap_map (or frontswap_map if frontswap parameter is true)
1317 * from current position to next entry still in use.
1318 * Recycle to start on reaching the end, returning 0 when empty.
1319 */
1320static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1321                                        unsigned int prev, bool frontswap)
1322{
1323        unsigned int max = si->max;
1324        unsigned int i = prev;
1325        unsigned char count;
1326
1327        /*
1328         * No need for swap_lock here: we're just looking
1329         * for whether an entry is in use, not modifying it; false
1330         * hits are okay, and sys_swapoff() has already prevented new
1331         * allocations from this area (while holding swap_lock).
1332         */
1333        for (;;) {
1334                if (++i >= max) {
1335                        if (!prev) {
1336                                i = 0;
1337                                break;
1338                        }
1339                        /*
1340                         * No entries in use at top of swap_map,
1341                         * loop back to start and recheck there.
1342                         */
1343                        max = prev + 1;
1344                        prev = 0;
1345                        i = 1;
1346                }
1347                if (frontswap) {
1348                        if (frontswap_test(si, i))
1349                                break;
1350                        else
1351                                continue;
1352                }
1353                count = READ_ONCE(si->swap_map[i]);
1354                if (count && swap_count(count) != SWAP_MAP_BAD)
1355                        break;
1356        }
1357        return i;
1358}
1359
1360/*
1361 * We completely avoid races by reading each swap page in advance,
1362 * and then search for the process using it.  All the necessary
1363 * page table adjustments can then be made atomically.
1364 *
1365 * if the boolean frontswap is true, only unuse pages_to_unuse pages;
1366 * pages_to_unuse==0 means all pages; ignored if frontswap is false
1367 */
1368int try_to_unuse(unsigned int type, bool frontswap,
1369                 unsigned long pages_to_unuse)
1370{
1371        struct swap_info_struct *si = swap_info[type];
1372        struct mm_struct *start_mm;
1373        volatile unsigned char *swap_map; /* swap_map is accessed without
1374                                           * locking. Mark it as volatile
1375                                           * to prevent compiler doing
1376                                           * something odd.
1377                                           */
1378        unsigned char swcount;
1379        struct page *page;
1380        swp_entry_t entry;
1381        unsigned int i = 0;
1382        int retval = 0;
1383
1384        /*
1385         * When searching mms for an entry, a good strategy is to
1386         * start at the first mm we freed the previous entry from
1387         * (though actually we don't notice whether we or coincidence
1388         * freed the entry).  Initialize this start_mm with a hold.
1389         *
1390         * A simpler strategy would be to start at the last mm we
1391         * freed the previous entry from; but that would take less
1392         * advantage of mmlist ordering, which clusters forked mms
1393         * together, child after parent.  If we race with dup_mmap(), we
1394         * prefer to resolve parent before child, lest we miss entries
1395         * duplicated after we scanned child: using last mm would invert
1396         * that.
1397         */
1398        start_mm = &init_mm;
1399        atomic_inc(&init_mm.mm_users);
1400
1401        /*
1402         * Keep on scanning until all entries have gone.  Usually,
1403         * one pass through swap_map is enough, but not necessarily:
1404         * there are races when an instance of an entry might be missed.
1405         */
1406        while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
1407                if (signal_pending(current)) {
1408                        retval = -EINTR;
1409                        break;
1410                }
1411
1412                /*
1413                 * Get a page for the entry, using the existing swap
1414                 * cache page if there is one.  Otherwise, get a clean
1415                 * page and read the swap into it.
1416                 */
1417                swap_map = &si->swap_map[i];
1418                entry = swp_entry(type, i);
1419                page = read_swap_cache_async(entry,
1420                                        GFP_HIGHUSER_MOVABLE, NULL, 0);
1421                if (!page) {
1422                        /*
1423                         * Either swap_duplicate() failed because entry
1424                         * has been freed independently, and will not be
1425                         * reused since sys_swapoff() already disabled
1426                         * allocation from here, or alloc_page() failed.
1427                         */
1428                        swcount = *swap_map;
1429                        /*
 1430                         * We don't hold the lock here, so the swap entry could
 1431                         * be SWAP_MAP_BAD (when the cluster is being discarded).
 1432                         * Instead of failing out, we can just skip the swap
 1433                         * entry because swapoff will wait for the discard to
 1434                         * finish anyway.
1435                         */
1436                        if (!swcount || swcount == SWAP_MAP_BAD)
1437                                continue;
1438                        retval = -ENOMEM;
1439                        break;
1440                }
1441
1442                /*
1443                 * Don't hold on to start_mm if it looks like exiting.
1444                 */
1445                if (atomic_read(&start_mm->mm_users) == 1) {
1446                        mmput(start_mm);
1447                        start_mm = &init_mm;
1448                        atomic_inc(&init_mm.mm_users);
1449                }
1450
1451                /*
1452                 * Wait for and lock page.  When do_swap_page races with
1453                 * try_to_unuse, do_swap_page can handle the fault much
1454                 * faster than try_to_unuse can locate the entry.  This
1455                 * apparently redundant "wait_on_page_locked" lets try_to_unuse
1456                 * defer to do_swap_page in such a case - in some tests,
1457                 * do_swap_page and try_to_unuse repeatedly compete.
1458                 */
1459                wait_on_page_locked(page);
1460                wait_on_page_writeback(page);
1461                lock_page(page);
1462                wait_on_page_writeback(page);
1463
1464                /*
1465                 * Remove all references to entry.
1466                 */
1467                swcount = *swap_map;
1468                if (swap_count(swcount) == SWAP_MAP_SHMEM) {
1469                        retval = shmem_unuse(entry, page);
1470                        /* page has already been unlocked and released */
1471                        if (retval < 0)
1472                                break;
1473                        continue;
1474                }
1475                if (swap_count(swcount) && start_mm != &init_mm)
1476                        retval = unuse_mm(start_mm, entry, page);
1477
1478                if (swap_count(*swap_map)) {
1479                        int set_start_mm = (*swap_map >= swcount);
1480                        struct list_head *p = &start_mm->mmlist;
1481                        struct mm_struct *new_start_mm = start_mm;
1482                        struct mm_struct *prev_mm = start_mm;
1483                        struct mm_struct *mm;
1484
1485                        atomic_inc(&new_start_mm->mm_users);
1486                        atomic_inc(&prev_mm->mm_users);
1487                        spin_lock(&mmlist_lock);
1488                        while (swap_count(*swap_map) && !retval &&
1489                                        (p = p->next) != &start_mm->mmlist) {
1490                                mm = list_entry(p, struct mm_struct, mmlist);
1491                                if (!atomic_inc_not_zero(&mm->mm_users))
1492                                        continue;
1493                                spin_unlock(&mmlist_lock);
1494                                mmput(prev_mm);
1495                                prev_mm = mm;
1496
1497                                cond_resched();
1498
1499                                swcount = *swap_map;
1500                                if (!swap_count(swcount)) /* any usage ? */
1501                                        ;
1502                                else if (mm == &init_mm)
1503                                        set_start_mm = 1;
1504                                else
1505                                        retval = unuse_mm(mm, entry, page);
1506
1507                                if (set_start_mm && *swap_map < swcount) {
1508                                        mmput(new_start_mm);
1509                                        atomic_inc(&mm->mm_users);
1510                                        new_start_mm = mm;
1511                                        set_start_mm = 0;
1512                                }
1513                                spin_lock(&mmlist_lock);
1514                        }
1515                        spin_unlock(&mmlist_lock);
1516                        mmput(prev_mm);
1517                        mmput(start_mm);
1518                        start_mm = new_start_mm;
1519                }
1520                if (retval) {
1521                        unlock_page(page);
1522                        put_page(page);
1523                        break;
1524                }
1525
1526                /*
1527                 * If a reference remains (rare), we would like to leave
1528                 * the page in the swap cache; but try_to_unmap could
1529                 * then re-duplicate the entry once we drop page lock,
1530                 * so we might loop indefinitely; also, that page could
1531                 * not be swapped out to other storage meanwhile.  So:
1532                 * delete from cache even if there's another reference,
1533                 * after ensuring that the data has been saved to disk -
1534                 * since if the reference remains (rarer), it will be
1535                 * read from disk into another page.  Splitting into two
1536                 * pages would be incorrect if swap supported "shared
1537                 * private" pages, but they are handled by tmpfs files.
1538                 *
1539                 * Given how unuse_vma() targets one particular offset
1540                 * in an anon_vma, once the anon_vma has been determined,
1541                 * this splitting happens to be just what is needed to
1542                 * handle where KSM pages have been swapped out: re-reading
1543                 * is unnecessarily slow, but we can fix that later on.
1544                 */
1545                if (swap_count(*swap_map) &&
1546                     PageDirty(page) && PageSwapCache(page)) {
1547                        struct writeback_control wbc = {
1548                                .sync_mode = WB_SYNC_NONE,
1549                        };
1550
1551                        swap_writepage(page, &wbc);
1552                        lock_page(page);
1553                        wait_on_page_writeback(page);
1554                }
1555
1556                /*
1557                 * It is conceivable that a racing task removed this page from
1558                 * swap cache just before we acquired the page lock at the top,
1559                 * or while we dropped it in unuse_mm().  The page might even
1560                 * be back in swap cache on another swap area: that we must not
1561                 * delete, since it may not have been written out to swap yet.
1562                 */
1563                if (PageSwapCache(page) &&
1564                    likely(page_private(page) == entry.val))
1565                        delete_from_swap_cache(page);
1566
1567                /*
1568                 * So that we could skip searching mms once the swap count
1569                 * went to 1, we did not mark any present ptes as dirty: we
1570                 * must mark the page dirty so shrink_page_list() preserves it.
1571                 */
1572                SetPageDirty(page);
1573                unlock_page(page);
1574                put_page(page);
1575
1576                /*
1577                 * Make sure that we aren't completely killing
1578                 * interactive performance.
1579                 */
1580                cond_resched();
1581                if (frontswap && pages_to_unuse > 0) {
1582                        if (!--pages_to_unuse)
1583                                break;
1584                }
1585        }
1586
1587        mmput(start_mm);
1588        return retval;
1589}
1590
1591/*
1592 * After a successful try_to_unuse, if no swap is now in use, we know
1593 * we can empty the mmlist.  swap_lock must be held on entry and exit.
1594 * Note that mmlist_lock nests inside swap_lock, and an mm must be
1595 * added to the mmlist just after page_duplicate - before would be racy.
1596 */
1597static void drain_mmlist(void)
1598{
1599        struct list_head *p, *next;
1600        unsigned int type;
1601
1602        for (type = 0; type < nr_swapfiles; type++)
1603                if (swap_info[type]->inuse_pages)
1604                        return;
1605        spin_lock(&mmlist_lock);
1606        list_for_each_safe(p, next, &init_mm.mmlist)
1607                list_del_init(p);
1608        spin_unlock(&mmlist_lock);
1609}
1610
1611/*
1612 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
1613 * corresponds to the page offset of the specified swap entry.
1614 * Note that the return type of this function is sector_t, but it returns
1615 * the page offset into the bdev, not a sector offset.
1616 */
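/*
 * Purely illustrative example (assumed numbers, not from any real setup):
 * with a cached extent of { start_page = 100, nr_pages = 50,
 * start_block = 4096 }, a swap entry at page offset 120 falls inside that
 * extent and maps to block 4096 + (120 - 100) = 4116.
 */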
1617static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1618{
1619        struct swap_info_struct *sis;
1620        struct swap_extent *start_se;
1621        struct swap_extent *se;
1622        pgoff_t offset;
1623
1624        sis = swap_info[swp_type(entry)];
1625        *bdev = sis->bdev;
1626
1627        offset = swp_offset(entry);
1628        start_se = sis->curr_swap_extent;
1629        se = start_se;
1630
1631        for ( ; ; ) {
1632                if (se->start_page <= offset &&
1633                                offset < (se->start_page + se->nr_pages)) {
1634                        return se->start_block + (offset - se->start_page);
1635                }
1636                se = list_next_entry(se, list);
1637                sis->curr_swap_extent = se;
1638                BUG_ON(se == start_se);         /* It *must* be present */
1639        }
1640}
1641
1642/*
1643 * Returns the page offset into bdev for the specified page's swap entry.
1644 */
1645sector_t map_swap_page(struct page *page, struct block_device **bdev)
1646{
1647        swp_entry_t entry;
1648        entry.val = page_private(page);
1649        return map_swap_entry(entry, bdev);
1650}
1651
1652/*
1653 * Free all of a swapdev's extent information
1654 */
1655static void destroy_swap_extents(struct swap_info_struct *sis)
1656{
1657        while (!list_empty(&sis->first_swap_extent.list)) {
1658                struct swap_extent *se;
1659
1660                se = list_first_entry(&sis->first_swap_extent.list,
1661                                struct swap_extent, list);
1662                list_del(&se->list);
1663                kfree(se);
1664        }
1665
1666        if (sis->flags & SWP_FILE) {
1667                struct file *swap_file = sis->swap_file;
1668                struct address_space *mapping = swap_file->f_mapping;
1669
1670                sis->flags &= ~SWP_FILE;
1671                mapping->a_ops->swap_deactivate(swap_file);
1672        }
1673}
1674
1675/*
1676 * Add a block range (and the corresponding page range) into this swapdev's
1677 * extent list.  The extent list is kept sorted in page order.
1678 *
1679 * This function rather assumes that it is called in ascending page order.
1680 */
1681int
1682add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1683                unsigned long nr_pages, sector_t start_block)
1684{
1685        struct swap_extent *se;
1686        struct swap_extent *new_se;
1687        struct list_head *lh;
1688
1689        if (start_page == 0) {
1690                se = &sis->first_swap_extent;
1691                sis->curr_swap_extent = se;
1692                se->start_page = 0;
1693                se->nr_pages = nr_pages;
1694                se->start_block = start_block;
1695                return 1;
1696        } else {
1697                lh = sis->first_swap_extent.list.prev;  /* Highest extent */
1698                se = list_entry(lh, struct swap_extent, list);
1699                BUG_ON(se->start_page + se->nr_pages != start_page);
1700                if (se->start_block + se->nr_pages == start_block) {
1701                        /* Merge it */
1702                        se->nr_pages += nr_pages;
1703                        return 0;
1704                }
1705        }
1706
1707        /*
1708         * No merge.  Insert a new extent, preserving ordering.
1709         */
1710        new_se = kmalloc(sizeof(*se), GFP_KERNEL);
1711        if (new_se == NULL)
1712                return -ENOMEM;
1713        new_se->start_page = start_page;
1714        new_se->nr_pages = nr_pages;
1715        new_se->start_block = start_block;
1716
1717        list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1718        return 1;
1719}
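/*
 * Example of the merge rule above (illustrative numbers): after
 * add_swap_extent(sis, 0, 256, 8192), a further call
 * add_swap_extent(sis, 256, 256, 8448) finds that
 * se->start_block + se->nr_pages == start_block (8192 + 256 == 8448),
 * so the existing extent simply grows to nr_pages = 512 and no new
 * swap_extent needs to be allocated.
 */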
1720
1721/*
1722 * A `swap extent' is a simple thing which maps a contiguous range of pages
1723 * onto a contiguous range of disk blocks.  An ordered list of swap extents
1724 * is built at swapon time and is then used at swap_writepage/swap_readpage
1725 * time for locating where on disk a page belongs.
1726 *
1727 * If the swapfile is an S_ISBLK block device, a single extent is installed.
1728 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
1729 * swap files identically.
1730 *
1731 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
1732 * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
1733 * swapfiles are handled *identically* after swapon time.
1734 *
1735 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
1736 * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
1737 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
1738 * requirements, they are simply tossed out - we will never use those blocks
1739 * for swapping.
1740 *
1741 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon.  This
1742 * prevents root from shooting her foot off by ftruncating an in-use swapfile,
1743 * which will scribble on the fs.
1744 *
1745 * The amount of disk space which a single swap extent represents varies.
1746 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
1747 * extents in the list.  To avoid much list walking, we cache the previous
1748 * search location in `curr_swap_extent', and start new searches from there.
1749 * This is extremely effective.  The average number of iterations in
1750 * map_swap_page() has been measured at about 0.3 per page.  - akpm.
1751 */
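/*
 * Purely illustrative picture of such an extent list for a fragmented
 * S_ISREG swapfile (assumed numbers):
 *
 *   { start_page = 0,    nr_pages = 1024, start_block = 81920  }
 *   { start_page = 1024, nr_pages = 512,  start_block = 204800 }
 *   { start_page = 1536, nr_pages = 1024, start_block = 40960  }
 *
 * The page ranges are contiguous and sorted; the block ranges need not be,
 * each extent only guarantees that its own run of PAGE_SIZE blocks is
 * physically contiguous.
 */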
1752static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1753{
1754        struct file *swap_file = sis->swap_file;
1755        struct address_space *mapping = swap_file->f_mapping;
1756        struct inode *inode = mapping->host;
1757        int ret;
1758
1759        if (S_ISBLK(inode->i_mode)) {
1760                ret = add_swap_extent(sis, 0, sis->max, 0);
1761                *span = sis->pages;
1762                return ret;
1763        }
1764
1765        if (mapping->a_ops->swap_activate) {
1766                ret = mapping->a_ops->swap_activate(sis, swap_file, span);
1767                if (!ret) {
1768                        sis->flags |= SWP_FILE;
1769                        ret = add_swap_extent(sis, 0, sis->max, 0);
1770                        *span = sis->pages;
1771                }
1772                return ret;
1773        }
1774
1775        return generic_swapfile_activate(sis, swap_file, span);
1776}
1777
1778static void _enable_swap_info(struct swap_info_struct *p, int prio,
1779                                unsigned char *swap_map,
1780                                struct swap_cluster_info *cluster_info)
1781{
1782        if (prio >= 0)
1783                p->prio = prio;
1784        else
1785                p->prio = --least_priority;
1786        /*
1787         * the plist prio is negated because plist ordering is
1788         * low-to-high, while swap ordering is high-to-low
1789         */
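        /*
         * For example (illustrative priorities): swap areas with prio 10
         * and prio 5 get plist prios -10 and -5, so plist's low-to-high
         * traversal visits the prio 10 area first, as intended.
         */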
1790        p->list.prio = -p->prio;
1791        p->avail_list.prio = -p->prio;
1792        p->swap_map = swap_map;
1793        p->cluster_info = cluster_info;
1794        p->flags |= SWP_WRITEOK;
1795        atomic_long_add(p->pages, &nr_swap_pages);
1796        total_swap_pages += p->pages;
1797
1798        assert_spin_locked(&swap_lock);
1799        /*
1800         * both lists are plists, and thus priority ordered.
1801         * swap_active_head needs to be priority ordered for swapoff(),
1802         * which on removal of any swap_info_struct with an auto-assigned
1803         * (i.e. negative) priority increments the auto-assigned priority
1804         * of any lower-priority swap_info_structs.
1805         * swap_avail_head needs to be priority ordered for get_swap_page(),
1806         * which allocates swap pages from the highest available priority
1807         * swap_info_struct.
1808         */
1809        plist_add(&p->list, &swap_active_head);
1810        spin_lock(&swap_avail_lock);
1811        plist_add(&p->avail_list, &swap_avail_head);
1812        spin_unlock(&swap_avail_lock);
1813}
1814
1815static void enable_swap_info(struct swap_info_struct *p, int prio,
1816                                unsigned char *swap_map,
1817                                struct swap_cluster_info *cluster_info,
1818                                unsigned long *frontswap_map)
1819{
1820        frontswap_init(p->type, frontswap_map);
1821        spin_lock(&swap_lock);
1822        spin_lock(&p->lock);
1823        _enable_swap_info(p, prio, swap_map, cluster_info);
1824        spin_unlock(&p->lock);
1825        spin_unlock(&swap_lock);
1826}
1827
1828static void reinsert_swap_info(struct swap_info_struct *p)
1829{
1830        spin_lock(&swap_lock);
1831        spin_lock(&p->lock);
1832        _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
1833        spin_unlock(&p->lock);
1834        spin_unlock(&swap_lock);
1835}
1836
1837SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1838{
1839        struct swap_info_struct *p = NULL;
1840        unsigned char *swap_map;
1841        struct swap_cluster_info *cluster_info;
1842        unsigned long *frontswap_map;
1843        struct file *swap_file, *victim;
1844        struct address_space *mapping;
1845        struct inode *inode;
1846        struct filename *pathname;
1847        int err, found = 0;
1848        unsigned int old_block_size;
1849
1850        if (!capable(CAP_SYS_ADMIN))
1851                return -EPERM;
1852
1853        BUG_ON(!current->mm);
1854
1855        pathname = getname(specialfile);
1856        if (IS_ERR(pathname))
1857                return PTR_ERR(pathname);
1858
1859        victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
1860        err = PTR_ERR(victim);
1861        if (IS_ERR(victim))
1862                goto out;
1863
1864        mapping = victim->f_mapping;
1865        spin_lock(&swap_lock);
1866        plist_for_each_entry(p, &swap_active_head, list) {
1867                if (p->flags & SWP_WRITEOK) {
1868                        if (p->swap_file->f_mapping == mapping) {
1869                                found = 1;
1870                                break;
1871                        }
1872                }
1873        }
1874        if (!found) {
1875                err = -EINVAL;
1876                spin_unlock(&swap_lock);
1877                goto out_dput;
1878        }
1879        if (!security_vm_enough_memory_mm(current->mm, p->pages))
1880                vm_unacct_memory(p->pages);
1881        else {
1882                err = -ENOMEM;
1883                spin_unlock(&swap_lock);
1884                goto out_dput;
1885        }
1886        spin_lock(&swap_avail_lock);
1887        plist_del(&p->avail_list, &swap_avail_head);
1888        spin_unlock(&swap_avail_lock);
1889        spin_lock(&p->lock);
1890        if (p->prio < 0) {
1891                struct swap_info_struct *si = p;
1892
1893                plist_for_each_entry_continue(si, &swap_active_head, list) {
1894                        si->prio++;
1895                        si->list.prio--;
1896                        si->avail_list.prio--;
1897                }
1898                least_priority++;
1899        }
1900        plist_del(&p->list, &swap_active_head);
1901        atomic_long_sub(p->pages, &nr_swap_pages);
1902        total_swap_pages -= p->pages;
1903        p->flags &= ~SWP_WRITEOK;
1904        spin_unlock(&p->lock);
1905        spin_unlock(&swap_lock);
1906
1907        set_current_oom_origin();
1908        err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
1909        clear_current_oom_origin();
1910
1911        if (err) {
1912                /* re-insert swap space into the swap lists */
1913                reinsert_swap_info(p);
1914                goto out_dput;
1915        }
1916
1917        flush_work(&p->discard_work);
1918
1919        destroy_swap_extents(p);
1920        if (p->flags & SWP_CONTINUED)
1921                free_swap_count_continuations(p);
1922
1923        mutex_lock(&swapon_mutex);
1924        spin_lock(&swap_lock);
1925        spin_lock(&p->lock);
1926        drain_mmlist();
1927
1928        /* wait for anyone still in scan_swap_map */
1929        p->highest_bit = 0;             /* cuts scans short */
1930        while (p->flags >= SWP_SCANNING) {
1931                spin_unlock(&p->lock);
1932                spin_unlock(&swap_lock);
1933                schedule_timeout_uninterruptible(1);
1934                spin_lock(&swap_lock);
1935                spin_lock(&p->lock);
1936        }
1937
1938        swap_file = p->swap_file;
1939        old_block_size = p->old_block_size;
1940        p->swap_file = NULL;
1941        p->max = 0;
1942        swap_map = p->swap_map;
1943        p->swap_map = NULL;
1944        cluster_info = p->cluster_info;
1945        p->cluster_info = NULL;
1946        frontswap_map = frontswap_map_get(p);
1947        spin_unlock(&p->lock);
1948        spin_unlock(&swap_lock);
1949        frontswap_invalidate_area(p->type);
1950        frontswap_map_set(p, NULL);
1951        mutex_unlock(&swapon_mutex);
1952        free_percpu(p->percpu_cluster);
1953        p->percpu_cluster = NULL;
1954        vfree(swap_map);
1955        vfree(cluster_info);
1956        vfree(frontswap_map);
1957        /* Destroy swap account information */
1958        swap_cgroup_swapoff(p->type);
1959
1960        inode = mapping->host;
1961        if (S_ISBLK(inode->i_mode)) {
1962                struct block_device *bdev = I_BDEV(inode);
1963                set_blocksize(bdev, old_block_size);
1964                blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1965        } else {
1966                inode_lock(inode);
1967                inode->i_flags &= ~S_SWAPFILE;
1968                inode_unlock(inode);
1969        }
1970        filp_close(swap_file, NULL);
1971
1972        /*
1973         * Clear the SWP_USED flag after all resources are freed so that swapon
1974         * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
1975         * not hold p->lock after we cleared its SWP_WRITEOK.
1976         */
1977        spin_lock(&swap_lock);
1978        p->flags = 0;
1979        spin_unlock(&swap_lock);
1980
1981        err = 0;
1982        atomic_inc(&proc_poll_event);
1983        wake_up_interruptible(&proc_poll_wait);
1984
1985out_dput:
1986        filp_close(victim, NULL);
1987out:
1988        putname(pathname);
1989        return err;
1990}
1991
1992#ifdef CONFIG_PROC_FS
1993static unsigned swaps_poll(struct file *file, poll_table *wait)
1994{
1995        struct seq_file *seq = file->private_data;
1996
1997        poll_wait(file, &proc_poll_wait, wait);
1998
1999        if (seq->poll_event != atomic_read(&proc_poll_event)) {
2000                seq->poll_event = atomic_read(&proc_poll_event);
2001                return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
2002        }
2003
2004        return POLLIN | POLLRDNORM;
2005}
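
/*
 * A userspace watcher can rely on this: e.g. (assumed usage) open
 * /proc/swaps, poll() it for POLLERR | POLLPRI, and re-read the file
 * whenever those bits are signalled to pick up swapon/swapoff changes.
 */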
2006
2007/* iterator */
2008static void *swap_start(struct seq_file *swap, loff_t *pos)
2009{
2010        struct swap_info_struct *si;
2011        int type;
2012        loff_t l = *pos;
2013
2014        mutex_lock(&swapon_mutex);
2015
2016        if (!l)
2017                return SEQ_START_TOKEN;
2018
2019        for (type = 0; type < nr_swapfiles; type++) {
2020                smp_rmb();      /* read nr_swapfiles before swap_info[type] */
2021                si = swap_info[type];
2022                if (!(si->flags & SWP_USED) || !si->swap_map)
2023                        continue;
2024                if (!--l)
2025                        return si;
2026        }
2027
2028        return NULL;
2029}
2030
2031static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
2032{
2033        struct swap_info_struct *si = v;
2034        int type;
2035
2036        if (v == SEQ_START_TOKEN)
2037                type = 0;
2038        else
2039                type = si->type + 1;
2040
2041        for (; type < nr_swapfiles; type++) {
2042                smp_rmb();      /* read nr_swapfiles before swap_info[type] */
2043                si = swap_info[type];
2044                if (!(si->flags & SWP_USED) || !si->swap_map)
2045                        continue;
2046                ++*pos;
2047                return si;
2048        }
2049
2050        return NULL;
2051}
2052
2053static void swap_stop(struct seq_file *swap, void *v)
2054{
2055        mutex_unlock(&swapon_mutex);
2056}
2057
2058static int swap_show(struct seq_file *swap, void *v)
2059{
2060        struct swap_info_struct *si = v;
2061        struct file *file;
2062        int len;
2063
2064        if (si == SEQ_START_TOKEN) {
2065                seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
2066                return 0;
2067        }
2068
2069        file = si->swap_file;
2070        len = seq_file_path(swap, file, " \t\n\\");
2071        seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
2072                        len < 40 ? 40 - len : 1, " ",
2073                        S_ISBLK(file_inode(file)->i_mode) ?
2074                                "partition" : "file\t",
2075                        si->pages << (PAGE_SHIFT - 10),
2076                        si->inuse_pages << (PAGE_SHIFT - 10),
2077                        si->prio);
2078        return 0;
2079}
2080
2081static const struct seq_operations swaps_op = {
2082        .start =        swap_start,
2083        .next =         swap_next,
2084        .stop =         swap_stop,
2085        .show =         swap_show
2086};
2087
2088static int swaps_open(struct inode *inode, struct file *file)
2089{
2090        struct seq_file *seq;
2091        int ret;
2092
2093        ret = seq_open(file, &swaps_op);
2094        if (ret)
2095                return ret;
2096
2097        seq = file->private_data;
2098        seq->poll_event = atomic_read(&proc_poll_event);
2099        return 0;
2100}
2101
2102static const struct file_operations proc_swaps_operations = {
2103        .open           = swaps_open,
2104        .read           = seq_read,
2105        .llseek         = seq_lseek,
2106        .release        = seq_release,
2107        .poll           = swaps_poll,
2108};
2109
2110static int __init procswaps_init(void)
2111{
2112        proc_create("swaps", 0, NULL, &proc_swaps_operations);
2113        return 0;
2114}
2115__initcall(procswaps_init);
2116#endif /* CONFIG_PROC_FS */
2117
2118#ifdef MAX_SWAPFILES_CHECK
2119static int __init max_swapfiles_check(void)
2120{
2121        MAX_SWAPFILES_CHECK();
2122        return 0;
2123}
2124late_initcall(max_swapfiles_check);
2125#endif
2126
2127static struct swap_info_struct *alloc_swap_info(void)
2128{
2129        struct swap_info_struct *p;
2130        unsigned int type;
2131
2132        p = kzalloc(sizeof(*p), GFP_KERNEL);
2133        if (!p)
2134                return ERR_PTR(-ENOMEM);
2135
2136        spin_lock(&swap_lock);
2137        for (type = 0; type < nr_swapfiles; type++) {
2138                if (!(swap_info[type]->flags & SWP_USED))
2139                        break;
2140        }
2141        if (type >= MAX_SWAPFILES) {
2142                spin_unlock(&swap_lock);
2143                kfree(p);
2144                return ERR_PTR(-EPERM);
2145        }
2146        if (type >= nr_swapfiles) {
2147                p->type = type;
2148                swap_info[type] = p;
2149                /*
2150                 * Write swap_info[type] before nr_swapfiles, in case a
2151                 * racing procfs swap_start() or swap_next() is reading them.
2152                 * (We never shrink nr_swapfiles, we never free this entry.)
2153                 */
2154                smp_wmb();
2155                nr_swapfiles++;
2156        } else {
2157                kfree(p);
2158                p = swap_info[type];
2159                /*
2160                 * Do not memset this entry: a racing procfs swap_next()
2161                 * would be relying on p->type to remain valid.
2162                 */
2163        }
2164        INIT_LIST_HEAD(&p->first_swap_extent.list);
2165        plist_node_init(&p->list, 0);
2166        plist_node_init(&p->avail_list, 0);
2167        p->flags = SWP_USED;
2168        spin_unlock(&swap_lock);
2169        spin_lock_init(&p->lock);
2170
2171        return p;
2172}
2173
2174static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
2175{
2176        int error;
2177
2178        if (S_ISBLK(inode->i_mode)) {
2179                p->bdev = bdgrab(I_BDEV(inode));
2180                error = blkdev_get(p->bdev,
2181                                   FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
2182                if (error < 0) {
2183                        p->bdev = NULL;
2184                        return error;
2185                }
2186                p->old_block_size = block_size(p->bdev);
2187                error = set_blocksize(p->bdev, PAGE_SIZE);
2188                if (error < 0)
2189                        return error;
2190                p->flags |= SWP_BLKDEV;
2191        } else if (S_ISREG(inode->i_mode)) {
2192                p->bdev = inode->i_sb->s_bdev;
2193                inode_lock(inode);
2194                if (IS_SWAPFILE(inode))
2195                        return -EBUSY;
2196        } else
2197                return -EINVAL;
2198
2199        return 0;
2200}
2201
2202static unsigned long read_swap_header(struct swap_info_struct *p,
2203                                        union swap_header *swap_header,
2204                                        struct inode *inode)
2205{
2206        int i;
2207        unsigned long maxpages;
2208        unsigned long swapfilepages;
2209        unsigned long last_page;
2210
2211        if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
2212                pr_err("Unable to find swap-space signature\n");
2213                return 0;
2214        }
2215
2216        /* swap partition endianness hack... */
2217        if (swab32(swap_header->info.version) == 1) {
2218                swab32s(&swap_header->info.version);
2219                swab32s(&swap_header->info.last_page);
2220                swab32s(&swap_header->info.nr_badpages);
2221                for (i = 0; i < swap_header->info.nr_badpages; i++)
2222                        swab32s(&swap_header->info.badpages[i]);
2223        }
2224        /* Check the swap header's sub-version */
2225        if (swap_header->info.version != 1) {
2226                pr_warn("Unable to handle swap header version %d\n",
2227                        swap_header->info.version);
2228                return 0;
2229        }
2230
2231        p->lowest_bit  = 1;
2232        p->cluster_next = 1;
2233        p->cluster_nr = 0;
2234
2235        /*
2236         * Find out how many pages are allowed for a single swap
2237         * device. There are two limiting factors: 1) the number
2238         * of bits for the swap offset in the swp_entry_t type, and
2239         * 2) the number of bits in the swap pte as defined by the
2240         * different architectures. In order to find the
2241         * largest possible bit mask, a swap entry with swap type 0
2242         * and swap offset ~0UL is created, encoded to a swap pte,
2243         * decoded to a swp_entry_t again, and finally the swap
2244         * offset is extracted. This will mask all the bits from
2245         * the initial ~0UL mask that can't be encoded in either
2246         * the swp_entry_t or the architecture definition of a
2247         * swap pte.
2248         */
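        /*
         * Illustrative example (assumed architecture): if the swap pte
         * could only encode 32 bits of offset, the round trip below would
         * turn the ~0UL offset into 0xffffffff, giving maxpages = 2^32.
         */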
2249        maxpages = swp_offset(pte_to_swp_entry(
2250                        swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
2251        last_page = swap_header->info.last_page;
2252        if (last_page > maxpages) {
2253                pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
2254                        maxpages << (PAGE_SHIFT - 10),
2255                        last_page << (PAGE_SHIFT - 10));
2256        }
2257        if (maxpages > last_page) {
2258                maxpages = last_page + 1;
2259                /* p->max is an unsigned int: don't overflow it */
2260                if ((unsigned int)maxpages == 0)
2261                        maxpages = UINT_MAX;
2262        }
2263        p->highest_bit = maxpages - 1;
2264
2265        if (!maxpages)
2266                return 0;
2267        swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
2268        if (swapfilepages && maxpages > swapfilepages) {
2269                pr_warn("Swap area shorter than signature indicates\n");
2270                return 0;
2271        }
2272        if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
2273                return 0;
2274        if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2275                return 0;
2276
2277        return maxpages;
2278}
2279
2280static int setup_swap_map_and_extents(struct swap_info_struct *p,
2281                                        union swap_header *swap_header,
2282                                        unsigned char *swap_map,
2283                                        struct swap_cluster_info *cluster_info,
2284                                        unsigned long maxpages,
2285                                        sector_t *span)
2286{
2287        int i;
2288        unsigned int nr_good_pages;
2289        int nr_extents;
2290        unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
2291        unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER;
2292
2293        nr_good_pages = maxpages - 1;   /* omit header page */
2294
2295        cluster_set_null(&p->free_cluster_head);
2296        cluster_set_null(&p->free_cluster_tail);
2297        cluster_set_null(&p->discard_cluster_head);
2298        cluster_set_null(&p->discard_cluster_tail);
2299
2300        for (i = 0; i < swap_header->info.nr_badpages; i++) {
2301                unsigned int page_nr = swap_header->info.badpages[i];
2302                if (page_nr == 0 || page_nr > swap_header->info.last_page)
2303                        return -EINVAL;
2304                if (page_nr < maxpages) {
2305                        swap_map[page_nr] = SWAP_MAP_BAD;
2306                        nr_good_pages--;
2307                        /*
2308                         * Haven't marked the cluster free yet, no list
2309                         * operation involved
2310                         */
2311                        inc_cluster_info_page(p, cluster_info, page_nr);
2312                }
2313        }
2314
2315        /* Haven't marked the cluster free yet, no list operation involved */
2316        for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
2317                inc_cluster_info_page(p, cluster_info, i);
2318
2319        if (nr_good_pages) {
2320                swap_map[0] = SWAP_MAP_BAD;
2321                /*
2322                 * Haven't marked the cluster free yet, no list
2323                 * operation involved
2324                 */
2325                inc_cluster_info_page(p, cluster_info, 0);
2326                p->max = maxpages;
2327                p->pages = nr_good_pages;
2328                nr_extents = setup_swap_extents(p, span);
2329                if (nr_extents < 0)
2330                        return nr_extents;
2331                nr_good_pages = p->pages;
2332        }
2333        if (!nr_good_pages) {
2334                pr_warn("Empty swap-file\n");
2335                return -EINVAL;
2336        }
2337
2338        if (!cluster_info)
2339                return nr_extents;
2340
2341        for (i = 0; i < nr_clusters; i++) {
2342                if (!cluster_count(&cluster_info[idx])) {
2343                        cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
2344                        if (cluster_is_null(&p->free_cluster_head)) {
2345                                cluster_set_next_flag(&p->free_cluster_head,
2346                                                                idx, 0);
2347                                cluster_set_next_flag(&p->free_cluster_tail,
2348                                                                idx, 0);
2349                        } else {
2350                                unsigned int tail;
2351
2352                                tail = cluster_next(&p->free_cluster_tail);
2353                                cluster_set_next(&cluster_info[tail], idx);
2354                                cluster_set_next_flag(&p->free_cluster_tail,
2355                                                                idx, 0);
2356                        }
2357                }
2358                idx++;
2359                if (idx == nr_clusters)
2360                        idx = 0;
2361        }
2362        return nr_extents;
2363}
2364
2365/*
2366 * Helper for sys_swapon to determine whether a given swap
2367 * backing device's queue supports DISCARD operations.
2368 */
2369static bool swap_discardable(struct swap_info_struct *si)
2370{
2371        struct request_queue *q = bdev_get_queue(si->bdev);
2372
2373        if (!q || !blk_queue_discard(q))
2374                return false;
2375
2376        return true;
2377}
2378
2379SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2380{
2381        struct swap_info_struct *p;
2382        struct filename *name;
2383        struct file *swap_file = NULL;
2384        struct address_space *mapping;
2385        int prio;
2386        int error;
2387        union swap_header *swap_header;
2388        int nr_extents;
2389        sector_t span;
2390        unsigned long maxpages;
2391        unsigned char *swap_map = NULL;
2392        struct swap_cluster_info *cluster_info = NULL;
2393        unsigned long *frontswap_map = NULL;
2394        struct page *page = NULL;
2395        struct inode *inode = NULL;
2396
2397        if (swap_flags & ~SWAP_FLAGS_VALID)
2398                return -EINVAL;
2399
2400        if (!capable(CAP_SYS_ADMIN))
2401                return -EPERM;
2402
2403        p = alloc_swap_info();
2404        if (IS_ERR(p))
2405                return PTR_ERR(p);
2406
2407        INIT_WORK(&p->discard_work, swap_discard_work);
2408
2409        name = getname(specialfile);
2410        if (IS_ERR(name)) {
2411                error = PTR_ERR(name);
2412                name = NULL;
2413                goto bad_swap;
2414        }
2415        swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
2416        if (IS_ERR(swap_file)) {
2417                error = PTR_ERR(swap_file);
2418                swap_file = NULL;
2419                goto bad_swap;
2420        }
2421
2422        p->swap_file = swap_file;
2423        mapping = swap_file->f_mapping;
2424        inode = mapping->host;
2425
2426        /* If S_ISREG(inode->i_mode), claim_swapfile() will do inode_lock(inode) */
2427        error = claim_swapfile(p, inode);
2428        if (unlikely(error))
2429                goto bad_swap;
2430
2431        /*
2432         * Read the swap header.
2433         */
2434        if (!mapping->a_ops->readpage) {
2435                error = -EINVAL;
2436                goto bad_swap;
2437        }
2438        page = read_mapping_page(mapping, 0, swap_file);
2439        if (IS_ERR(page)) {
2440                error = PTR_ERR(page);
2441                goto bad_swap;
2442        }
2443        swap_header = kmap(page);
2444
2445        maxpages = read_swap_header(p, swap_header, inode);
2446        if (unlikely(!maxpages)) {
2447                error = -EINVAL;
2448                goto bad_swap;
2449        }
2450
2451        /* OK, set up the swap map and apply the bad block list */
2452        swap_map = vzalloc(maxpages);
2453        if (!swap_map) {
2454                error = -ENOMEM;
2455                goto bad_swap;
2456        }
2457        if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2458                int cpu;
2459
2460                p->flags |= SWP_SOLIDSTATE;
2461                /*
2462                 * select a random position to start with, to help with
2463                 * wear leveling on SSDs
2464                 */
2465                p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
2466
2467                cluster_info = vzalloc(DIV_ROUND_UP(maxpages,
2468                        SWAPFILE_CLUSTER) * sizeof(*cluster_info));
2469                if (!cluster_info) {
2470                        error = -ENOMEM;
2471                        goto bad_swap;
2472                }
2473                p->percpu_cluster = alloc_percpu(struct percpu_cluster);
2474                if (!p->percpu_cluster) {
2475                        error = -ENOMEM;
2476                        goto bad_swap;
2477                }
2478                for_each_possible_cpu(cpu) {
2479                        struct percpu_cluster *cluster;
2480                        cluster = per_cpu_ptr(p->percpu_cluster, cpu);
2481                        cluster_set_null(&cluster->index);
2482                }
2483        }
2484
2485        error = swap_cgroup_swapon(p->type, maxpages);
2486        if (error)
2487                goto bad_swap;
2488
2489        nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2490                cluster_info, maxpages, &span);
2491        if (unlikely(nr_extents < 0)) {
2492                error = nr_extents;
2493                goto bad_swap;
2494        }
2495        /* frontswap enabled? set up bit-per-page map for frontswap */
2496        if (frontswap_enabled)
2497                frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
2498
2499        if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
2500                /*
2501                 * When discard is enabled for swap with no particular
2502                 * policy flagged, we set all swap discard flags here in
2503                 * order to sustain backward compatibility with older
2504                 * swapon(8) releases.
2505                 */
2506                p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
2507                             SWP_PAGE_DISCARD);
2508
2509                /*
2510                 * Via the sys_swapon swap_flags, a sysadmin can tell us to
2511                 * either do single-time area discards only, or to just
2512                 * perform discards for released swap page-clusters.
2513                 * Now it's time to adjust the p->flags accordingly.
2514                 */
2515                if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
2516                        p->flags &= ~SWP_PAGE_DISCARD;
2517                else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
2518                        p->flags &= ~SWP_AREA_DISCARD;
2519
2520                /* issue a swapon-time discard if it's still required */
2521                if (p->flags & SWP_AREA_DISCARD) {
2522                        int err = discard_swap(p);
2523                        if (unlikely(err))
2524                                pr_err("swapon: discard_swap(%p): %d\n",
2525                                        p, err);
2526                }
2527        }
2528
2529        mutex_lock(&swapon_mutex);
2530        prio = -1;
2531        if (swap_flags & SWAP_FLAG_PREFER)
2532                prio =
2533                  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2534        enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
2535
2536        pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
2537                p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
2538                nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2539                (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2540                (p->flags & SWP_DISCARDABLE) ? "D" : "",
2541                (p->flags & SWP_AREA_DISCARD) ? "s" : "",
2542                (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
2543                (frontswap_map) ? "FS" : "");
2544
2545        mutex_unlock(&swapon_mutex);
2546        atomic_inc(&proc_poll_event);
2547        wake_up_interruptible(&proc_poll_wait);
2548
2549        if (S_ISREG(inode->i_mode))
2550                inode->i_flags |= S_SWAPFILE;
2551        error = 0;
2552        goto out;
2553bad_swap:
2554        free_percpu(p->percpu_cluster);
2555        p->percpu_cluster = NULL;
2556        if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2557                set_blocksize(p->bdev, p->old_block_size);
2558                blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2559        }
2560        destroy_swap_extents(p);
2561        swap_cgroup_swapoff(p->type);
2562        spin_lock(&swap_lock);
2563        p->swap_file = NULL;
2564        p->flags = 0;
2565        spin_unlock(&swap_lock);
2566        vfree(swap_map);
2567        vfree(cluster_info);
2568        if (swap_file) {
2569                if (inode && S_ISREG(inode->i_mode)) {
2570                        inode_unlock(inode);
2571                        inode = NULL;
2572                }
2573                filp_close(swap_file, NULL);
2574        }
2575out:
2576        if (page && !IS_ERR(page)) {
2577                kunmap(page);
2578                put_page(page);
2579        }
2580        if (name)
2581                putname(name);
2582        if (inode && S_ISREG(inode->i_mode))
2583                inode_unlock(inode);
2584        return error;
2585}
2586
2587void si_swapinfo(struct sysinfo *val)
2588{
2589        unsigned int type;
2590        unsigned long nr_to_be_unused = 0;
2591
2592        spin_lock(&swap_lock);
2593        for (type = 0; type < nr_swapfiles; type++) {
2594                struct swap_info_struct *si = swap_info[type];
2595
2596                if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2597                        nr_to_be_unused += si->inuse_pages;
2598        }
2599        val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
2600        val->totalswap = total_swap_pages + nr_to_be_unused;
2601        spin_unlock(&swap_lock);
2602}
2603
2604/*
2605 * Verify that a swap entry is valid and increment its swap map count.
2606 *
2607 * Return values:
2608 * - success -> 0
2609 * - swp_entry is invalid -> EINVAL
2610 * - swp_entry is a migration entry -> EINVAL
2611 * - swap-cache reference is requested but there is already one. -> EEXIST
2612 * - swap-cache reference is requested but the entry is not used. -> ENOENT
2613 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
2614 */
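/*
 * The usage argument is chosen by the wrappers below: 1 for an ordinary
 * swap_duplicate() reference, SWAP_MAP_SHMEM for shmem/tmpfs entries, and
 * SWAP_HAS_CACHE when a swap cache page is being set up.
 */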
2615static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2616{
2617        struct swap_info_struct *p;
2618        unsigned long offset, type;
2619        unsigned char count;
2620        unsigned char has_cache;
2621        int err = -EINVAL;
2622
2623        if (non_swap_entry(entry))
2624                goto out;
2625
2626        type = swp_type(entry);
2627        if (type >= nr_swapfiles)
2628                goto bad_file;
2629        p = swap_info[type];
2630        offset = swp_offset(entry);
2631
2632        spin_lock(&p->lock);
2633        if (unlikely(offset >= p->max))
2634                goto unlock_out;
2635
2636        count = p->swap_map[offset];
2637
2638        /*
2639         * swapin_readahead() doesn't check if a swap entry is valid, so the
2640         * swap entry could be SWAP_MAP_BAD. Check here with lock held.
2641         */
2642        if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
2643                err = -ENOENT;
2644                goto unlock_out;
2645        }
2646
2647        has_cache = count & SWAP_HAS_CACHE;
2648        count &= ~SWAP_HAS_CACHE;
2649        err = 0;
2650
2651        if (usage == SWAP_HAS_CACHE) {
2652
2653                /* set SWAP_HAS_CACHE if there is no cache and entry is used */
2654                if (!has_cache && count)
2655                        has_cache = SWAP_HAS_CACHE;
2656                else if (has_cache)             /* someone else added cache */
2657                        err = -EEXIST;
2658                else                            /* no users remaining */
2659                        err = -ENOENT;
2660
2661        } else if (count || has_cache) {
2662
2663                if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2664                        count += usage;
2665                else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2666                        err = -EINVAL;
2667                else if (swap_count_continued(p, offset, count))
2668                        count = COUNT_CONTINUED;
2669                else
2670                        err = -ENOMEM;
2671        } else
2672                err = -ENOENT;                  /* unused swap entry */
2673
2674        p->swap_map[offset] = count | has_cache;
2675
2676unlock_out:
2677        spin_unlock(&p->lock);
2678out:
2679        return err;
2680
2681bad_file:
2682        pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
2683        goto out;
2684}
2685
2686/*
2687 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
2688 * (in which case its reference count is never incremented).
2689 */
2690void swap_shmem_alloc(swp_entry_t entry)
2691{
2692        __swap_duplicate(entry, SWAP_MAP_SHMEM);
2693}
2694
2695/*
2696 * Increase reference count of swap entry by 1.
2697 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
2698 * but could not be atomically allocated.  Returns 0, just as if it succeeded,
2699 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
2700 * might occur if a page table entry has got corrupted.
2701 */
2702int swap_duplicate(swp_entry_t entry)
2703{
2704        int err = 0;
2705
2706        while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2707                err = add_swap_count_continuation(entry, GFP_ATOMIC);
2708        return err;
2709}
2710
2711/*
2712 * @entry: swap entry for which we allocate swap cache.
2713 *
2714 * Called when allocating swap cache for an existing swap entry.
2715 * This can return error codes; it returns 0 on success.
2716 * -EEXIST means there is already a swap cache for this entry.
2717 * Note: return code is different from swap_duplicate().
2718 */
2719int swapcache_prepare(swp_entry_t entry)
2720{
2721        return __swap_duplicate(entry, SWAP_HAS_CACHE);
2722}
2723
2724struct swap_info_struct *page_swap_info(struct page *page)
2725{
2726        swp_entry_t swap = { .val = page_private(page) };
2727        BUG_ON(!PageSwapCache(page));
2728        return swap_info[swp_type(swap)];
2729}
2730
2731/*
2732 * out-of-line __page_file_ methods to avoid include hell.
2733 */
2734struct address_space *__page_file_mapping(struct page *page)
2735{
2736        VM_BUG_ON_PAGE(!PageSwapCache(page), page);
2737        return page_swap_info(page)->swap_file->f_mapping;
2738}
2739EXPORT_SYMBOL_GPL(__page_file_mapping);
2740
2741pgoff_t __page_file_index(struct page *page)
2742{
2743        swp_entry_t swap = { .val = page_private(page) };
2744        VM_BUG_ON_PAGE(!PageSwapCache(page), page);
2745        return swp_offset(swap);
2746}
2747EXPORT_SYMBOL_GPL(__page_file_index);
2748
2749/*
2750 * add_swap_count_continuation - called when a swap count is duplicated
2751 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
2752 * page of the original vmalloc'ed swap_map, to hold the continuation count
2753 * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
2754 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
2755 *
2756 * These continuation pages are seldom referenced: the common paths all work
2757 * on the original swap_map, only referring to a continuation page when the
2758 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
2759 *
2760 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
2761 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
2762 * can be called after dropping locks.
2763 */
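/*
 * See swap_duplicate() above for the typical retry pattern: it loops while
 * __swap_duplicate() returns -ENOMEM, calling here with GFP_ATOMIC to
 * attach a continuation page before attempting the duplicate again.
 */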
2764int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2765{
2766        struct swap_info_struct *si;
2767        struct page *head;
2768        struct page *page;
2769        struct page *list_page;
2770        pgoff_t offset;
2771        unsigned char count;
2772
2773        /*
2774         * When debugging, it's easier to use __GFP_ZERO here; but it's better
2775         * for latency not to zero a page while GFP_ATOMIC and holding locks.
2776         */
2777        page = alloc_page(gfp_mask | __GFP_HIGHMEM);
2778
2779        si = swap_info_get(entry);
2780        if (!si) {
2781                /*
2782                 * An acceptable race has occurred since the failing
2783                 * __swap_duplicate(): the swap entry has been freed,
2784                 * perhaps even the whole swap_map cleared for swapoff.
2785                 */
2786                goto outer;
2787        }
2788
2789        offset = swp_offset(entry);
2790        count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
2791
2792        if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
2793                /*
2794                 * The higher the swap count, the more likely it is that tasks
2795                 * will race to add swap count continuation: we need to avoid
2796                 * over-provisioning.
2797                 */
2798                goto out;
2799        }
2800
2801        if (!page) {
2802                spin_unlock(&si->lock);
2803                return -ENOMEM;
2804        }
2805
2806        /*
2807         * We are fortunate that although vmalloc_to_page uses pte_offset_map,
2808         * no architecture is using highmem pages for kernel page tables: so it
2809         * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
2810         */
2811        head = vmalloc_to_page(si->swap_map + offset);
2812        offset &= ~PAGE_MASK;
2813
2814        /*
2815         * Page allocation does not initialize the page's lru field,
2816         * but it does always reset its private field.
2817         */
2818        if (!page_private(head)) {
2819                BUG_ON(count & COUNT_CONTINUED);
2820                INIT_LIST_HEAD(&head->lru);
2821                set_page_private(head, SWP_CONTINUED);
2822                si->flags |= SWP_CONTINUED;
2823        }
2824
2825        list_for_each_entry(list_page, &head->lru, lru) {
2826                unsigned char *map;
2827
2828                /*
2829                 * If the previous map said no continuation, but we've found
2830                 * a continuation page, free our allocation and use this one.
2831                 */
2832                if (!(count & COUNT_CONTINUED))
2833                        goto out;
2834
2835                map = kmap_atomic(list_page) + offset;
2836                count = *map;
2837                kunmap_atomic(map);
2838
2839                /*
2840                 * If this continuation count now has some space in it,
2841                 * free our allocation and use this one.
2842                 */
2843                if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
2844                        goto out;
2845        }
2846
2847        list_add_tail(&page->lru, &head->lru);
2848        page = NULL;                    /* now it's attached, don't free it */
2849out:
2850        spin_unlock(&si->lock);
2851outer:
2852        if (page)
2853                __free_page(page);
2854        return 0;
2855}
2856
2857/*
2858 * swap_count_continued - when the original swap_map count is incremented
2859 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
2860 * into, carry if so, or else fail until a new continuation page is allocated;
2861 * when the original swap_map count is decremented from 0 with continuation,
2862 * borrow from the continuation and report whether it still holds more.
2863 * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
2864 */
2865static bool swap_count_continued(struct swap_info_struct *si,
2866                                 pgoff_t offset, unsigned char count)
2867{
2868        struct page *head;
2869        struct page *page;
2870        unsigned char *map;
2871
2872        head = vmalloc_to_page(si->swap_map + offset);
2873        if (page_private(head) != SWP_CONTINUED) {
2874                BUG_ON(count & COUNT_CONTINUED);
2875                return false;           /* need to add count continuation */
2876        }
2877
2878        offset &= ~PAGE_MASK;
2879        page = list_entry(head->lru.next, struct page, lru);
2880        map = kmap_atomic(page) + offset;
2881
2882        if (count == SWAP_MAP_MAX)      /* initial increment from swap_map */
2883                goto init_map;          /* jump over SWAP_CONT_MAX checks */
2884
2885        if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
2886                /*
2887                 * Think of how you add 1 to 999
2888                 */
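                    /*
                     * Each continuation page holds one higher-order digit
                     * (0..SWAP_CONT_MAX) for this offset: skip digits that
                     * are already full and marked continued, bump the first
                     * one with room (starting a fresh digit if need be), then
                     * walk back resetting the full digits to zero with
                     * COUNT_CONTINUED set, as the 9s become 0s in 999 + 1.
                     */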
2889                while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2890                        kunmap_atomic(map);
2891                        page = list_entry(page->lru.next, struct page, lru);
2892                        BUG_ON(page == head);
2893                        map = kmap_atomic(page) + offset;
2894                }
2895                if (*map == SWAP_CONT_MAX) {
2896                        kunmap_atomic(map);
2897                        page = list_entry(page->lru.next, struct page, lru);
2898                        if (page == head)
2899                                return false;   /* add count continuation */
2900                        map = kmap_atomic(page) + offset;
2901init_map:               *map = 0;               /* we didn't zero the page */
2902                }
2903                *map += 1;
2904                kunmap_atomic(map);
2905                page = list_entry(page->lru.prev, struct page, lru);
2906                while (page != head) {
2907                        map = kmap_atomic(page) + offset;
2908                        *map = COUNT_CONTINUED;
2909                        kunmap_atomic(map);
2910                        page = list_entry(page->lru.prev, struct page, lru);
2911                }
2912                return true;                    /* incremented */
2913
2914        } else {                                /* decrementing */
2915                /*
2916                 * Think of how you subtract 1 from 1000
2917                 */
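                    /*
                     * Walk up past digits that read zero-but-continued,
                     * borrow 1 from the first non-zero digit, then walk back
                     * refilling the skipped digits with SWAP_CONT_MAX, as the
                     * 0s become 9s in 1000 - 1.
                     */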
2918                BUG_ON(count != COUNT_CONTINUED);
2919                while (*map == COUNT_CONTINUED) {
2920                        kunmap_atomic(map);
2921                        page = list_entry(page->lru.next, struct page, lru);
2922                        BUG_ON(page == head);
2923                        map = kmap_atomic(page) + offset;
2924                }
2925                BUG_ON(*map == 0);
2926                *map -= 1;
2927                if (*map == 0)
2928                        count = 0;
2929                kunmap_atomic(map);
2930                page = list_entry(page->lru.prev, struct page, lru);
2931                while (page != head) {
2932                        map = kmap_atomic(page) + offset;
2933                        *map = SWAP_CONT_MAX | count;
2934                        count = COUNT_CONTINUED;
2935                        kunmap_atomic(map);
2936                        page = list_entry(page->lru.prev, struct page, lru);
2937                }
2938                return count == COUNT_CONTINUED;
2939        }
2940}
2941
2942/*
2943 * free_swap_count_continuations - called at swapoff to free all continuation
2944 * pages appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
2945 */
2946static void free_swap_count_continuations(struct swap_info_struct *si)
2947{
2948        pgoff_t offset;
2949
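            /*
             * Each byte of swap_map covers one swap entry, so each backing
             * page of the map covers PAGE_SIZE entries: step a page at a
             * time and free any continuation pages chained on its struct
             * page.
             */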
2950        for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2951                struct page *head;
2952                head = vmalloc_to_page(si->swap_map + offset);
2953                if (page_private(head)) {
2954                        struct page *page, *next;
2955
2956                        list_for_each_entry_safe(page, next, &head->lru, lru) {
2957                                list_del(&page->lru);
2958                                __free_page(page);
2959                        }
2960                }
2961        }
2962}
2963