/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>

#include <asm/tlbflush.h>

#include "internal.h"

DEFINE_MUTEX(mem_hotplug_mutex);

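/*
 * lock_memory_hotplug()/unlock_memory_hotplug() serialize all memory hotplug
 * operations behind mem_hotplug_mutex and, while the mutex is held, also take
 * the system sleep lock so that hibernation cannot run concurrently with a
 * hotplug operation.
 */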
void lock_memory_hotplug(void)
{
        mutex_lock(&mem_hotplug_mutex);

        /* for exclusive hibernation if CONFIG_HIBERNATION=y */
        lock_system_sleep();
}

void unlock_memory_hotplug(void)
{
        unlock_system_sleep();
        mutex_unlock(&mem_hotplug_mutex);
}


/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
        struct resource *res;
        res = kzalloc(sizeof(struct resource), GFP_KERNEL);
        BUG_ON(!res);

        res->name = "System RAM";
        res->start = start;
        res->end = start + size - 1;
        res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
        if (request_resource(&iomem_resource, res) < 0) {
                printk("System RAM resource %llx - %llx cannot be added\n",
                (unsigned long long)res->start, (unsigned long long)res->end);
                kfree(res);
                res = NULL;
        }
        return res;
}

static void release_memory_resource(struct resource *res)
{
        if (!res)
                return;
        release_resource(res);
        kfree(res);
        return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
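/*
 * get_page_bootmem() stashes bootmem bookkeeping in an otherwise unused
 * memmap page: the type (SECTION_INFO, MIX_SECTION_INFO or NODE_INFO) goes
 * in page->lru.next, the section/node number goes in page_private(), and an
 * extra reference is taken so the page is only handed back to the buddy
 * allocator once put_page_bootmem() drops the last user.
 */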
static void get_page_bootmem(unsigned long info, struct page *page,
                             unsigned long type)
{
        page->lru.next = (struct list_head *) type;
        SetPagePrivate(page);
        set_page_private(page, info);
        atomic_inc(&page->_count);
}

/*
 * The reference to the __meminit function __free_pages_bootmem() is valid
 * here, so mark this function __ref to tell modpost not to generate a
 * section-mismatch warning.
 */
void __ref put_page_bootmem(struct page *page)
{
        unsigned long type;

        type = (unsigned long) page->lru.next;
        BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
               type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

        if (atomic_dec_return(&page->_count) == 1) {
                ClearPagePrivate(page);
                set_page_private(page, 0);
                INIT_LIST_HEAD(&page->lru);
                __free_pages_bootmem(page, 0);
        }
}

static void register_page_bootmem_info_section(unsigned long start_pfn)
{
        unsigned long *usemap, mapsize, section_nr, i;
        struct mem_section *ms;
        struct page *page, *memmap;

        if (!pfn_valid(start_pfn))
                return;

        section_nr = pfn_to_section_nr(start_pfn);
        ms = __nr_to_section(section_nr);

        /* Get section's memmap address */
        memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

        /*
         * Get page for the memmap's phys address
         * XXX: need more consideration for sparse_vmemmap...
         */
        page = virt_to_page(memmap);
        mapsize = sizeof(struct page) * PAGES_PER_SECTION;
        mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

        /* remember memmap's page */
        for (i = 0; i < mapsize; i++, page++)
                get_page_bootmem(section_nr, page, SECTION_INFO);

        usemap = __nr_to_section(section_nr)->pageblock_flags;
        page = virt_to_page(usemap);

        mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

        for (i = 0; i < mapsize; i++, page++)
                get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}

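/*
 * Record bootmem info for all pages backing this node's metadata: the
 * pglist_data itself, each zone's wait table, and each present section's
 * memmap and usemap, by taking a get_page_bootmem() reference on them.
 */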
void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
        unsigned long i, pfn, end_pfn, nr_pages;
        int node = pgdat->node_id;
        struct page *page;
        struct zone *zone;

        nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
        page = virt_to_page(pgdat);

        for (i = 0; i < nr_pages; i++, page++)
                get_page_bootmem(node, page, NODE_INFO);

        zone = &pgdat->node_zones[0];
        for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
                if (zone->wait_table) {
                        nr_pages = zone->wait_table_hash_nr_entries
                                * sizeof(wait_queue_head_t);
                        nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
                        page = virt_to_page(zone->wait_table);

                        for (i = 0; i < nr_pages; i++, page++)
                                get_page_bootmem(node, page, NODE_INFO);
                }
        }

        pfn = pgdat->node_start_pfn;
        end_pfn = pfn + pgdat->node_spanned_pages;

        /* register section info */
        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION)
                register_page_bootmem_info_section(pfn);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
                           unsigned long end_pfn)
{
        unsigned long old_zone_end_pfn;

        zone_span_writelock(zone);

        old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
        if (start_pfn < zone->zone_start_pfn)
                zone->zone_start_pfn = start_pfn;

        zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
                                zone->zone_start_pfn;

        zone_span_writeunlock(zone);
}

static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
                            unsigned long end_pfn)
{
        unsigned long old_pgdat_end_pfn =
                pgdat->node_start_pfn + pgdat->node_spanned_pages;

        if (start_pfn < pgdat->node_start_pfn)
                pgdat->node_start_pfn = start_pfn;

        pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
                                        pgdat->node_start_pfn;
}

static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
        struct pglist_data *pgdat = zone->zone_pgdat;
        int nr_pages = PAGES_PER_SECTION;
        int nid = pgdat->node_id;
        int zone_type;
        unsigned long flags;

        zone_type = zone - pgdat->node_zones;
        if (!zone->wait_table) {
                int ret;

                ret = init_currently_empty_zone(zone, phys_start_pfn,
                                                nr_pages, MEMMAP_HOTPLUG);
                if (ret)
                        return ret;
        }
        pgdat_resize_lock(zone->zone_pgdat, &flags);
        grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
        grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
                        phys_start_pfn + nr_pages);
        pgdat_resize_unlock(zone->zone_pgdat, &flags);
        memmap_init_zone(nr_pages, nid, zone_type,
                         phys_start_pfn, MEMMAP_HOTPLUG);
        return 0;
}

static int __meminit __add_section(int nid, struct zone *zone,
                                        unsigned long phys_start_pfn)
{
        int nr_pages = PAGES_PER_SECTION;
        int ret;

        if (pfn_valid(phys_start_pfn))
                return -EEXIST;

        ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

        if (ret < 0)
                return ret;

        ret = __add_zone(zone, phys_start_pfn);

        if (ret < 0)
                return ret;

        return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __remove_section(struct zone *zone, struct mem_section *ms)
{
        /*
         * XXX: Freeing the memmap with vmemmap is not implemented yet.
         *      This should be removed later.
         */
        return -EBUSY;
}
#else
static int __remove_section(struct zone *zone, struct mem_section *ms)
{
        unsigned long flags;
        struct pglist_data *pgdat = zone->zone_pgdat;
        int ret = -EINVAL;

        if (!valid_section(ms))
                return ret;

        ret = unregister_memory_section(ms);
        if (ret)
                return ret;

        pgdat_resize_lock(pgdat, &flags);
        sparse_remove_one_section(zone, ms);
        pgdat_resize_unlock(pgdat, &flags);
        return 0;
}
#endif

/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
                        unsigned long nr_pages)
{
        unsigned long i;
        int err = 0;
        int start_sec, end_sec;
        /* when initializing the mem_map, align the hot-added range to sections */
        start_sec = pfn_to_section_nr(phys_start_pfn);
        end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

        for (i = start_sec; i <= end_sec; i++) {
                err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);

                /*
                 * -EEXIST is finally dealt with by the ioresource collision
                 * check; see add_memory() => register_memory_resource().
                 * A warning is printed if there is a collision.
                 */
                if (err && (err != -EEXIST))
                        break;
                err = 0;
        }

        return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
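
/*
 * Illustrative only: a minimal sketch of how an architecture's
 * arch_add_memory() might pick a zone and call __add_pages().  The zone
 * choice and the arch-specific page-table setup vary per architecture and
 * are omitted here.
 *
 *      int arch_add_memory(int nid, u64 start, u64 size)
 *      {
 *              struct pglist_data *pgdat = NODE_DATA(nid);
 *              struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
 *              unsigned long start_pfn = start >> PAGE_SHIFT;
 *              unsigned long nr_pages = size >> PAGE_SHIFT;
 *
 *              (arch-specific: map the new range into the kernel page tables)
 *
 *              return __add_pages(nid, zone, start_pfn, nr_pages);
 *      }
 */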

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
                 unsigned long nr_pages)
{
        unsigned long i, ret = 0;
        int sections_to_remove;

        /*
         * We can only remove entire sections
         */
        BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
        BUG_ON(nr_pages % PAGES_PER_SECTION);

        sections_to_remove = nr_pages / PAGES_PER_SECTION;
        for (i = 0; i < sections_to_remove; i++) {
                unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
                release_mem_region(pfn << PAGE_SHIFT,
                                   PAGES_PER_SECTION << PAGE_SHIFT);
                ret = __remove_section(zone, __pfn_to_section(pfn));
                if (ret)
                        break;
        }
        return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);

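/*
 * Make a single, previously reserved hot-added page available to the buddy
 * allocator and update the global accounting (totalram_pages, num_physpages,
 * totalhigh_pages, max_mapnr).
 */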
void online_page(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);

        totalram_pages++;
        if (pfn >= num_physpages)
                num_physpages = pfn + 1;

#ifdef CONFIG_HIGHMEM
        if (PageHighMem(page))
                totalhigh_pages++;
#endif

#ifdef CONFIG_FLATMEM
        max_mapnr = max(page_to_pfn(page), max_mapnr);
#endif

        ClearPageReserved(page);
        init_page_count(page);
        __free_page(page);
}

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
                        void *arg)
{
        unsigned long i;
        unsigned long onlined_pages = *(unsigned long *)arg;
        struct page *page;
        if (PageReserved(pfn_to_page(start_pfn)))
                for (i = 0; i < nr_pages; i++) {
                        page = pfn_to_page(start_pfn + i);
                        online_page(page);
                        onlined_pages++;
                }
        *(unsigned long *)arg = onlined_pages;
        return 0;
}

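/*
 * Online the range [pfn, pfn + nr_pages): notify MEM_GOING_ONLINE, hand the
 * reserved pages to the buddy allocator via walk_system_ram_range(), update
 * the zone/node accounting, rebuild the zonelists if the zone was previously
 * empty, recalculate watermarks, start kswapd on the node if needed, and
 * finally send MEM_ONLINE.
 */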
int online_pages(unsigned long pfn, unsigned long nr_pages)
{
        unsigned long onlined_pages = 0;
        struct zone *zone;
        int need_zonelists_rebuild = 0;
        int nid;
        int ret;
        struct memory_notify arg;

        lock_memory_hotplug();
        arg.start_pfn = pfn;
        arg.nr_pages = nr_pages;
        arg.status_change_nid = -1;

        nid = page_to_nid(pfn_to_page(pfn));
        if (node_present_pages(nid) == 0)
                arg.status_change_nid = nid;

        ret = memory_notify(MEM_GOING_ONLINE, &arg);
        ret = notifier_to_errno(ret);
        if (ret) {
                memory_notify(MEM_CANCEL_ONLINE, &arg);
                unlock_memory_hotplug();
                return ret;
        }
        /*
         * This doesn't need a lock to do pfn_to_page().
         * The section can't be removed here because of the
         * memory_block->state_mutex.
         */
        zone = page_zone(pfn_to_page(pfn));
        /*
         * If this zone is not populated, it is not in the zonelist and the
         * page allocator ignores it, so the zonelist must be rebuilt after
         * onlining.
         */
        mutex_lock(&zonelists_mutex);
        if (!populated_zone(zone))
                need_zonelists_rebuild = 1;

        ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
                online_pages_range);
        if (ret) {
                mutex_unlock(&zonelists_mutex);
                printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
                        nr_pages, pfn);
                memory_notify(MEM_CANCEL_ONLINE, &arg);
                unlock_memory_hotplug();
                return ret;
        }

        zone->present_pages += onlined_pages;
        zone->zone_pgdat->node_present_pages += onlined_pages;
        if (need_zonelists_rebuild)
                build_all_zonelists(zone);
        else
                zone_pcp_update(zone);

        mutex_unlock(&zonelists_mutex);
        setup_per_zone_wmarks();
        calculate_zone_inactive_ratio(zone);
        if (onlined_pages) {
                kswapd_run(zone_to_nid(zone));
                node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
        }

        vm_total_pages = nr_free_pagecache_pages();

        writeback_set_ratelimit();

        if (onlined_pages)
                memory_notify(MEM_ONLINE, &arg);
        unlock_memory_hotplug();

        return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
        struct pglist_data *pgdat;
        unsigned long zones_size[MAX_NR_ZONES] = {0};
        unsigned long zholes_size[MAX_NR_ZONES] = {0};
        unsigned long start_pfn = start >> PAGE_SHIFT;

        pgdat = arch_alloc_nodedata(nid);
        if (!pgdat)
                return NULL;

        arch_refresh_nodedata(nid, pgdat);

        /* we can use NODE_DATA(nid) from here */

        /* init the node's zones as empty zones; we don't have any present pages */
        free_area_init_node(nid, zones_size, start_pfn, zholes_size);

        return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
        arch_refresh_nodedata(nid, NULL);
        arch_free_nodedata(pgdat);
        return;
}


/*
 * called by cpu_up() to online a node without onlined memory.
 */
int mem_online_node(int nid)
{
        pg_data_t       *pgdat;
        int     ret;

        lock_memory_hotplug();
        pgdat = hotadd_new_pgdat(nid, 0);
        if (!pgdat) {
                ret = -ENOMEM;
                goto out;
        }
        node_set_online(nid);
        ret = register_one_node(nid);
        BUG_ON(ret);

out:
        unlock_memory_hotplug();
        return ret;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
{
        pg_data_t *pgdat = NULL;
        int new_pgdat = 0;
        struct resource *res;
        int ret;

        lock_memory_hotplug();

        res = register_memory_resource(start, size);
        ret = -EEXIST;
        if (!res)
                goto out;

        if (!node_online(nid)) {
                pgdat = hotadd_new_pgdat(nid, start);
                ret = -ENOMEM;
                if (!pgdat)
                        goto out;
                new_pgdat = 1;
        }

        /* call the arch's memory hotadd */
        ret = arch_add_memory(nid, start, size);

        if (ret < 0)
                goto error;

        /* we online the node here; we can't roll back from here on */
        node_set_online(nid);

        if (new_pgdat) {
                ret = register_one_node(nid);
                /*
                 * If the sysfs file for the new node can't be created, CPUs
                 * on the node can't be hot-added. There is no way to roll
                 * back now, so catch it with BUG_ON(), reluctantly.
                 */
                BUG_ON(ret);
        }

        /* create new memmap entry */
        firmware_map_add_hotplug(start, start + size, "System RAM");

        goto out;

error:
        /* rollback pgdat allocation and others */
        if (new_pgdat)
                rollback_node_hotadd(nid, pgdat);
        if (res)
                release_memory_resource(res);

out:
        unlock_memory_hotplug();
        return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
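
/*
 * Note: add_memory() only registers the resource and creates the node data
 * and sections for the new range; the pages themselves remain reserved.
 * They become usable only after a subsequent online_pages() call, which is
 * normally triggered from user space through the memory block's sysfs
 * "state" file (typically by an ACPI memory-hotplug handler plus udev rules,
 * though the exact trigger depends on the platform).
 */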

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order(). Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
        return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
        /* Ensure the starting page is pageblock-aligned */
        BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

        /* If the entire pageblock is free, move to the end of the free page */
        if (pageblock_free(page)) {
                int order;
                /* Be careful: we don't hold any locks, page_order() can change. */
                order = page_order(page);
                if ((order < MAX_ORDER) && (order >= pageblock_order))
                        return page + (1 << order);
        }

        return page + pageblock_nr_pages;
}

/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
        struct page *page = pfn_to_page(start_pfn);
        struct page *end_page = page + nr_pages;

        /* Check the starting page of each pageblock within the range */
        for (; page < end_page; page = next_active_pageblock(page)) {
                if (!is_pageblock_removable_nolock(page))
                        return 0;
                cond_resched();
        }

        /* All pageblocks in the memory block are likely to be hot-removable */
        return 1;
}

/*
 * Confirm that all pages in the range [start_pfn, end_pfn) belong to the
 * same zone.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long pfn;
        struct zone *zone = NULL;
        struct page *page;
        int i;
        for (pfn = start_pfn;
             pfn < end_pfn;
             pfn += MAX_ORDER_NR_PAGES) {
                i = 0;
                /* This is just a CONFIG_HOLES_IN_ZONE check. */
                while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
                        i++;
                if (i == MAX_ORDER_NR_PAGES)
                        continue;
                page = pfn_to_page(pfn + i);
                if (zone && page_zone(page) != zone)
                        return 0;
                zone = page_zone(page);
        }
        return 1;
}

/*
 * Scanning pfns is much easier than scanning the LRU list.
 * Scan pfns from start to end and return the first pfn of an LRU page,
 * or 0 if none is found.
 */
static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
{
        unsigned long pfn;
        struct page *page;
        for (pfn = start; pfn < end; pfn++) {
                if (pfn_valid(pfn)) {
                        page = pfn_to_page(pfn);
                        if (PageLRU(page))
                                return pfn;
                }
        }
        return 0;
}

static struct page *
hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
{
        /* This should be improved!! */
        return alloc_page(GFP_HIGHUSER_MOVABLE);
}

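/*
 * Migrate LRU pages out of the range [start_pfn, end_pfn), handling at most
 * NR_OFFLINE_AT_ONCE_PAGES pages per call.  Free pages are skipped; pages
 * that are in use but not on the LRU make the range unmanageable and abort
 * the attempt with -EBUSY.
 */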
#define NR_OFFLINE_AT_ONCE_PAGES        (256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long pfn;
        struct page *page;
        int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
        int not_managed = 0;
        int ret = 0;
        LIST_HEAD(source);

        for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
                if (!pfn_valid(pfn))
                        continue;
                page = pfn_to_page(pfn);
                if (!page_count(page))
                        continue;
                /*
                 * We can skip free pages. And we can only deal with pages on
                 * the LRU.
                 */
                ret = isolate_lru_page(page);
                if (!ret) { /* Success */
                        list_add_tail(&page->lru, &source);
                        move_pages--;
                        inc_zone_page_state(page, NR_ISOLATED_ANON +
                                            page_is_file_cache(page));

                } else {
#ifdef CONFIG_DEBUG_VM
                        printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
                               pfn);
                        dump_page(page);
#endif
                        /* Because we don't hold the big zone->lock, we should
                           check the page count again here. */
                        if (page_count(page)) {
                                not_managed++;
                                ret = -EBUSY;
                                break;
                        }
                }
        }
        if (!list_empty(&source)) {
                if (not_managed) {
                        putback_lru_pages(&source);
                        goto out;
                }
                /* this function returns the number of failed pages */
                ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
                                                                true, true);
                if (ret)
                        putback_lru_pages(&source);
        }
out:
        return ret;
}

/*
 * remove from free_area[] and mark all as Reserved.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
                        void *data)
{
        __offline_isolated_pages(start, start + nr_pages);
        return 0;
}

static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
        walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
                                offline_isolated_pages_cb);
}

/*
 * Check that all pages in the range, recorded as a memory resource, are
 * isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
                        void *data)
{
        int ret;
        long offlined = *(long *)data;
        ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
        offlined = nr_pages;
        if (!ret)
                *(long *)data += offlined;
        return ret;
}

static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
        long offlined = 0;
        int ret;

        ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
                        check_pages_isolated_cb);
        if (ret < 0)
                offlined = (long)ret;
        return offlined;
}

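/*
 * Offline the pageblock-aligned range [start_pfn, end_pfn): isolate the
 * range, notify MEM_GOING_OFFLINE, migrate LRU pages away (retrying until
 * the timeout expires), verify that every page is isolated, take the pages
 * out of the free lists, fix up the zone/node accounting and send
 * MEM_OFFLINE.  On failure the isolation is undone and MEM_CANCEL_OFFLINE
 * is sent instead.
 */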
static int offline_pages(unsigned long start_pfn,
                  unsigned long end_pfn, unsigned long timeout)
{
        unsigned long pfn, nr_pages, expire;
        long offlined_pages;
        int ret, drain, retry_max, node;
        struct zone *zone;
        struct memory_notify arg;

        BUG_ON(start_pfn >= end_pfn);
        /* at least, alignment against pageblock is necessary */
        if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
                return -EINVAL;
        if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
                return -EINVAL;
        /* This makes hotplug much easier... and more readable.
           We assume this for now. */
        if (!test_pages_in_a_zone(start_pfn, end_pfn))
                return -EINVAL;

        lock_memory_hotplug();

        zone = page_zone(pfn_to_page(start_pfn));
        node = zone_to_nid(zone);
        nr_pages = end_pfn - start_pfn;

        /* set the above range as isolated */
        ret = start_isolate_page_range(start_pfn, end_pfn);
        if (ret)
                goto out;

        arg.start_pfn = start_pfn;
        arg.nr_pages = nr_pages;
        arg.status_change_nid = -1;
        if (nr_pages >= node_present_pages(node))
                arg.status_change_nid = node;

        ret = memory_notify(MEM_GOING_OFFLINE, &arg);
        ret = notifier_to_errno(ret);
        if (ret)
                goto failed_removal;

        pfn = start_pfn;
        expire = jiffies + timeout;
        drain = 0;
        retry_max = 5;
repeat:
        /* start memory hot removal */
        ret = -EAGAIN;
        if (time_after(jiffies, expire))
                goto failed_removal;
        ret = -EINTR;
        if (signal_pending(current))
                goto failed_removal;
        ret = 0;
        if (drain) {
                lru_add_drain_all();
                cond_resched();
                drain_all_pages();
        }

        pfn = scan_lru_pages(start_pfn, end_pfn);
        if (pfn) { /* We have pages on the LRU */
                ret = do_migrate_range(pfn, end_pfn);
                if (!ret) {
                        drain = 1;
                        goto repeat;
                } else {
                        if (ret < 0)
                                if (--retry_max == 0)
                                        goto failed_removal;
                        yield();
                        drain = 1;
                        goto repeat;
                }
        }
        /* drain all zones' LRU pagevecs; this is asynchronous... */
        lru_add_drain_all();
        yield();
        /* drain pcp pages; this is synchronous. */
        drain_all_pages();
        /* check again */
        offlined_pages = check_pages_isolated(start_pfn, end_pfn);
        if (offlined_pages < 0) {
                ret = -EBUSY;
                goto failed_removal;
        }
        printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
        /* OK, all of our target range is isolated.
           We cannot roll back from this point. */
        offline_isolated_pages(start_pfn, end_pfn);
        /* reset pagetype flags and make the migrate type MOVABLE */
        undo_isolate_page_range(start_pfn, end_pfn);
        /* removal success */
        zone->present_pages -= offlined_pages;
        zone->zone_pgdat->node_present_pages -= offlined_pages;
        totalram_pages -= offlined_pages;

        setup_per_zone_wmarks();
        calculate_zone_inactive_ratio(zone);
        if (!node_present_pages(node)) {
                node_clear_state(node, N_HIGH_MEMORY);
                kswapd_stop(node);
        }

        vm_total_pages = nr_free_pagecache_pages();
        writeback_set_ratelimit();

        memory_notify(MEM_OFFLINE, &arg);
        unlock_memory_hotplug();
        return 0;

failed_removal:
        printk(KERN_INFO "memory offlining %lx to %lx failed\n",
                start_pfn, end_pfn);
        memory_notify(MEM_CANCEL_OFFLINE, &arg);
        /* push the pages back to the free area */
        undo_isolate_page_range(start_pfn, end_pfn);

out:
        unlock_memory_hotplug();
        return ret;
}

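/*
 * Note that in this version remove_memory() only offlines the range (with a
 * 120 * HZ timeout for migrating pages away); it does not tear down the
 * sections, memmap or iomem resource that add_memory() created.
 */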
int remove_memory(u64 start, u64 size)
{
        unsigned long start_pfn, end_pfn;

        start_pfn = PFN_DOWN(start);
        end_pfn = start_pfn + PFN_DOWN(size);
        return offline_pages(start_pfn, end_pfn, 120 * HZ);
}
#else
int remove_memory(u64 start, u64 size)
{
        return -EINVAL;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
EXPORT_SYMBOL_GPL(remove_memory);