linux/drivers/xen/balloon.c
<<
>>
Prefs
   1/******************************************************************************
   2 * Xen balloon driver - enables returning/claiming memory to/from Xen.
   3 *
   4 * Copyright (c) 2003, B Dragovic
   5 * Copyright (c) 2003-2004, M Williamson, K Fraser
   6 * Copyright (c) 2005 Dan M. Smith, IBM Corporation
   7 * Copyright (c) 2010 Daniel Kiper
   8 *
   9 * Memory hotplug support was written by Daniel Kiper. Work on
  10 * it was sponsored by Google under Google Summer of Code 2010
  11 * program. Jeremy Fitzhardinge from Citrix was the mentor for
  12 * this project.
  13 *
  14 * This program is free software; you can redistribute it and/or
  15 * modify it under the terms of the GNU General Public License version 2
  16 * as published by the Free Software Foundation; or, when distributed
  17 * separately from the Linux kernel or incorporated into other
  18 * software packages, subject to the following license:
  19 *
  20 * Permission is hereby granted, free of charge, to any person obtaining a copy
  21 * of this source file (the "Software"), to deal in the Software without
  22 * restriction, including without limitation the rights to use, copy, modify,
  23 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
  24 * and to permit persons to whom the Software is furnished to do so, subject to
  25 * the following conditions:
  26 *
  27 * The above copyright notice and this permission notice shall be included in
  28 * all copies or substantial portions of the Software.
  29 *
  30 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  31 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  32 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  33 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  34 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  35 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  36 * IN THE SOFTWARE.
  37 */
  38
  39#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
  40
  41#include <linux/kernel.h>
  42#include <linux/sched.h>
  43#include <linux/errno.h>
  44#include <linux/module.h>
  45#include <linux/mm.h>
  46#include <linux/bootmem.h>
  47#include <linux/pagemap.h>
  48#include <linux/highmem.h>
  49#include <linux/mutex.h>
  50#include <linux/list.h>
  51#include <linux/gfp.h>
  52#include <linux/notifier.h>
  53#include <linux/memory.h>
  54#include <linux/memory_hotplug.h>
  55
  56#include <asm/page.h>
  57#include <asm/pgalloc.h>
  58#include <asm/pgtable.h>
  59#include <asm/tlb.h>
  60
  61#include <asm/xen/hypervisor.h>
  62#include <asm/xen/hypercall.h>
  63
  64#include <xen/xen.h>
  65#include <xen/interface/xen.h>
  66#include <xen/interface/memory.h>
  67#include <xen/balloon.h>
  68#include <xen/features.h>
  69#include <xen/page.h>
  70
  71/*
  72 * balloon_process() state:
  73 *
  74 * BP_DONE: done or nothing to do,
  75 * BP_EAGAIN: error, go to sleep,
  76 * BP_ECANCELED: error, balloon operation canceled.
  77 */
  78
  79enum bp_state {
  80        BP_DONE,
  81        BP_EAGAIN,
  82        BP_ECANCELED
  83};
  84
  85
  86static DEFINE_MUTEX(balloon_mutex);
  87
  88struct balloon_stats balloon_stats;
  89EXPORT_SYMBOL_GPL(balloon_stats);
  90
  91/* We increase/decrease in batches which fit in a page */
  92static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)];
  93
  94/* List of ballooned pages, threaded through the mem_map array. */
  95static LIST_HEAD(ballooned_pages);
  96
  97/* Main work function, always executed in process context. */
  98static void balloon_process(struct work_struct *work);
  99static DECLARE_DELAYED_WORK(balloon_worker, balloon_process);
 100
 101/* When ballooning out (allocating memory to return to Xen) we don't really
 102   want the kernel to try too hard since that can trigger the oom killer. */
 103#define GFP_BALLOON \
 104        (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC)
 105
 106static void scrub_page(struct page *page)
 107{
 108#ifdef CONFIG_XEN_SCRUB_PAGES
 109        clear_highpage(page);
 110#endif
 111}
 112
 113/* balloon_append: add the given page to the balloon. */
 114static void __balloon_append(struct page *page)
 115{
 116        /* Lowmem is re-populated first, so highmem pages go at list tail. */
 117        if (PageHighMem(page)) {
 118                list_add_tail(&page->lru, &ballooned_pages);
 119                balloon_stats.balloon_high++;
 120        } else {
 121                list_add(&page->lru, &ballooned_pages);
 122                balloon_stats.balloon_low++;
 123        }
 124}
 125
 126static void balloon_append(struct page *page)
 127{
 128        __balloon_append(page);
 129        adjust_managed_page_count(page, -1);
 130}
 131
 132/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
 133static struct page *balloon_retrieve(bool prefer_highmem)
 134{
 135        struct page *page;
 136
 137        if (list_empty(&ballooned_pages))
 138                return NULL;
 139
 140        if (prefer_highmem)
 141                page = list_entry(ballooned_pages.prev, struct page, lru);
 142        else
 143                page = list_entry(ballooned_pages.next, struct page, lru);
 144        list_del(&page->lru);
 145
 146        if (PageHighMem(page))
 147                balloon_stats.balloon_high--;
 148        else
 149                balloon_stats.balloon_low--;
 150
 151        adjust_managed_page_count(page, 1);
 152
 153        return page;
 154}
 155
 156static struct page *balloon_first_page(void)
 157{
 158        if (list_empty(&ballooned_pages))
 159                return NULL;
 160        return list_entry(ballooned_pages.next, struct page, lru);
 161}
 162
 163static struct page *balloon_next_page(struct page *page)
 164{
 165        struct list_head *next = page->lru.next;
 166        if (next == &ballooned_pages)
 167                return NULL;
 168        return list_entry(next, struct page, lru);
 169}
 170
 171static enum bp_state update_schedule(enum bp_state state)
 172{
 173        if (state == BP_DONE) {
 174                balloon_stats.schedule_delay = 1;
 175                balloon_stats.retry_count = 1;
 176                return BP_DONE;
 177        }
 178
 179        ++balloon_stats.retry_count;
 180
 181        if (balloon_stats.max_retry_count != RETRY_UNLIMITED &&
 182                        balloon_stats.retry_count > balloon_stats.max_retry_count) {
 183                balloon_stats.schedule_delay = 1;
 184                balloon_stats.retry_count = 1;
 185                return BP_ECANCELED;
 186        }
 187
 188        balloon_stats.schedule_delay <<= 1;
 189
 190        if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay)
 191                balloon_stats.schedule_delay = balloon_stats.max_schedule_delay;
 192
 193        return BP_EAGAIN;
 194}
 195
 196#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
 197static long current_credit(void)
 198{
 199        return balloon_stats.target_pages - balloon_stats.current_pages -
 200                balloon_stats.hotplug_pages;
 201}
 202
 203static bool balloon_is_inflated(void)
 204{
 205        if (balloon_stats.balloon_low || balloon_stats.balloon_high ||
 206                        balloon_stats.balloon_hotplug)
 207                return true;
 208        else
 209                return false;
 210}
 211
 212/*
 213 * reserve_additional_memory() adds memory region of size >= credit above
 214 * max_pfn. New region is section aligned and size is modified to be multiple
 215 * of section size. Those features allow optimal use of address space and
 216 * establish proper alignment when this function is called first time after
 217 * boot (last section not fully populated at boot time contains unused memory
 218 * pages with PG_reserved bit not set; online_pages_range() does not allow page
 219 * onlining in whole range if first onlined page does not have PG_reserved
 220 * bit set). Real size of added memory is established at page onlining stage.
 221 */
 222
 223static enum bp_state reserve_additional_memory(long credit)
 224{
 225        int nid, rc;
 226        u64 hotplug_start_paddr;
 227        unsigned long balloon_hotplug = credit;
 228
 229        hotplug_start_paddr = PFN_PHYS(SECTION_ALIGN_UP(max_pfn));
 230        balloon_hotplug = round_up(balloon_hotplug, PAGES_PER_SECTION);
 231        nid = memory_add_physaddr_to_nid(hotplug_start_paddr);
 232
 233        rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT);
 234
 235        if (rc) {
 236                pr_info("%s: add_memory() failed: %i\n", __func__, rc);
 237                return BP_EAGAIN;
 238        }
 239
 240        balloon_hotplug -= credit;
 241
 242        balloon_stats.hotplug_pages += credit;
 243        balloon_stats.balloon_hotplug = balloon_hotplug;
 244
 245        return BP_DONE;
 246}
 247
 248static void xen_online_page(struct page *page)
 249{
 250        __online_page_set_limits(page);
 251
 252        mutex_lock(&balloon_mutex);
 253
 254        __balloon_append(page);
 255
 256        if (balloon_stats.hotplug_pages)
 257                --balloon_stats.hotplug_pages;
 258        else
 259                --balloon_stats.balloon_hotplug;
 260
 261        mutex_unlock(&balloon_mutex);
 262}
 263
 264static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v)
 265{
 266        if (val == MEM_ONLINE)
 267                schedule_delayed_work(&balloon_worker, 0);
 268
 269        return NOTIFY_OK;
 270}
 271
 272static struct notifier_block xen_memory_nb = {
 273        .notifier_call = xen_memory_notifier,
 274        .priority = 0
 275};
 276#else
 277static long current_credit(void)
 278{
 279        unsigned long target = balloon_stats.target_pages;
 280
 281        target = min(target,
 282                     balloon_stats.current_pages +
 283                     balloon_stats.balloon_low +
 284                     balloon_stats.balloon_high);
 285
 286        return target - balloon_stats.current_pages;
 287}
 288
 289static bool balloon_is_inflated(void)
 290{
 291        if (balloon_stats.balloon_low || balloon_stats.balloon_high)
 292                return true;
 293        else
 294                return false;
 295}
 296
 297static enum bp_state reserve_additional_memory(long credit)
 298{
 299        balloon_stats.target_pages = balloon_stats.current_pages;
 300        return BP_DONE;
 301}
 302#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */
 303
 304static enum bp_state increase_reservation(unsigned long nr_pages)
 305{
 306        int rc;
 307        unsigned long  pfn, i;
 308        struct page   *page;
 309        struct xen_memory_reservation reservation = {
 310                .address_bits = 0,
 311                .extent_order = 0,
 312                .domid        = DOMID_SELF
 313        };
 314
 315#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
 316        if (!balloon_stats.balloon_low && !balloon_stats.balloon_high) {
 317                nr_pages = min(nr_pages, balloon_stats.balloon_hotplug);
 318                balloon_stats.hotplug_pages += nr_pages;
 319                balloon_stats.balloon_hotplug -= nr_pages;
 320                return BP_DONE;
 321        }
 322#endif
 323
 324        if (nr_pages > ARRAY_SIZE(frame_list))
 325                nr_pages = ARRAY_SIZE(frame_list);
 326
 327        page = balloon_first_page();
 328        for (i = 0; i < nr_pages; i++) {
 329                if (!page) {
 330                        nr_pages = i;
 331                        break;
 332                }
 333                frame_list[i] = page_to_pfn(page);
 334                page = balloon_next_page(page);
 335        }
 336
 337        set_xen_guest_handle(reservation.extent_start, frame_list);
 338        reservation.nr_extents = nr_pages;
 339        rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
 340        if (rc <= 0)
 341                return BP_EAGAIN;
 342
 343        for (i = 0; i < rc; i++) {
 344                page = balloon_retrieve(false);
 345                BUG_ON(page == NULL);
 346
 347                pfn = page_to_pfn(page);
 348                BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
 349                       phys_to_machine_mapping_valid(pfn));
 350
 351                set_phys_to_machine(pfn, frame_list[i]);
 352
 353#ifdef CONFIG_XEN_HAVE_PVMMU
 354                /* Link back into the page tables if not highmem. */
 355                if (xen_pv_domain() && !PageHighMem(page)) {
 356                        int ret;
 357                        ret = HYPERVISOR_update_va_mapping(
 358                                (unsigned long)__va(pfn << PAGE_SHIFT),
 359                                mfn_pte(frame_list[i], PAGE_KERNEL),
 360                                0);
 361                        BUG_ON(ret);
 362                }
 363#endif
 364
 365                /* Relinquish the page back to the allocator. */
 366                __free_reserved_page(page);
 367        }
 368
 369        balloon_stats.current_pages += rc;
 370
 371        return BP_DONE;
 372}
 373
 374static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
 375{
 376        enum bp_state state = BP_DONE;
 377        unsigned long  pfn, i;
 378        struct page   *page;
 379        int ret;
 380        struct xen_memory_reservation reservation = {
 381                .address_bits = 0,
 382                .extent_order = 0,
 383                .domid        = DOMID_SELF
 384        };
 385
 386#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
 387        if (balloon_stats.hotplug_pages) {
 388                nr_pages = min(nr_pages, balloon_stats.hotplug_pages);
 389                balloon_stats.hotplug_pages -= nr_pages;
 390                balloon_stats.balloon_hotplug += nr_pages;
 391                return BP_DONE;
 392        }
 393#endif
 394
 395        if (nr_pages > ARRAY_SIZE(frame_list))
 396                nr_pages = ARRAY_SIZE(frame_list);
 397
 398        for (i = 0; i < nr_pages; i++) {
 399                page = alloc_page(gfp);
 400                if (page == NULL) {
 401                        nr_pages = i;
 402                        state = BP_EAGAIN;
 403                        break;
 404                }
 405
 406                pfn = page_to_pfn(page);
 407                frame_list[i] = pfn_to_mfn(pfn);
 408
 409                scrub_page(page);
 410
 411#ifdef CONFIG_XEN_HAVE_PVMMU
 412                if (xen_pv_domain() && !PageHighMem(page)) {
 413                        ret = HYPERVISOR_update_va_mapping(
 414                                (unsigned long)__va(pfn << PAGE_SHIFT),
 415                                __pte_ma(0), 0);
 416                        BUG_ON(ret);
 417                }
 418#endif
 419        }
 420
 421        /* Ensure that ballooned highmem pages don't have kmaps. */
 422        kmap_flush_unused();
 423        flush_tlb_all();
 424
 425        /* No more mappings: invalidate P2M and add to balloon. */
 426        for (i = 0; i < nr_pages; i++) {
 427                pfn = mfn_to_pfn(frame_list[i]);
 428                __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 429                balloon_append(pfn_to_page(pfn));
 430        }
 431
 432        set_xen_guest_handle(reservation.extent_start, frame_list);
 433        reservation.nr_extents   = nr_pages;
 434        ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
 435        BUG_ON(ret != nr_pages);
 436
 437        balloon_stats.current_pages -= nr_pages;
 438
 439        return state;
 440}
 441
 442/*
 443 * We avoid multiple worker processes conflicting via the balloon mutex.
 444 * We may of course race updates of the target counts (which are protected
 445 * by the balloon lock), or with changes to the Xen hard limit, but we will
 446 * recover from these in time.
 447 */
 448static void balloon_process(struct work_struct *work)
 449{
 450        enum bp_state state = BP_DONE;
 451        long credit;
 452
 453        mutex_lock(&balloon_mutex);
 454
 455        do {
 456                credit = current_credit();
 457
 458                if (credit > 0) {
 459                        if (balloon_is_inflated())
 460                                state = increase_reservation(credit);
 461                        else
 462                                state = reserve_additional_memory(credit);
 463                }
 464
 465                if (credit < 0)
 466                        state = decrease_reservation(-credit, GFP_BALLOON);
 467
 468                state = update_schedule(state);
 469
 470#ifndef CONFIG_PREEMPT
 471                if (need_resched())
 472                        schedule();
 473#endif
 474        } while (credit && state == BP_DONE);
 475
 476        /* Schedule more work if there is some still to be done. */
 477        if (state == BP_EAGAIN)
 478                schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ);
 479
 480        mutex_unlock(&balloon_mutex);
 481}
 482
 483/* Resets the Xen limit, sets new target, and kicks off processing. */
 484void balloon_set_new_target(unsigned long target)
 485{
 486        /* No need for lock. Not read-modify-write updates. */
 487        balloon_stats.target_pages = target;
 488        schedule_delayed_work(&balloon_worker, 0);
 489}
 490EXPORT_SYMBOL_GPL(balloon_set_new_target);
 491
 492/**
 493 * alloc_xenballooned_pages - get pages that have been ballooned out
 494 * @nr_pages: Number of pages to get
 495 * @pages: pages returned
 496 * @highmem: allow highmem pages
 497 * @return 0 on success, error otherwise
 498 */
 499int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem)
 500{
 501        int pgno = 0;
 502        struct page *page;
 503        mutex_lock(&balloon_mutex);
 504        while (pgno < nr_pages) {
 505                page = balloon_retrieve(highmem);
 506                if (page && (highmem || !PageHighMem(page))) {
 507                        pages[pgno++] = page;
 508                } else {
 509                        enum bp_state st;
 510                        if (page)
 511                                balloon_append(page);
 512                        st = decrease_reservation(nr_pages - pgno,
 513                                        highmem ? GFP_HIGHUSER : GFP_USER);
 514                        if (st != BP_DONE)
 515                                goto out_undo;
 516                }
 517        }
 518        mutex_unlock(&balloon_mutex);
 519        return 0;
 520 out_undo:
 521        while (pgno)
 522                balloon_append(pages[--pgno]);
 523        /* Free the memory back to the kernel soon */
 524        schedule_delayed_work(&balloon_worker, 0);
 525        mutex_unlock(&balloon_mutex);
 526        return -ENOMEM;
 527}
 528EXPORT_SYMBOL(alloc_xenballooned_pages);
 529
 530/**
 531 * free_xenballooned_pages - return pages retrieved with get_ballooned_pages
 532 * @nr_pages: Number of pages
 533 * @pages: pages to return
 534 */
 535void free_xenballooned_pages(int nr_pages, struct page **pages)
 536{
 537        int i;
 538
 539        mutex_lock(&balloon_mutex);
 540
 541        for (i = 0; i < nr_pages; i++) {
 542                if (pages[i])
 543                        balloon_append(pages[i]);
 544        }
 545
 546        /* The balloon may be too large now. Shrink it if needed. */
 547        if (current_credit())
 548                schedule_delayed_work(&balloon_worker, 0);
 549
 550        mutex_unlock(&balloon_mutex);
 551}
 552EXPORT_SYMBOL(free_xenballooned_pages);
 553
 554static void __init balloon_add_region(unsigned long start_pfn,
 555                                      unsigned long pages)
 556{
 557        unsigned long pfn, extra_pfn_end;
 558        struct page *page;
 559
 560        /*
 561         * If the amount of usable memory has been limited (e.g., with
 562         * the 'mem' command line parameter), don't add pages beyond
 563         * this limit.
 564         */
 565        extra_pfn_end = min(max_pfn, start_pfn + pages);
 566
 567        for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) {
 568                page = pfn_to_page(pfn);
 569                /* totalram_pages and totalhigh_pages do not
 570                   include the boot-time balloon extension, so
 571                   don't subtract from it. */
 572                __balloon_append(page);
 573        }
 574}
 575
 576static int __init balloon_init(void)
 577{
 578        int i;
 579
 580        if (!xen_domain())
 581                return -ENODEV;
 582
 583        pr_info("Initialising balloon driver\n");
 584
 585        balloon_stats.current_pages = xen_pv_domain()
 586                ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn)
 587                : get_num_physpages();
 588        balloon_stats.target_pages  = balloon_stats.current_pages;
 589        balloon_stats.balloon_low   = 0;
 590        balloon_stats.balloon_high  = 0;
 591
 592        balloon_stats.schedule_delay = 1;
 593        balloon_stats.max_schedule_delay = 32;
 594        balloon_stats.retry_count = 1;
 595        balloon_stats.max_retry_count = RETRY_UNLIMITED;
 596
 597#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
 598        balloon_stats.hotplug_pages = 0;
 599        balloon_stats.balloon_hotplug = 0;
 600
 601        set_online_page_callback(&xen_online_page);
 602        register_memory_notifier(&xen_memory_nb);
 603#endif
 604
 605        /*
 606         * Initialize the balloon with pages from the extra memory
 607         * regions (see arch/x86/xen/setup.c).
 608         */
 609        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++)
 610                if (xen_extra_mem[i].size)
 611                        balloon_add_region(PFN_UP(xen_extra_mem[i].start),
 612                                           PFN_DOWN(xen_extra_mem[i].size));
 613
 614        /* Init the xen-balloon driver. */
 615        xen_balloon_init();
 616
 617        return 0;
 618}
 619
 620subsys_initcall(balloon_init);
 621
 622MODULE_LICENSE("GPL");
 623