linux/drivers/xen/xen-selfballoon.c
<<
>>
Prefs
   1/******************************************************************************
   2 * Xen selfballoon driver (and optional frontswap self-shrinking driver)
   3 *
   4 * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
   5 *
   6 * This code complements the cleancache and frontswap patchsets to optimize
   7 * support for Xen Transcendent Memory ("tmem").  The policy it implements
   8 * is rudimentary and will likely improve over time, but it does work well
   9 * enough today.
  10 *
  11 * Two functionalities are implemented here which both use "control theory"
  12 * (feedback) to optimize memory utilization. In a virtualized environment
  13 * such as Xen, RAM is often a scarce resource and we would like to ensure
  14 * that each of a possibly large number of virtual machines is using RAM
  15 * efficiently, i.e. using as little as possible when under light load
  16 * and obtaining as much as possible when memory demands are high.
  17 * Since RAM needs vary highly dynamically and sometimes dramatically,
  18 * "hysteresis" is used, that is, memory target is determined not just
  19 * on current data but also on past data stored in the system.
  20 *
  21 * "Selfballooning" creates memory pressure by managing the Xen balloon
  22 * driver to decrease and increase available kernel memory, driven
  23 * largely by the target value of "Committed_AS" (see /proc/meminfo).
  24 * Since Committed_AS does not account for clean mapped pages (i.e. pages
  25 * in RAM that are identical to pages on disk), selfballooning has the
   26 * effect of pushing less frequently used clean pagecache pages out of
  27 * kernel RAM and, presumably using cleancache, into Xen tmem where
  28 * Xen can more efficiently optimize RAM utilization for such pages.
  29 *
  30 * When kernel memory demand unexpectedly increases faster than Xen, via
  31 * the selfballoon driver, is able to (or chooses to) provide usable RAM,
  32 * the kernel may invoke swapping.  In most cases, frontswap is able
  33 * to absorb this swapping into Xen tmem.  However, due to the fact
  34 * that the kernel swap subsystem assumes swapping occurs to a disk,
  35 * swapped pages may sit on the disk for a very long time; even if
  36 * the kernel knows the page will never be used again.  This is because
  37 * the disk space costs very little and can be overwritten when
  38 * necessary.  When such stale pages are in frontswap, however, they
  39 * are taking up valuable real estate.  "Frontswap selfshrinking" works
  40 * to resolve this:  When frontswap activity is otherwise stable
  41 * and the guest kernel is not under memory pressure, the "frontswap
  42 * selfshrinking" accounts for this by providing pressure to remove some
  43 * pages from frontswap and return them to kernel memory.
  44 *
  45 * For both "selfballooning" and "frontswap-selfshrinking", a worker
  46 * thread is used and sysfs tunables are provided to adjust the frequency
  47 * and rate of adjustments to achieve the goal, as well as to disable one
  48 * or both functions independently.
  49 *
  50 * While some argue that this functionality can and should be implemented
  51 * in userspace, it has been observed that bad things happen (e.g. OOMs).
  52 *
  53 * System configuration note: Selfballooning should not be enabled on
  54 * systems without a sufficiently large swap device configured; for best
  55 * results, it is recommended that total swap be increased by the size
  56 * of the guest memory.  Also, while technically not required to be
  57 * configured, it is highly recommended that frontswap also be configured
  58 * and enabled when selfballooning is running.  So, selfballooning
  59 * is disabled by default if frontswap is not configured and can only
  60 * be enabled with the "selfballooning" kernel boot option; similarly
  61 * selfballooning is enabled by default if frontswap is configured and
  62 * can be disabled with the "noselfballooning" kernel boot option.  Finally,
  63 * when frontswap is configured, frontswap-selfshrinking can be disabled
  64 * with the "noselfshrink" kernel boot option.
  65 *
  66 * Selfballooning is disallowed in domain0 and force-disabled.
  67 *
  68 */
  69
  70#include <linux/kernel.h>
  71#include <linux/bootmem.h>
  72#include <linux/swap.h>
  73#include <linux/mm.h>
  74#include <linux/mman.h>
  75#include <linux/module.h>
  76#include <linux/workqueue.h>
  77#include <linux/device.h>
  78#include <xen/balloon.h>
  79#include <xen/tmem.h>
  80#include <xen/xen.h>
  81
   82/* Enable/disable with sysfs. */
   83static int xen_selfballooning_enabled __read_mostly;
   84
   85/*
   86 * Controls rate at which memory target (this iteration) approaches
   87 * ultimate goal when memory need is increasing (up-hysteresis) or
   88 * decreasing (down-hysteresis). Higher values of hysteresis cause
   89 * slower increases/decreases. The default values for the various
   90 * parameters were deemed reasonable by experimentation, may be
   91 * workload-dependent, and can all be adjusted via sysfs.
   92 */
   93static unsigned int selfballoon_downhysteresis __read_mostly = 8;
   94static unsigned int selfballoon_uphysteresis __read_mostly = 1;
   95
   96/* Interval between worker invocations, in seconds (it is multiplied by HZ when the work is queued). */
   97static unsigned int selfballoon_interval __read_mostly = 5;
   98
   99/*
  100 * Minimum usable RAM in MB for selfballooning target for balloon.
  101 * If non-zero, it is added to totalreserve_pages and self-ballooning
  102 * will not balloon below the sum.  If zero, a piecewise linear function
  103 * is calculated as a minimum and added to totalreserve_pages.  Note that
  104 * setting this value indiscriminately may cause OOMs and crashes.
  105 */
  106static unsigned int selfballoon_min_usable_mb;
  107
  108static void selfballoon_process(struct work_struct *work);
  109static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process);
 110
 111#ifdef CONFIG_FRONTSWAP
 112#include <linux/frontswap.h>
 113
  114/* Enable/disable with sysfs. */
  115static bool frontswap_selfshrinking __read_mostly;
  116
  117/* Cleared by the "noselfshrink" kernel boot option; only consulted during init. */
  118static bool use_frontswap_selfshrink __initdata = true;
  119
  120/*
  121 * The default values for the following parameters were deemed reasonable
  122 * by experimentation, may be workload-dependent, and can all be
  123 * adjusted via sysfs.
  124 */
  125
  126/* Each shrink pass drops about 1/frontswap_hysteresis of the frontswap pages, so higher values shrink more slowly. */
  127static unsigned int frontswap_hysteresis __read_mostly = 20;
  128
  129/*
  130 * Number of selfballoon worker invocations to wait before observing that
  131 * frontswap selfshrinking should commence. Note that selfshrinking does
  132 * not use a separate worker thread.
  133 */
  134static unsigned int frontswap_inertia __read_mostly = 3;
  135
  136/* Worker passes left before frontswap_shrink() is called; reloaded from frontswap_inertia while frontswap is empty or growing. */
  137static unsigned long frontswap_inertia_counter;
 138
 139/*
 140 * Invoked by the selfballoon worker thread, uses current number of pages
 141 * in frontswap (frontswap_curr_pages()), previous status, and control
 142 * values (hysteresis and inertia) to determine if frontswap should be
 143 * shrunk and what the new frontswap size should be.  Note that
 144 * frontswap_shrink is essentially a partial swapoff that immediately
 145 * transfers pages from the "swap device" (frontswap) back into kernel
 146 * RAM; despite the name, frontswap "shrinking" is very different from
 147 * the "shrinker" interface used by the kernel MM subsystem to reclaim
 148 * memory.
 149 */
 150static void frontswap_selfshrink(void)
 151{
 152        static unsigned long cur_frontswap_pages;
 153        static unsigned long last_frontswap_pages;
 154        static unsigned long tgt_frontswap_pages;
 155
 156        last_frontswap_pages = cur_frontswap_pages;
 157        cur_frontswap_pages = frontswap_curr_pages();
 158        if (!cur_frontswap_pages ||
 159                        (cur_frontswap_pages > last_frontswap_pages)) {
 160                frontswap_inertia_counter = frontswap_inertia;
 161                return;
 162        }
 163        if (frontswap_inertia_counter && --frontswap_inertia_counter)
 164                return;
 165        if (cur_frontswap_pages <= frontswap_hysteresis)
 166                tgt_frontswap_pages = 0;
 167        else
 168                tgt_frontswap_pages = cur_frontswap_pages -
 169                        (cur_frontswap_pages / frontswap_hysteresis);
 170        frontswap_shrink(tgt_frontswap_pages);
 171}
 172
 173static int __init xen_nofrontswap_selfshrink_setup(char *s)
 174{
 175        use_frontswap_selfshrink = false;
 176        return 1;
 177}
 178
 179__setup("noselfshrink", xen_nofrontswap_selfshrink_setup);
 180
 181/* Disable with kernel boot option. */
 182static bool use_selfballooning __initdata = true;
 183
 184static int __init xen_noselfballooning_setup(char *s)
 185{
 186        use_selfballooning = false;
 187        return 1;
 188}
 189
 190__setup("noselfballooning", xen_noselfballooning_setup);
 191#else /* !CONFIG_FRONTSWAP */
 192/* Enable with kernel boot option. */
 193static bool use_selfballooning __initdata = false;
 194
 195static int __init xen_selfballooning_setup(char *s)
 196{
 197        use_selfballooning = true;
 198        return 1;
 199}
 200
 201__setup("selfballooning", xen_selfballooning_setup);
 202#endif /* CONFIG_FRONTSWAP */
 203
 204#define MB2PAGES(mb)    ((mb) << (20 - PAGE_SHIFT))
 205
 206/*
 207 * Use current balloon size, the goal (vm_committed_as), and hysteresis
 208 * parameters to set a new target balloon size
 209 */
 210static void selfballoon_process(struct work_struct *work)
 211{
 212        unsigned long cur_pages, goal_pages, tgt_pages, floor_pages;
 213        unsigned long useful_pages;
 214        bool reset_timer = false;
 215
 216        if (xen_selfballooning_enabled) {
 217                cur_pages = totalram_pages;
 218                tgt_pages = cur_pages; /* default is no change */
 219                goal_pages = percpu_counter_read_positive(&vm_committed_as) +
 220                                totalreserve_pages;
 221#ifdef CONFIG_FRONTSWAP
 222                /* allow space for frontswap pages to be repatriated */
 223                if (frontswap_selfshrinking && frontswap_enabled)
 224                        goal_pages += frontswap_curr_pages();
 225#endif
 226                if (cur_pages > goal_pages)
 227                        tgt_pages = cur_pages -
 228                                ((cur_pages - goal_pages) /
 229                                  selfballoon_downhysteresis);
 230                else if (cur_pages < goal_pages)
 231                        tgt_pages = cur_pages +
 232                                ((goal_pages - cur_pages) /
 233                                  selfballoon_uphysteresis);
 234                /* else if cur_pages == goal_pages, no change */
 235                useful_pages = max_pfn - totalreserve_pages;
 236                if (selfballoon_min_usable_mb != 0)
 237                        floor_pages = totalreserve_pages +
 238                                        MB2PAGES(selfballoon_min_usable_mb);
 239                /* piecewise linear function ending in ~3% slope */
 240                else if (useful_pages < MB2PAGES(16))
 241                        floor_pages = max_pfn; /* not worth ballooning */
 242                else if (useful_pages < MB2PAGES(64))
 243                        floor_pages = totalreserve_pages + MB2PAGES(16) +
 244                                        ((useful_pages - MB2PAGES(16)) >> 1);
 245                else if (useful_pages < MB2PAGES(512))
 246                        floor_pages = totalreserve_pages + MB2PAGES(40) +
 247                                        ((useful_pages - MB2PAGES(40)) >> 3);
 248                else /* useful_pages >= MB2PAGES(512) */
 249                        floor_pages = totalreserve_pages + MB2PAGES(99) +
 250                                        ((useful_pages - MB2PAGES(99)) >> 5);
 251                if (tgt_pages < floor_pages)
 252                        tgt_pages = floor_pages;
 253                balloon_set_new_target(tgt_pages +
 254                        balloon_stats.current_pages - totalram_pages);
 255                reset_timer = true;
 256        }
 257#ifdef CONFIG_FRONTSWAP
 258        if (frontswap_selfshrinking && frontswap_enabled) {
 259                frontswap_selfshrink();
 260                reset_timer = true;
 261        }
 262#endif
 263        if (reset_timer)
 264                schedule_delayed_work(&selfballoon_worker,
 265                        selfballoon_interval * HZ);
 266}
 267
 268#ifdef CONFIG_SYSFS
 269
 270#include <linux/capability.h>
 271
 272#define SELFBALLOON_SHOW(name, format, args...)                         \
 273        static ssize_t show_##name(struct device *dev,  \
 274                                          struct device_attribute *attr, \
 275                                          char *buf) \
 276        { \
 277                return sprintf(buf, format, ##args); \
 278        }
 279
 280SELFBALLOON_SHOW(selfballooning, "%d\n", xen_selfballooning_enabled);
 281
 282static ssize_t store_selfballooning(struct device *dev,
 283                            struct device_attribute *attr,
 284                            const char *buf,
 285                            size_t count)
 286{
 287        bool was_enabled = xen_selfballooning_enabled;
 288        unsigned long tmp;
 289        int err;
 290
 291        if (!capable(CAP_SYS_ADMIN))
 292                return -EPERM;
 293
 294        err = strict_strtoul(buf, 10, &tmp);
 295        if (err || ((tmp != 0) && (tmp != 1)))
 296                return -EINVAL;
 297
 298        xen_selfballooning_enabled = !!tmp;
 299        if (!was_enabled && xen_selfballooning_enabled)
 300                schedule_delayed_work(&selfballoon_worker,
 301                        selfballoon_interval * HZ);
 302
 303        return count;
 304}
 305
 306static DEVICE_ATTR(selfballooning, S_IRUGO | S_IWUSR,
 307                   show_selfballooning, store_selfballooning);
 308
 309SELFBALLOON_SHOW(selfballoon_interval, "%d\n", selfballoon_interval);
 310
 311static ssize_t store_selfballoon_interval(struct device *dev,
 312                                          struct device_attribute *attr,
 313                                          const char *buf,
 314                                          size_t count)
 315{
 316        unsigned long val;
 317        int err;
 318
 319        if (!capable(CAP_SYS_ADMIN))
 320                return -EPERM;
 321        err = strict_strtoul(buf, 10, &val);
 322        if (err || val == 0)
 323                return -EINVAL;
 324        selfballoon_interval = val;
 325        return count;
 326}
 327
 328static DEVICE_ATTR(selfballoon_interval, S_IRUGO | S_IWUSR,
 329                   show_selfballoon_interval, store_selfballoon_interval);
 330
 331SELFBALLOON_SHOW(selfballoon_downhys, "%d\n", selfballoon_downhysteresis);
 332
 333static ssize_t store_selfballoon_downhys(struct device *dev,
 334                                         struct device_attribute *attr,
 335                                         const char *buf,
 336                                         size_t count)
 337{
 338        unsigned long val;
 339        int err;
 340
 341        if (!capable(CAP_SYS_ADMIN))
 342                return -EPERM;
 343        err = strict_strtoul(buf, 10, &val);
 344        if (err || val == 0)
 345                return -EINVAL;
 346        selfballoon_downhysteresis = val;
 347        return count;
 348}
 349
 350static DEVICE_ATTR(selfballoon_downhysteresis, S_IRUGO | S_IWUSR,
 351                   show_selfballoon_downhys, store_selfballoon_downhys);
 352
 353
 354SELFBALLOON_SHOW(selfballoon_uphys, "%d\n", selfballoon_uphysteresis);
 355
 356static ssize_t store_selfballoon_uphys(struct device *dev,
 357                                       struct device_attribute *attr,
 358                                       const char *buf,
 359                                       size_t count)
 360{
 361        unsigned long val;
 362        int err;
 363
 364        if (!capable(CAP_SYS_ADMIN))
 365                return -EPERM;
 366        err = strict_strtoul(buf, 10, &val);
 367        if (err || val == 0)
 368                return -EINVAL;
 369        selfballoon_uphysteresis = val;
 370        return count;
 371}
 372
 373static DEVICE_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR,
 374                   show_selfballoon_uphys, store_selfballoon_uphys);
 375
 376SELFBALLOON_SHOW(selfballoon_min_usable_mb, "%d\n",
 377                                selfballoon_min_usable_mb);
 378
 379static ssize_t store_selfballoon_min_usable_mb(struct device *dev,
 380                                               struct device_attribute *attr,
 381                                               const char *buf,
 382                                               size_t count)
 383{
 384        unsigned long val;
 385        int err;
 386
 387        if (!capable(CAP_SYS_ADMIN))
 388                return -EPERM;
 389        err = strict_strtoul(buf, 10, &val);
 390        if (err || val == 0)
 391                return -EINVAL;
 392        selfballoon_min_usable_mb = val;
 393        return count;
 394}
 395
 396static DEVICE_ATTR(selfballoon_min_usable_mb, S_IRUGO | S_IWUSR,
 397                   show_selfballoon_min_usable_mb,
 398                   store_selfballoon_min_usable_mb);
 399
 400
 401#ifdef CONFIG_FRONTSWAP
 402SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking);
 403
 404static ssize_t store_frontswap_selfshrinking(struct device *dev,
 405                                             struct device_attribute *attr,
 406                                             const char *buf,
 407                                             size_t count)
 408{
 409        bool was_enabled = frontswap_selfshrinking;
 410        unsigned long tmp;
 411        int err;
 412
 413        if (!capable(CAP_SYS_ADMIN))
 414                return -EPERM;
 415        err = strict_strtoul(buf, 10, &tmp);
 416        if (err || ((tmp != 0) && (tmp != 1)))
 417                return -EINVAL;
 418        frontswap_selfshrinking = !!tmp;
 419        if (!was_enabled && !xen_selfballooning_enabled &&
 420             frontswap_selfshrinking)
 421                schedule_delayed_work(&selfballoon_worker,
 422                        selfballoon_interval * HZ);
 423
 424        return count;
 425}
 426
 427static DEVICE_ATTR(frontswap_selfshrinking, S_IRUGO | S_IWUSR,
 428                   show_frontswap_selfshrinking, store_frontswap_selfshrinking);
 429
 430SELFBALLOON_SHOW(frontswap_inertia, "%d\n", frontswap_inertia);
 431
 432static ssize_t store_frontswap_inertia(struct device *dev,
 433                                       struct device_attribute *attr,
 434                                       const char *buf,
 435                                       size_t count)
 436{
 437        unsigned long val;
 438        int err;
 439
 440        if (!capable(CAP_SYS_ADMIN))
 441                return -EPERM;
 442        err = strict_strtoul(buf, 10, &val);
 443        if (err || val == 0)
 444                return -EINVAL;
 445        frontswap_inertia = val;
 446        frontswap_inertia_counter = val;
 447        return count;
 448}
 449
 450static DEVICE_ATTR(frontswap_inertia, S_IRUGO | S_IWUSR,
 451                   show_frontswap_inertia, store_frontswap_inertia);
 452
 453SELFBALLOON_SHOW(frontswap_hysteresis, "%d\n", frontswap_hysteresis);
 454
 455static ssize_t store_frontswap_hysteresis(struct device *dev,
 456                                          struct device_attribute *attr,
 457                                          const char *buf,
 458                                          size_t count)
 459{
 460        unsigned long val;
 461        int err;
 462
 463        if (!capable(CAP_SYS_ADMIN))
 464                return -EPERM;
 465        err = strict_strtoul(buf, 10, &val);
 466        if (err || val == 0)
 467                return -EINVAL;
 468        frontswap_hysteresis = val;
 469        return count;
 470}
 471
 472static DEVICE_ATTR(frontswap_hysteresis, S_IRUGO | S_IWUSR,
 473                   show_frontswap_hysteresis, store_frontswap_hysteresis);
 474
 475#endif /* CONFIG_FRONTSWAP */
 476
 477static struct attribute *selfballoon_attrs[] = {
 478        &dev_attr_selfballooning.attr,
 479        &dev_attr_selfballoon_interval.attr,
 480        &dev_attr_selfballoon_downhysteresis.attr,
 481        &dev_attr_selfballoon_uphysteresis.attr,
 482        &dev_attr_selfballoon_min_usable_mb.attr,
 483#ifdef CONFIG_FRONTSWAP
 484        &dev_attr_frontswap_selfshrinking.attr,
 485        &dev_attr_frontswap_hysteresis.attr,
 486        &dev_attr_frontswap_inertia.attr,
 487#endif
 488        NULL
 489};
 490
 491static struct attribute_group selfballoon_group = {
 492        .name = "selfballoon",
 493        .attrs = selfballoon_attrs
 494};
 495#endif
 496
 497int register_xen_selfballooning(struct device *dev)
 498{
 499        int error = -1;
 500
 501#ifdef CONFIG_SYSFS
 502        error = sysfs_create_group(&dev->kobj, &selfballoon_group);
 503#endif
 504        return error;
 505}
 506EXPORT_SYMBOL(register_xen_selfballooning);
 507
 508static int __init xen_selfballoon_init(void)
 509{
 510        bool enable = false;
 511
 512        if (!xen_domain())
 513                return -ENODEV;
 514
 515        if (xen_initial_domain()) {
 516                pr_info("xen/balloon: Xen selfballooning driver "
 517                                "disabled for domain0.\n");
 518                return -ENODEV;
 519        }
 520
 521        xen_selfballooning_enabled = tmem_enabled && use_selfballooning;
 522        if (xen_selfballooning_enabled) {
 523                pr_info("xen/balloon: Initializing Xen "
 524                                        "selfballooning driver.\n");
 525                enable = true;
 526        }
 527#ifdef CONFIG_FRONTSWAP
 528        frontswap_selfshrinking = tmem_enabled && use_frontswap_selfshrink;
 529        if (frontswap_selfshrinking) {
 530                pr_info("xen/balloon: Initializing frontswap "
 531                                        "selfshrinking driver.\n");
 532                enable = true;
 533        }
 534#endif
 535        if (!enable)
 536                return -ENODEV;
 537
 538        schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ);
 539
 540        return 0;
 541}
 542
 543subsys_initcall(xen_selfballoon_init);
 544
 545MODULE_LICENSE("GPL");
 546