linux/mm/vmpressure.c
/*
 * Linux VM pressure
 *
 * Copyright 2012 Linaro Ltd.
 *                Anton Vorontsov <anton.vorontsov@linaro.org>
 *
 * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
 * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/eventfd.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/printk.h>
#include <linux/vmpressure.h>

/*
 * The window size (vmpressure_win) is the number of scanned pages before
 * we try to analyze the scanned/reclaimed ratio. So the window is used as
 * a rate-limit tunable for the "low" level notification, and also for
 * averaging the ratio for the medium/critical levels. Using a small window
 * size can cause a lot of false positives, but too big a window size will
 * delay the notifications.
 *
 * As the vmscan reclaimer logic works with chunks which are multiples of
 * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
 *
 * TODO: Make the window size depend on machine size, as we do for vmstat
 * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
 */
static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;

/*
 * These thresholds are used when we account memory pressure through the
 * scanned/reclaimed ratio. The current values were chosen empirically. In
 * essence, they are percents: the higher the value, the higher the number
 * of unsuccessful reclaims there were.
 */
static const unsigned int vmpressure_level_med = 60;
static const unsigned int vmpressure_level_critical = 95;

/*
 * When there are too few pages left to scan, vmpressure() may miss the
 * critical pressure as the number of pages will be less than the
 * "window size". However, in that case the vmscan priority will rise
 * fast as the reclaimer will try to scan the LRUs more deeply.
 *
 * The vmscan logic considers these special priorities:
 *
 * prio == DEF_PRIORITY (12): reclaimer starts with that value
 * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
 * prio == 0                : close to OOM, kernel scans every page in an lru
 *
 * Any value in this range is acceptable for this tunable (i.e. from 12 to
 * 0). The current value of vmpressure_level_critical_prio was chosen
 * empirically, but the number, in essence, means that we consider the
 * level critical when the scanning depth is ~10% of the lru size (vmscan
 * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
 * eighth); ilog2(100 / 10) evaluates to 3.
 */
static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);

static struct vmpressure *work_to_vmpressure(struct work_struct *work)
{
	return container_of(work, struct vmpressure, work);
}

static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
{
	struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	memcg = parent_mem_cgroup(memcg);
	if (!memcg)
		return NULL;
	return memcg_to_vmpressure(memcg);
}

enum vmpressure_levels {
	VMPRESSURE_LOW = 0,
	VMPRESSURE_MEDIUM,
	VMPRESSURE_CRITICAL,
	VMPRESSURE_NUM_LEVELS,
};

enum vmpressure_modes {
	VMPRESSURE_NO_PASSTHROUGH = 0,
	VMPRESSURE_HIERARCHY,
	VMPRESSURE_LOCAL,
	VMPRESSURE_NUM_MODES,
};

static const char * const vmpressure_str_levels[] = {
	[VMPRESSURE_LOW] = "low",
	[VMPRESSURE_MEDIUM] = "medium",
	[VMPRESSURE_CRITICAL] = "critical",
};

static const char * const vmpressure_str_modes[] = {
	[VMPRESSURE_NO_PASSTHROUGH] = "default",
	[VMPRESSURE_HIERARCHY] = "hierarchy",
	[VMPRESSURE_LOCAL] = "local",
};

static enum vmpressure_levels vmpressure_level(unsigned long pressure)
{
	if (pressure >= vmpressure_level_critical)
		return VMPRESSURE_CRITICAL;
	else if (pressure >= vmpressure_level_med)
		return VMPRESSURE_MEDIUM;
	return VMPRESSURE_LOW;
}

static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
						    unsigned long reclaimed)
{
	unsigned long scale = scanned + reclaimed;
	unsigned long pressure = 0;

	/*
	 * reclaimed can be greater than scanned for things such as reclaimed
	 * slab pages. shrink_node() just adds reclaimed pages without a
	 * related increment to scanned pages.
	 */
	if (reclaimed >= scanned)
		goto out;
	/*
	 * We calculate the ratio (in percent) of how many pages were
	 * scanned vs. reclaimed in a given time frame (window). Note that
	 * the time is in the VM reclaimer's "ticks", i.e. number of pages
	 * scanned. This makes it possible to set the desired reaction time
	 * and serves as a rate limit.
	 */
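	/*
	 * Worked example: scanned = 512, reclaimed = 128 gives scale = 640,
	 * pressure = 640 - 128 * 640 / 512 = 480, and 480 * 100 / 640 = 75,
	 * which maps to the "medium" level (>= 60 but < 95).
	 */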
	pressure = scale - (reclaimed * scale / scanned);
	pressure = pressure * 100 / scale;

out:
	pr_debug("%s: %3lu  (s: %lu  r: %lu)\n", __func__, pressure,
		 scanned, reclaimed);

	return vmpressure_level(pressure);
}

struct vmpressure_event {
	struct eventfd_ctx *efd;
	enum vmpressure_levels level;
	enum vmpressure_modes mode;
	struct list_head node;
};

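/*
 * Deliver a pressure event to the registered listeners: "local" listeners
 * are skipped when the pressure originated in a descendant cgroup
 * (@ancestor), and "default" (no-passthrough) listeners are skipped once a
 * descendant has already been signalled (@signalled). Returns true if at
 * least one eventfd was signalled.
 */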
static bool vmpressure_event(struct vmpressure *vmpr,
			     const enum vmpressure_levels level,
			     bool ancestor, bool signalled)
{
	struct vmpressure_event *ev;
	bool ret = false;

	mutex_lock(&vmpr->events_lock);
	list_for_each_entry(ev, &vmpr->events, node) {
		if (ancestor && ev->mode == VMPRESSURE_LOCAL)
			continue;
		if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH)
			continue;
		if (level < ev->level)
			continue;
		eventfd_signal(ev->efd, 1);
		ret = true;
	}
	mutex_unlock(&vmpr->events_lock);

	return ret;
}

static void vmpressure_work_fn(struct work_struct *work)
{
	struct vmpressure *vmpr = work_to_vmpressure(work);
	unsigned long scanned;
	unsigned long reclaimed;
	enum vmpressure_levels level;
	bool ancestor = false;
	bool signalled = false;

	spin_lock(&vmpr->sr_lock);
	/*
	 * Several contexts might be calling vmpressure(), so it is
	 * possible that the work was rescheduled again before the old
	 * work context cleared the counters. In that case we will run
	 * just after the old work returns, but then scanned might be
	 * zero here. That is fine: we do not care whether
	 * vmpr->tree_reclaimed is perfectly in sync.
	 */
	scanned = vmpr->tree_scanned;
	if (!scanned) {
		spin_unlock(&vmpr->sr_lock);
		return;
	}

	reclaimed = vmpr->tree_reclaimed;
	vmpr->tree_scanned = 0;
	vmpr->tree_reclaimed = 0;
	spin_unlock(&vmpr->sr_lock);

	level = vmpressure_calc_level(scanned, reclaimed);

 215        do {
 216                if (vmpressure_event(vmpr, level, ancestor, signalled))
 217                        signalled = true;
 218                ancestor = true;
 219        } while ((vmpr = vmpressure_parent(vmpr)));
 220}
 221
/**
 * vmpressure() - Account memory pressure through scanned/reclaimed ratio
 * @gfp:	reclaimer's gfp mask
 * @memcg:	cgroup memory controller handle
 * @tree:	legacy subtree mode
 * @scanned:	number of pages scanned
 * @reclaimed:	number of pages reclaimed
 *
 * This function should be called from the vmscan reclaim path to account
 * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
 * pressure index is then further refined and averaged over time.
 *
 * If @tree is set, vmpressure is in traditional userspace reporting
 * mode: @memcg is considered the pressure root and userspace is
 * notified of the entire subtree's reclaim efficiency.
 *
 * If @tree is not set, reclaim efficiency is recorded for @memcg, and
 * only in-kernel users are notified.
 *
 * This function does not return any value.
 */
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
		unsigned long scanned, unsigned long reclaimed)
{
	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);

	/*
	 * Here we only want to account pressure that userland is able to
	 * help us with. For example, suppose that the DMA zone is under
	 * pressure; if we notify userland about that kind of pressure,
	 * then it will be mostly a waste as it will trigger unnecessary
	 * freeing of memory by userland (since userland is more likely to
	 * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
	 * is why we include only movable, highmem and FS/IO pages.
	 * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
	 * we account it too.
	 */
	if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
		return;

	/*
	 * If we got here with no pages scanned, then the reclaimer was
	 * unable to find any shrinkable LRUs at the current scanning
	 * depth. That does not mean we should report critical pressure
	 * yet: if the scanning priority (scanning depth) goes too high
	 * (deep), we will be notified through vmpressure_prio(). So far,
	 * keep calm.
	 */
	if (!scanned)
		return;

	if (tree) {
		spin_lock(&vmpr->sr_lock);
		scanned = vmpr->tree_scanned += scanned;
		vmpr->tree_reclaimed += reclaimed;
		spin_unlock(&vmpr->sr_lock);

		if (scanned < vmpressure_win)
			return;
		schedule_work(&vmpr->work);
	} else {
		enum vmpressure_levels level;

		/* For now, no users for root-level efficiency */
		if (!memcg || memcg == root_mem_cgroup)
			return;

		spin_lock(&vmpr->sr_lock);
		scanned = vmpr->scanned += scanned;
		reclaimed = vmpr->reclaimed += reclaimed;
		if (scanned < vmpressure_win) {
			spin_unlock(&vmpr->sr_lock);
			return;
		}
		vmpr->scanned = vmpr->reclaimed = 0;
		spin_unlock(&vmpr->sr_lock);

		level = vmpressure_calc_level(scanned, reclaimed);

		if (level > VMPRESSURE_LOW) {
			/*
			 * Let the socket buffer allocator know that
			 * we are having trouble reclaiming LRU pages.
			 *
			 * For hysteresis, keep the pressure state
			 * asserted for a second, during which subsequent
			 * pressure events can occur.
			 */
			memcg->socket_pressure = jiffies + HZ;
		}
	}
}

/**
 * vmpressure_prio() - Account memory pressure through reclaimer priority level
 * @gfp:	reclaimer's gfp mask
 * @memcg:	cgroup memory controller handle
 * @prio:	reclaimer's priority
 *
 * This function should be called from the reclaim path every time the
 * vmscan reclaim priority (scanning depth) changes.
 *
 * This function does not return any value.
 */
void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
{
	/*
	 * We only use prio for accounting the critical level. For more
	 * info see the comment for vmpressure_level_critical_prio above.
	 */
	if (prio > vmpressure_level_critical_prio)
		return;

	/*
	 * OK, the prio is below the threshold, so update the vmpressure
	 * information before the reclaimer dives into a long, deep scan
	 * of the LRUs. Passing scanned = vmpressure_win and reclaimed = 0
	 * to vmpressure() basically means that we signal the 'critical'
	 * level.
	 */
	vmpressure(gfp, memcg, true, vmpressure_win, 0);
}

static enum vmpressure_levels str_to_level(const char *arg)
{
	enum vmpressure_levels level;

	for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++)
		if (!strcmp(vmpressure_str_levels[level], arg))
			return level;
	return -1;
}

static enum vmpressure_modes str_to_mode(const char *arg)
{
	enum vmpressure_modes mode;

	for (mode = 0; mode < VMPRESSURE_NUM_MODES; mode++)
		if (!strcmp(vmpressure_str_modes[mode], arg))
			return mode;
	return -1;
}

#define MAX_VMPRESSURE_ARGS_LEN	(strlen("critical") + strlen("hierarchy") + 2)

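/*
 * Userspace usage sketch (cgroup v1, as described in the memcg
 * documentation): create an eventfd, open the group's
 * memory.pressure_level file, and write
 * "<eventfd> <pressure_level fd> <level>[,<mode>]" (e.g. "low,hierarchy")
 * to cgroup.event_control. Notifications are then delivered as eventfd
 * reads.
 */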
/**
 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
 * @memcg:	memcg that is interested in vmpressure notifications
 * @eventfd:	eventfd context to link notifications with
 * @args:	event arguments (pressure level threshold, optional mode)
 *
 * This function associates eventfd context with the vmpressure
 * infrastructure, so that the notifications will be delivered to the
 * @eventfd. The @args parameter is a comma-delimited string that denotes a
 * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium",
 * or "critical") and an optional mode (one of vmpressure_str_modes, i.e.
 * "hierarchy" or "local").
 *
 * To be used as memcg event method.
 */
int vmpressure_register_event(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args)
{
	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
	struct vmpressure_event *ev;
	enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
	enum vmpressure_levels level = -1;
	char *spec, *spec_orig;
	char *token;
	int ret = 0;

	spec_orig = spec = kzalloc(MAX_VMPRESSURE_ARGS_LEN + 1, GFP_KERNEL);
	if (!spec) {
		ret = -ENOMEM;
		goto out;
	}
	strncpy(spec, args, MAX_VMPRESSURE_ARGS_LEN);

	/* Find required level */
	token = strsep(&spec, ",");
	level = str_to_level(token);
	if (level == -1) {
		ret = -EINVAL;
		goto out;
	}

	/* Find optional mode */
	token = strsep(&spec, ",");
	if (token) {
		mode = str_to_mode(token);
		if (mode == -1) {
			ret = -EINVAL;
			goto out;
		}
	}

	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
	if (!ev) {
		ret = -ENOMEM;
		goto out;
	}

	ev->efd = eventfd;
	ev->level = level;
	ev->mode = mode;

	mutex_lock(&vmpr->events_lock);
	list_add(&ev->node, &vmpr->events);
	mutex_unlock(&vmpr->events_lock);
out:
	kfree(spec_orig);
	return ret;
}

/**
 * vmpressure_unregister_event() - Unbind eventfd from vmpressure
 * @memcg:	memcg handle
 * @eventfd:	eventfd context that was used to link vmpressure with the @memcg
 *
 * This function does internal manipulations to detach the @eventfd from
 * the vmpressure notifications, and then frees internal resources
 * associated with the @eventfd (but the @eventfd itself is not freed).
 *
 * To be used as memcg event method.
 */
void vmpressure_unregister_event(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd)
{
	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
	struct vmpressure_event *ev;

	mutex_lock(&vmpr->events_lock);
	list_for_each_entry(ev, &vmpr->events, node) {
		if (ev->efd != eventfd)
			continue;
		list_del(&ev->node);
		kfree(ev);
		break;
	}
	mutex_unlock(&vmpr->events_lock);
}

/**
 * vmpressure_init() - Initialize vmpressure control structure
 * @vmpr:	Structure to be initialized
 *
 * This function should be called on every allocated vmpressure structure
 * before any usage.
 */
void vmpressure_init(struct vmpressure *vmpr)
{
	spin_lock_init(&vmpr->sr_lock);
	mutex_init(&vmpr->events_lock);
	INIT_LIST_HEAD(&vmpr->events);
	INIT_WORK(&vmpr->work, vmpressure_work_fn);
}

/**
 * vmpressure_cleanup() - Shut down vmpressure control structure
 * @vmpr:	Structure to be cleaned up
 *
 * This function should be called before the structure in which it is
 * embedded is cleaned up.
 */
void vmpressure_cleanup(struct vmpressure *vmpr)
{
	/*
	 * Make sure there is no pending work before the eventfd
	 * infrastructure goes away.
	 */
	flush_work(&vmpr->work);
}