linux/kernel/sched/fair.c
<<
>>
Prefs
   1/*
   2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   3 *
   4 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   5 *
   6 *  Interactivity improvements by Mike Galbraith
   7 *  (C) 2007 Mike Galbraith <efault@gmx.de>
   8 *
   9 *  Various enhancements by Dmitry Adamushko.
  10 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  11 *
  12 *  Group scheduling enhancements by Srivatsa Vaddagiri
  13 *  Copyright IBM Corporation, 2007
  14 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  15 *
  16 *  Scaled math optimizations by Thomas Gleixner
  17 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  18 *
  19 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  20 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  21 */
  22
  23#include <linux/latencytop.h>
  24#include <linux/sched.h>
  25#include <linux/cpumask.h>
  26#include <linux/cpuidle.h>
  27#include <linux/slab.h>
  28#include <linux/profile.h>
  29#include <linux/interrupt.h>
  30#include <linux/mempolicy.h>
  31#include <linux/migrate.h>
  32#include <linux/task_work.h>
  33
  34#include <trace/events/sched.h>
  35
  36#include "sched.h"
  37
  38/*
  39 * Targeted preemption latency for CPU-bound tasks:
  40 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  41 *
  42 * NOTE: this latency value is not the same as the concept of
  43 * 'timeslice length' - timeslices in CFS are of variable length
  44 * and have no persistent notion like in traditional, time-slice
  45 * based scheduling concepts.
  46 *
  47 * (to see the precise effective timeslice length of your workload,
  48 *  run vmstat and monitor the context-switches (cs) field)
  49 */
  50unsigned int sysctl_sched_latency = 6000000ULL;
  51unsigned int normalized_sysctl_sched_latency = 6000000ULL;
  52
  53/*
  54 * The initial- and re-scaling of tunables is configurable
  55 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
  56 *
  57 * Options are:
  58 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
  59 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
  60 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
  61 */
  62enum sched_tunable_scaling sysctl_sched_tunable_scaling
  63        = SCHED_TUNABLESCALING_LOG;
  64
  65/*
  66 * Minimal preemption granularity for CPU-bound tasks:
  67 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  68 */
  69unsigned int sysctl_sched_min_granularity = 750000ULL;
  70unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
  71
  72/*
  73 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
  74 */
  75static unsigned int sched_nr_latency = 8;
  76
  77/*
  78 * After fork, child runs first. If set to 0 (default) then
  79 * parent will (try to) run first.
  80 */
  81unsigned int sysctl_sched_child_runs_first __read_mostly;
  82
  83/*
  84 * SCHED_OTHER wake-up granularity.
  85 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  86 *
  87 * This option delays the preemption effects of decoupled workloads
  88 * and reduces their over-scheduling. Synchronous workloads will still
  89 * have immediate wakeup/sleep latencies.
  90 */
  91unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
  92unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
  93
  94const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
  95
  96/*
  97 * The exponential sliding  window over which load is averaged for shares
  98 * distribution.
  99 * (default: 10msec)
 100 */
 101unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 102
 103#ifdef CONFIG_CFS_BANDWIDTH
 104/*
 105 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 106 * each time a cfs_rq requests quota.
 107 *
 108 * Note: in the case that the slice exceeds the runtime remaining (either due
 109 * to consumption or the quota being specified to be smaller than the slice)
 110 * we will always only issue the remaining available time.
 111 *
 112 * default: 5 msec, units: microseconds
 113  */
 114unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 115#endif
 116
 117static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 118{
 119        lw->weight += inc;
 120        lw->inv_weight = 0;
 121}
 122
 123static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 124{
 125        lw->weight -= dec;
 126        lw->inv_weight = 0;
 127}
 128
 129static inline void update_load_set(struct load_weight *lw, unsigned long w)
 130{
 131        lw->weight = w;
 132        lw->inv_weight = 0;
 133}
 134
 135/*
 136 * Increase the granularity value when there are more CPUs,
 137 * because with more CPUs the 'effective latency' as visible
 138 * to users decreases. But the relationship is not linear,
 139 * so pick a second-best guess by going with the log2 of the
 140 * number of CPUs.
 141 *
 142 * This idea comes from the SD scheduler of Con Kolivas:
 143 */
 144static unsigned int get_update_sysctl_factor(void)
 145{
 146        unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
 147        unsigned int factor;
 148
 149        switch (sysctl_sched_tunable_scaling) {
 150        case SCHED_TUNABLESCALING_NONE:
 151                factor = 1;
 152                break;
 153        case SCHED_TUNABLESCALING_LINEAR:
 154                factor = cpus;
 155                break;
 156        case SCHED_TUNABLESCALING_LOG:
 157        default:
 158                factor = 1 + ilog2(cpus);
 159                break;
 160        }
 161
 162        return factor;
 163}
 164
 165static void update_sysctl(void)
 166{
 167        unsigned int factor = get_update_sysctl_factor();
 168
 169#define SET_SYSCTL(name) \
 170        (sysctl_##name = (factor) * normalized_sysctl_##name)
 171        SET_SYSCTL(sched_min_granularity);
 172        SET_SYSCTL(sched_latency);
 173        SET_SYSCTL(sched_wakeup_granularity);
 174#undef SET_SYSCTL
 175}
 176
 177void sched_init_granularity(void)
 178{
 179        update_sysctl();
 180}
 181
 182#define WMULT_CONST     (~0U)
 183#define WMULT_SHIFT     32
 184
 185static void __update_inv_weight(struct load_weight *lw)
 186{
 187        unsigned long w;
 188
 189        if (likely(lw->inv_weight))
 190                return;
 191
 192        w = scale_load_down(lw->weight);
 193
 194        if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
 195                lw->inv_weight = 1;
 196        else if (unlikely(!w))
 197                lw->inv_weight = WMULT_CONST;
 198        else
 199                lw->inv_weight = WMULT_CONST / w;
 200}
 201
 202/*
 203 * delta_exec * weight / lw.weight
 204 *   OR
 205 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 206 *
 207 * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
 208 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 209 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 210 *
 211 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
 212 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 213 */
 214static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 215{
 216        u64 fact = scale_load_down(weight);
 217        int shift = WMULT_SHIFT;
 218
 219        __update_inv_weight(lw);
 220
 221        if (unlikely(fact >> 32)) {
 222                while (fact >> 32) {
 223                        fact >>= 1;
 224                        shift--;
 225                }
 226        }
 227
 228        /* hint to use a 32x32->64 mul */
 229        fact = (u64)(u32)fact * lw->inv_weight;
 230
 231        while (fact >> 32) {
 232                fact >>= 1;
 233                shift--;
 234        }
 235
 236        return mul_u64_u32_shr(delta_exec, fact, shift);
 237}
 238
 239
 240const struct sched_class fair_sched_class;
 241
 242/**************************************************************
 243 * CFS operations on generic schedulable entities:
 244 */
 245
 246#ifdef CONFIG_FAIR_GROUP_SCHED
 247
 248/* cpu runqueue to which this cfs_rq is attached */
 249static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 250{
 251        return cfs_rq->rq;
 252}
 253
 254/* An entity is a task if it doesn't "own" a runqueue */
 255#define entity_is_task(se)      (!se->my_q)
 256
 257static inline struct task_struct *task_of(struct sched_entity *se)
 258{
 259#ifdef CONFIG_SCHED_DEBUG
 260        WARN_ON_ONCE(!entity_is_task(se));
 261#endif
 262        return container_of(se, struct task_struct, se);
 263}
 264
 265/* Walk up scheduling entities hierarchy */
 266#define for_each_sched_entity(se) \
 267                for (; se; se = se->parent)
 268
 269static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 270{
 271        return p->se.cfs_rq;
 272}
 273
 274/* runqueue on which this entity is (to be) queued */
 275static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 276{
 277        return se->cfs_rq;
 278}
 279
 280/* runqueue "owned" by this group */
 281static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 282{
 283        return grp->my_q;
 284}
 285
 286static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 287{
 288        if (!cfs_rq->on_list) {
 289                /*
 290                 * Ensure we either appear before our parent (if already
 291                 * enqueued) or force our parent to appear after us when it is
 292                 * enqueued.  The fact that we always enqueue bottom-up
 293                 * reduces this to two cases.
 294                 */
 295                if (cfs_rq->tg->parent &&
 296                    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
 297                        list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
 298                                &rq_of(cfs_rq)->leaf_cfs_rq_list);
 299                } else {
 300                        list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 301                                &rq_of(cfs_rq)->leaf_cfs_rq_list);
 302                }
 303
 304                cfs_rq->on_list = 1;
 305        }
 306}
 307
 308static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 309{
 310        if (cfs_rq->on_list) {
 311                list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 312                cfs_rq->on_list = 0;
 313        }
 314}
 315
 316/* Iterate thr' all leaf cfs_rq's on a runqueue */
 317#define for_each_leaf_cfs_rq(rq, cfs_rq) \
 318        list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 319
 320/* Do the two (enqueued) entities belong to the same group ? */
 321static inline struct cfs_rq *
 322is_same_group(struct sched_entity *se, struct sched_entity *pse)
 323{
 324        if (se->cfs_rq == pse->cfs_rq)
 325                return se->cfs_rq;
 326
 327        return NULL;
 328}
 329
 330static inline struct sched_entity *parent_entity(struct sched_entity *se)
 331{
 332        return se->parent;
 333}
 334
 335static void
 336find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 337{
 338        int se_depth, pse_depth;
 339
 340        /*
 341         * preemption test can be made between sibling entities who are in the
 342         * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
 343         * both tasks until we find their ancestors who are siblings of common
 344         * parent.
 345         */
 346
 347        /* First walk up until both entities are at same depth */
 348        se_depth = (*se)->depth;
 349        pse_depth = (*pse)->depth;
 350
 351        while (se_depth > pse_depth) {
 352                se_depth--;
 353                *se = parent_entity(*se);
 354        }
 355
 356        while (pse_depth > se_depth) {
 357                pse_depth--;
 358                *pse = parent_entity(*pse);
 359        }
 360
 361        while (!is_same_group(*se, *pse)) {
 362                *se = parent_entity(*se);
 363                *pse = parent_entity(*pse);
 364        }
 365}
 366
 367#else   /* !CONFIG_FAIR_GROUP_SCHED */
 368
 369static inline struct task_struct *task_of(struct sched_entity *se)
 370{
 371        return container_of(se, struct task_struct, se);
 372}
 373
 374static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 375{
 376        return container_of(cfs_rq, struct rq, cfs);
 377}
 378
 379#define entity_is_task(se)      1
 380
 381#define for_each_sched_entity(se) \
 382                for (; se; se = NULL)
 383
 384static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 385{
 386        return &task_rq(p)->cfs;
 387}
 388
 389static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 390{
 391        struct task_struct *p = task_of(se);
 392        struct rq *rq = task_rq(p);
 393
 394        return &rq->cfs;
 395}
 396
 397/* runqueue "owned" by this group */
 398static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 399{
 400        return NULL;
 401}
 402
 403static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 404{
 405}
 406
 407static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 408{
 409}
 410
 411#define for_each_leaf_cfs_rq(rq, cfs_rq) \
 412                for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 413
 414static inline struct sched_entity *parent_entity(struct sched_entity *se)
 415{
 416        return NULL;
 417}
 418
 419static inline void
 420find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 421{
 422}
 423
 424#endif  /* CONFIG_FAIR_GROUP_SCHED */
 425
 426static __always_inline
 427void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 428
 429/**************************************************************
 430 * Scheduling class tree data structure manipulation methods:
 431 */
 432
 433static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
 434{
 435        s64 delta = (s64)(vruntime - max_vruntime);
 436        if (delta > 0)
 437                max_vruntime = vruntime;
 438
 439        return max_vruntime;
 440}
 441
 442static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 443{
 444        s64 delta = (s64)(vruntime - min_vruntime);
 445        if (delta < 0)
 446                min_vruntime = vruntime;
 447
 448        return min_vruntime;
 449}
 450
 451static inline int entity_before(struct sched_entity *a,
 452                                struct sched_entity *b)
 453{
 454        return (s64)(a->vruntime - b->vruntime) < 0;
 455}
 456
 457static void update_min_vruntime(struct cfs_rq *cfs_rq)
 458{
 459        u64 vruntime = cfs_rq->min_vruntime;
 460
 461        if (cfs_rq->curr)
 462                vruntime = cfs_rq->curr->vruntime;
 463
 464        if (cfs_rq->rb_leftmost) {
 465                struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
 466                                                   struct sched_entity,
 467                                                   run_node);
 468
 469                if (!cfs_rq->curr)
 470                        vruntime = se->vruntime;
 471                else
 472                        vruntime = min_vruntime(vruntime, se->vruntime);
 473        }
 474
 475        /* ensure we never gain time by being placed backwards. */
 476        cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
 477#ifndef CONFIG_64BIT
 478        smp_wmb();
 479        cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 480#endif
 481}
 482
 483/*
 484 * Enqueue an entity into the rb-tree:
 485 */
 486static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 487{
 488        struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 489        struct rb_node *parent = NULL;
 490        struct sched_entity *entry;
 491        int leftmost = 1;
 492
 493        /*
 494         * Find the right place in the rbtree:
 495         */
 496        while (*link) {
 497                parent = *link;
 498                entry = rb_entry(parent, struct sched_entity, run_node);
 499                /*
 500                 * We dont care about collisions. Nodes with
 501                 * the same key stay together.
 502                 */
 503                if (entity_before(se, entry)) {
 504                        link = &parent->rb_left;
 505                } else {
 506                        link = &parent->rb_right;
 507                        leftmost = 0;
 508                }
 509        }
 510
 511        /*
 512         * Maintain a cache of leftmost tree entries (it is frequently
 513         * used):
 514         */
 515        if (leftmost)
 516                cfs_rq->rb_leftmost = &se->run_node;
 517
 518        rb_link_node(&se->run_node, parent, link);
 519        rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 520}
 521
 522static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 523{
 524        if (cfs_rq->rb_leftmost == &se->run_node) {
 525                struct rb_node *next_node;
 526
 527                next_node = rb_next(&se->run_node);
 528                cfs_rq->rb_leftmost = next_node;
 529        }
 530
 531        rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 532}
 533
 534struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 535{
 536        struct rb_node *left = cfs_rq->rb_leftmost;
 537
 538        if (!left)
 539                return NULL;
 540
 541        return rb_entry(left, struct sched_entity, run_node);
 542}
 543
 544static struct sched_entity *__pick_next_entity(struct sched_entity *se)
 545{
 546        struct rb_node *next = rb_next(&se->run_node);
 547
 548        if (!next)
 549                return NULL;
 550
 551        return rb_entry(next, struct sched_entity, run_node);
 552}
 553
 554#ifdef CONFIG_SCHED_DEBUG
 555struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 556{
 557        struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
 558
 559        if (!last)
 560                return NULL;
 561
 562        return rb_entry(last, struct sched_entity, run_node);
 563}
 564
 565/**************************************************************
 566 * Scheduling class statistics methods:
 567 */
 568
 569int sched_proc_update_handler(struct ctl_table *table, int write,
 570                void __user *buffer, size_t *lenp,
 571                loff_t *ppos)
 572{
 573        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 574        unsigned int factor = get_update_sysctl_factor();
 575
 576        if (ret || !write)
 577                return ret;
 578
 579        sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 580                                        sysctl_sched_min_granularity);
 581
 582#define WRT_SYSCTL(name) \
 583        (normalized_sysctl_##name = sysctl_##name / (factor))
 584        WRT_SYSCTL(sched_min_granularity);
 585        WRT_SYSCTL(sched_latency);
 586        WRT_SYSCTL(sched_wakeup_granularity);
 587#undef WRT_SYSCTL
 588
 589        return 0;
 590}
 591#endif
 592
 593/*
 594 * delta /= w
 595 */
 596static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 597{
 598        if (unlikely(se->load.weight != NICE_0_LOAD))
 599                delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 600
 601        return delta;
 602}
 603
 604/*
 605 * The idea is to set a period in which each task runs once.
 606 *
 607 * When there are too many tasks (sched_nr_latency) we have to stretch
 608 * this period because otherwise the slices get too small.
 609 *
 610 * p = (nr <= nl) ? l : l*nr/nl
 611 */
 612static u64 __sched_period(unsigned long nr_running)
 613{
 614        if (unlikely(nr_running > sched_nr_latency))
 615                return nr_running * sysctl_sched_min_granularity;
 616        else
 617                return sysctl_sched_latency;
 618}
 619
 620/*
 621 * We calculate the wall-time slice from the period by taking a part
 622 * proportional to the weight.
 623 *
 624 * s = p*P[w/rw]
 625 */
 626static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 627{
 628        u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
 629
 630        for_each_sched_entity(se) {
 631                struct load_weight *load;
 632                struct load_weight lw;
 633
 634                cfs_rq = cfs_rq_of(se);
 635                load = &cfs_rq->load;
 636
 637                if (unlikely(!se->on_rq)) {
 638                        lw = cfs_rq->load;
 639
 640                        update_load_add(&lw, se->load.weight);
 641                        load = &lw;
 642                }
 643                slice = __calc_delta(slice, se->load.weight, load);
 644        }
 645        return slice;
 646}
 647
 648/*
 649 * We calculate the vruntime slice of a to-be-inserted task.
 650 *
 651 * vs = s/w
 652 */
 653static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 654{
 655        return calc_delta_fair(sched_slice(cfs_rq, se), se);
 656}
 657
 658#ifdef CONFIG_SMP
 659static int select_idle_sibling(struct task_struct *p, int cpu);
 660static unsigned long task_h_load(struct task_struct *p);
 661
 662/*
 663 * We choose a half-life close to 1 scheduling period.
 664 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
 665 * dependent on this value.
 666 */
 667#define LOAD_AVG_PERIOD 32
 668#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
 669#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
 670
 671/* Give new sched_entity start runnable values to heavy its load in infant time */
 672void init_entity_runnable_average(struct sched_entity *se)
 673{
 674        struct sched_avg *sa = &se->avg;
 675
 676        sa->last_update_time = 0;
 677        /*
 678         * sched_avg's period_contrib should be strictly less then 1024, so
 679         * we give it 1023 to make sure it is almost a period (1024us), and
 680         * will definitely be update (after enqueue).
 681         */
 682        sa->period_contrib = 1023;
 683        sa->load_avg = scale_load_down(se->load.weight);
 684        sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
 685        sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
 686        sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
 687        /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 688}
 689
 690static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
 691static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
 692#else
 693void init_entity_runnable_average(struct sched_entity *se)
 694{
 695}
 696#endif
 697
 698/*
 699 * Update the current task's runtime statistics.
 700 */
 701static void update_curr(struct cfs_rq *cfs_rq)
 702{
 703        struct sched_entity *curr = cfs_rq->curr;
 704        u64 now = rq_clock_task(rq_of(cfs_rq));
 705        u64 delta_exec;
 706
 707        if (unlikely(!curr))
 708                return;
 709
 710        delta_exec = now - curr->exec_start;
 711        if (unlikely((s64)delta_exec <= 0))
 712                return;
 713
 714        curr->exec_start = now;
 715
 716        schedstat_set(curr->statistics.exec_max,
 717                      max(delta_exec, curr->statistics.exec_max));
 718
 719        curr->sum_exec_runtime += delta_exec;
 720        schedstat_add(cfs_rq, exec_clock, delta_exec);
 721
 722        curr->vruntime += calc_delta_fair(delta_exec, curr);
 723        update_min_vruntime(cfs_rq);
 724
 725        if (entity_is_task(curr)) {
 726                struct task_struct *curtask = task_of(curr);
 727
 728                trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
 729                cpuacct_charge(curtask, delta_exec);
 730                account_group_exec_runtime(curtask, delta_exec);
 731        }
 732
 733        account_cfs_rq_runtime(cfs_rq, delta_exec);
 734}
 735
 736static void update_curr_fair(struct rq *rq)
 737{
 738        update_curr(cfs_rq_of(&rq->curr->se));
 739}
 740
 741static inline void
 742update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 743{
 744        schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
 745}
 746
 747/*
 748 * Task is being enqueued - update stats:
 749 */
 750static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 751{
 752        /*
 753         * Are we enqueueing a waiting task? (for current tasks
 754         * a dequeue/enqueue event is a NOP)
 755         */
 756        if (se != cfs_rq->curr)
 757                update_stats_wait_start(cfs_rq, se);
 758}
 759
 760static void
 761update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 762{
 763        schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
 764                        rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
 765        schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
 766        schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
 767                        rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
 768#ifdef CONFIG_SCHEDSTATS
 769        if (entity_is_task(se)) {
 770                trace_sched_stat_wait(task_of(se),
 771                        rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
 772        }
 773#endif
 774        schedstat_set(se->statistics.wait_start, 0);
 775}
 776
 777static inline void
 778update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 779{
 780        /*
 781         * Mark the end of the wait period if dequeueing a
 782         * waiting task:
 783         */
 784        if (se != cfs_rq->curr)
 785                update_stats_wait_end(cfs_rq, se);
 786}
 787
 788/*
 789 * We are picking a new current task - update its stats:
 790 */
 791static inline void
 792update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 793{
 794        /*
 795         * We are starting a new run period:
 796         */
 797        se->exec_start = rq_clock_task(rq_of(cfs_rq));
 798}
 799
 800/**************************************************
 801 * Scheduling class queueing methods:
 802 */
 803
 804#ifdef CONFIG_NUMA_BALANCING
 805/*
 806 * Approximate time to scan a full NUMA task in ms. The task scan period is
 807 * calculated based on the tasks virtual memory size and
 808 * numa_balancing_scan_size.
 809 */
 810unsigned int sysctl_numa_balancing_scan_period_min = 1000;
 811unsigned int sysctl_numa_balancing_scan_period_max = 60000;
 812
 813/* Portion of address space to scan in MB */
 814unsigned int sysctl_numa_balancing_scan_size = 256;
 815
 816/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 817unsigned int sysctl_numa_balancing_scan_delay = 1000;
 818
 819static unsigned int task_nr_scan_windows(struct task_struct *p)
 820{
 821        unsigned long rss = 0;
 822        unsigned long nr_scan_pages;
 823
 824        /*
 825         * Calculations based on RSS as non-present and empty pages are skipped
 826         * by the PTE scanner and NUMA hinting faults should be trapped based
 827         * on resident pages
 828         */
 829        nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
 830        rss = get_mm_rss(p->mm);
 831        if (!rss)
 832                rss = nr_scan_pages;
 833
 834        rss = round_up(rss, nr_scan_pages);
 835        return rss / nr_scan_pages;
 836}
 837
 838/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
 839#define MAX_SCAN_WINDOW 2560
 840
 841static unsigned int task_scan_min(struct task_struct *p)
 842{
 843        unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
 844        unsigned int scan, floor;
 845        unsigned int windows = 1;
 846
 847        if (scan_size < MAX_SCAN_WINDOW)
 848                windows = MAX_SCAN_WINDOW / scan_size;
 849        floor = 1000 / windows;
 850
 851        scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
 852        return max_t(unsigned int, floor, scan);
 853}
 854
 855static unsigned int task_scan_max(struct task_struct *p)
 856{
 857        unsigned int smin = task_scan_min(p);
 858        unsigned int smax;
 859
 860        /* Watch for min being lower than max due to floor calculations */
 861        smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
 862        return max(smin, smax);
 863}
 864
 865static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 866{
 867        rq->nr_numa_running += (p->numa_preferred_nid != -1);
 868        rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
 869}
 870
 871static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 872{
 873        rq->nr_numa_running -= (p->numa_preferred_nid != -1);
 874        rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
 875}
 876
 877struct numa_group {
 878        atomic_t refcount;
 879
 880        spinlock_t lock; /* nr_tasks, tasks */
 881        int nr_tasks;
 882        pid_t gid;
 883
 884        struct rcu_head rcu;
 885        nodemask_t active_nodes;
 886        unsigned long total_faults;
 887        /*
 888         * Faults_cpu is used to decide whether memory should move
 889         * towards the CPU. As a consequence, these stats are weighted
 890         * more by CPU use than by memory faults.
 891         */
 892        unsigned long *faults_cpu;
 893        unsigned long faults[0];
 894};
 895
 896/* Shared or private faults. */
 897#define NR_NUMA_HINT_FAULT_TYPES 2
 898
 899/* Memory and CPU locality */
 900#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
 901
 902/* Averaged statistics, and temporary buffers. */
 903#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
 904
 905pid_t task_numa_group_id(struct task_struct *p)
 906{
 907        return p->numa_group ? p->numa_group->gid : 0;
 908}
 909
 910/*
 911 * The averaged statistics, shared & private, memory & cpu,
 912 * occupy the first half of the array. The second half of the
 913 * array is for current counters, which are averaged into the
 914 * first set by task_numa_placement.
 915 */
 916static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
 917{
 918        return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
 919}
 920
 921static inline unsigned long task_faults(struct task_struct *p, int nid)
 922{
 923        if (!p->numa_faults)
 924                return 0;
 925
 926        return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
 927                p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
 928}
 929
 930static inline unsigned long group_faults(struct task_struct *p, int nid)
 931{
 932        if (!p->numa_group)
 933                return 0;
 934
 935        return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
 936                p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
 937}
 938
 939static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
 940{
 941        return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
 942                group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 943}
 944
 945/* Handle placement on systems where not all nodes are directly connected. */
 946static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
 947                                        int maxdist, bool task)
 948{
 949        unsigned long score = 0;
 950        int node;
 951
 952        /*
 953         * All nodes are directly connected, and the same distance
 954         * from each other. No need for fancy placement algorithms.
 955         */
 956        if (sched_numa_topology_type == NUMA_DIRECT)
 957                return 0;
 958
 959        /*
 960         * This code is called for each node, introducing N^2 complexity,
 961         * which should be ok given the number of nodes rarely exceeds 8.
 962         */
 963        for_each_online_node(node) {
 964                unsigned long faults;
 965                int dist = node_distance(nid, node);
 966
 967                /*
 968                 * The furthest away nodes in the system are not interesting
 969                 * for placement; nid was already counted.
 970                 */
 971                if (dist == sched_max_numa_distance || node == nid)
 972                        continue;
 973
 974                /*
 975                 * On systems with a backplane NUMA topology, compare groups
 976                 * of nodes, and move tasks towards the group with the most
 977                 * memory accesses. When comparing two nodes at distance
 978                 * "hoplimit", only nodes closer by than "hoplimit" are part
 979                 * of each group. Skip other nodes.
 980                 */
 981                if (sched_numa_topology_type == NUMA_BACKPLANE &&
 982                                        dist > maxdist)
 983                        continue;
 984
 985                /* Add up the faults from nearby nodes. */
 986                if (task)
 987                        faults = task_faults(p, node);
 988                else
 989                        faults = group_faults(p, node);
 990
 991                /*
 992                 * On systems with a glueless mesh NUMA topology, there are
 993                 * no fixed "groups of nodes". Instead, nodes that are not
 994                 * directly connected bounce traffic through intermediate
 995                 * nodes; a numa_group can occupy any set of nodes.
 996                 * The further away a node is, the less the faults count.
 997                 * This seems to result in good task placement.
 998                 */
 999                if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1000                        faults *= (sched_max_numa_distance - dist);
1001                        faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1002                }
1003
1004                score += faults;
1005        }
1006
1007        return score;
1008}
1009
1010/*
1011 * These return the fraction of accesses done by a particular task, or
1012 * task group, on a particular numa node.  The group weight is given a
1013 * larger multiplier, in order to group tasks together that are almost
1014 * evenly spread out between numa nodes.
1015 */
1016static inline unsigned long task_weight(struct task_struct *p, int nid,
1017                                        int dist)
1018{
1019        unsigned long faults, total_faults;
1020
1021        if (!p->numa_faults)
1022                return 0;
1023
1024        total_faults = p->total_numa_faults;
1025
1026        if (!total_faults)
1027                return 0;
1028
1029        faults = task_faults(p, nid);
1030        faults += score_nearby_nodes(p, nid, dist, true);
1031
1032        return 1000 * faults / total_faults;
1033}
1034
1035static inline unsigned long group_weight(struct task_struct *p, int nid,
1036                                         int dist)
1037{
1038        unsigned long faults, total_faults;
1039
1040        if (!p->numa_group)
1041                return 0;
1042
1043        total_faults = p->numa_group->total_faults;
1044
1045        if (!total_faults)
1046                return 0;
1047
1048        faults = group_faults(p, nid);
1049        faults += score_nearby_nodes(p, nid, dist, false);
1050
1051        return 1000 * faults / total_faults;
1052}
1053
1054bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1055                                int src_nid, int dst_cpu)
1056{
1057        struct numa_group *ng = p->numa_group;
1058        int dst_nid = cpu_to_node(dst_cpu);
1059        int last_cpupid, this_cpupid;
1060
1061        this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1062
1063        /*
1064         * Multi-stage node selection is used in conjunction with a periodic
1065         * migration fault to build a temporal task<->page relation. By using
1066         * a two-stage filter we remove short/unlikely relations.
1067         *
1068         * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1069         * a task's usage of a particular page (n_p) per total usage of this
1070         * page (n_t) (in a given time-span) to a probability.
1071         *
1072         * Our periodic faults will sample this probability and getting the
1073         * same result twice in a row, given these samples are fully
1074         * independent, is then given by P(n)^2, provided our sample period
1075         * is sufficiently short compared to the usage pattern.
1076         *
1077         * This quadric squishes small probabilities, making it less likely we
1078         * act on an unlikely task<->page relation.
1079         */
1080        last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1081        if (!cpupid_pid_unset(last_cpupid) &&
1082                                cpupid_to_nid(last_cpupid) != dst_nid)
1083                return false;
1084
1085        /* Always allow migrate on private faults */
1086        if (cpupid_match_pid(p, last_cpupid))
1087                return true;
1088
1089        /* A shared fault, but p->numa_group has not been set up yet. */
1090        if (!ng)
1091                return true;
1092
1093        /*
1094         * Do not migrate if the destination is not a node that
1095         * is actively used by this numa group.
1096         */
1097        if (!node_isset(dst_nid, ng->active_nodes))
1098                return false;
1099
1100        /*
1101         * Source is a node that is not actively used by this
1102         * numa group, while the destination is. Migrate.
1103         */
1104        if (!node_isset(src_nid, ng->active_nodes))
1105                return true;
1106
1107        /*
1108         * Both source and destination are nodes in active
1109         * use by this numa group. Maximize memory bandwidth
1110         * by migrating from more heavily used groups, to less
1111         * heavily used ones, spreading the load around.
1112         * Use a 1/4 hysteresis to avoid spurious page movement.
1113         */
1114        return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1115}
1116
1117static unsigned long weighted_cpuload(const int cpu);
1118static unsigned long source_load(int cpu, int type);
1119static unsigned long target_load(int cpu, int type);
1120static unsigned long capacity_of(int cpu);
1121static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1122
1123/* Cached statistics for all CPUs within a node */
1124struct numa_stats {
1125        unsigned long nr_running;
1126        unsigned long load;
1127
1128        /* Total compute capacity of CPUs on a node */
1129        unsigned long compute_capacity;
1130
1131        /* Approximate capacity in terms of runnable tasks on a node */
1132        unsigned long task_capacity;
1133        int has_free_capacity;
1134};
1135
1136/*
1137 * XXX borrowed from update_sg_lb_stats
1138 */
1139static void update_numa_stats(struct numa_stats *ns, int nid)
1140{
1141        int smt, cpu, cpus = 0;
1142        unsigned long capacity;
1143
1144        memset(ns, 0, sizeof(*ns));
1145        for_each_cpu(cpu, cpumask_of_node(nid)) {
1146                struct rq *rq = cpu_rq(cpu);
1147
1148                ns->nr_running += rq->nr_running;
1149                ns->load += weighted_cpuload(cpu);
1150                ns->compute_capacity += capacity_of(cpu);
1151
1152                cpus++;
1153        }
1154
1155        /*
1156         * If we raced with hotplug and there are no CPUs left in our mask
1157         * the @ns structure is NULL'ed and task_numa_compare() will
1158         * not find this node attractive.
1159         *
1160         * We'll either bail at !has_free_capacity, or we'll detect a huge
1161         * imbalance and bail there.
1162         */
1163        if (!cpus)
1164                return;
1165
1166        /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1167        smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1168        capacity = cpus / smt; /* cores */
1169
1170        ns->task_capacity = min_t(unsigned, capacity,
1171                DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1172        ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1173}
1174
1175struct task_numa_env {
1176        struct task_struct *p;
1177
1178        int src_cpu, src_nid;
1179        int dst_cpu, dst_nid;
1180
1181        struct numa_stats src_stats, dst_stats;
1182
1183        int imbalance_pct;
1184        int dist;
1185
1186        struct task_struct *best_task;
1187        long best_imp;
1188        int best_cpu;
1189};
1190
1191static void task_numa_assign(struct task_numa_env *env,
1192                             struct task_struct *p, long imp)
1193{
1194        if (env->best_task)
1195                put_task_struct(env->best_task);
1196        if (p)
1197                get_task_struct(p);
1198
1199        env->best_task = p;
1200        env->best_imp = imp;
1201        env->best_cpu = env->dst_cpu;
1202}
1203
1204static bool load_too_imbalanced(long src_load, long dst_load,
1205                                struct task_numa_env *env)
1206{
1207        long imb, old_imb;
1208        long orig_src_load, orig_dst_load;
1209        long src_capacity, dst_capacity;
1210
1211        /*
1212         * The load is corrected for the CPU capacity available on each node.
1213         *
1214         * src_load        dst_load
1215         * ------------ vs ---------
1216         * src_capacity    dst_capacity
1217         */
1218        src_capacity = env->src_stats.compute_capacity;
1219        dst_capacity = env->dst_stats.compute_capacity;
1220
1221        /* We care about the slope of the imbalance, not the direction. */
1222        if (dst_load < src_load)
1223                swap(dst_load, src_load);
1224
1225        /* Is the difference below the threshold? */
1226        imb = dst_load * src_capacity * 100 -
1227              src_load * dst_capacity * env->imbalance_pct;
1228        if (imb <= 0)
1229                return false;
1230
1231        /*
1232         * The imbalance is above the allowed threshold.
1233         * Compare it with the old imbalance.
1234         */
1235        orig_src_load = env->src_stats.load;
1236        orig_dst_load = env->dst_stats.load;
1237
1238        if (orig_dst_load < orig_src_load)
1239                swap(orig_dst_load, orig_src_load);
1240
1241        old_imb = orig_dst_load * src_capacity * 100 -
1242                  orig_src_load * dst_capacity * env->imbalance_pct;
1243
1244        /* Would this change make things worse? */
1245        return (imb > old_imb);
1246}
1247
1248/*
1249 * This checks if the overall compute and NUMA accesses of the system would
1250 * be improved if the source tasks was migrated to the target dst_cpu taking
1251 * into account that it might be best if task running on the dst_cpu should
1252 * be exchanged with the source task
1253 */
1254static void task_numa_compare(struct task_numa_env *env,
1255                              long taskimp, long groupimp)
1256{
1257        struct rq *src_rq = cpu_rq(env->src_cpu);
1258        struct rq *dst_rq = cpu_rq(env->dst_cpu);
1259        struct task_struct *cur;
1260        long src_load, dst_load;
1261        long load;
1262        long imp = env->p->numa_group ? groupimp : taskimp;
1263        long moveimp = imp;
1264        int dist = env->dist;
1265
1266        rcu_read_lock();
1267
1268        raw_spin_lock_irq(&dst_rq->lock);
1269        cur = dst_rq->curr;
1270        /*
1271         * No need to move the exiting task, and this ensures that ->curr
1272         * wasn't reaped and thus get_task_struct() in task_numa_assign()
1273         * is safe under RCU read lock.
1274         * Note that rcu_read_lock() itself can't protect from the final
1275         * put_task_struct() after the last schedule().
1276         */
1277        if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1278                cur = NULL;
1279        raw_spin_unlock_irq(&dst_rq->lock);
1280
1281        /*
1282         * Because we have preemption enabled we can get migrated around and
1283         * end try selecting ourselves (current == env->p) as a swap candidate.
1284         */
1285        if (cur == env->p)
1286                goto unlock;
1287
1288        /*
1289         * "imp" is the fault differential for the source task between the
1290         * source and destination node. Calculate the total differential for
1291         * the source task and potential destination task. The more negative
1292         * the value is, the more rmeote accesses that would be expected to
1293         * be incurred if the tasks were swapped.
1294         */
1295        if (cur) {
1296                /* Skip this swap candidate if cannot move to the source cpu */
1297                if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1298                        goto unlock;
1299
1300                /*
1301                 * If dst and source tasks are in the same NUMA group, or not
1302                 * in any group then look only at task weights.
1303                 */
1304                if (cur->numa_group == env->p->numa_group) {
1305                        imp = taskimp + task_weight(cur, env->src_nid, dist) -
1306                              task_weight(cur, env->dst_nid, dist);
1307                        /*
1308                         * Add some hysteresis to prevent swapping the
1309                         * tasks within a group over tiny differences.
1310                         */
1311                        if (cur->numa_group)
1312                                imp -= imp/16;
1313                } else {
1314                        /*
1315                         * Compare the group weights. If a task is all by
1316                         * itself (not part of a group), use the task weight
1317                         * instead.
1318                         */
1319                        if (cur->numa_group)
1320                                imp += group_weight(cur, env->src_nid, dist) -
1321                                       group_weight(cur, env->dst_nid, dist);
1322                        else
1323                                imp += task_weight(cur, env->src_nid, dist) -
1324                                       task_weight(cur, env->dst_nid, dist);
1325                }
1326        }
1327
1328        if (imp <= env->best_imp && moveimp <= env->best_imp)
1329                goto unlock;
1330
1331        if (!cur) {
1332                /* Is there capacity at our destination? */
1333                if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1334                    !env->dst_stats.has_free_capacity)
1335                        goto unlock;
1336
1337                goto balance;
1338        }
1339
1340        /* Balance doesn't matter much if we're running a task per cpu */
1341        if (imp > env->best_imp && src_rq->nr_running == 1 &&
1342                        dst_rq->nr_running == 1)
1343                goto assign;
1344
1345        /*
1346         * In the overloaded case, try and keep the load balanced.
1347         */
1348balance:
1349        load = task_h_load(env->p);
1350        dst_load = env->dst_stats.load + load;
1351        src_load = env->src_stats.load - load;
1352
1353        if (moveimp > imp && moveimp > env->best_imp) {
1354                /*
1355                 * If the improvement from just moving env->p direction is
1356                 * better than swapping tasks around, check if a move is
1357                 * possible. Store a slightly smaller score than moveimp,
1358                 * so an actually idle CPU will win.
1359                 */
1360                if (!load_too_imbalanced(src_load, dst_load, env)) {
1361                        imp = moveimp - 1;
1362                        cur = NULL;
1363                        goto assign;
1364                }
1365        }
1366
1367        if (imp <= env->best_imp)
1368                goto unlock;
1369
1370        if (cur) {
1371                load = task_h_load(cur);
1372                dst_load -= load;
1373                src_load += load;
1374        }
1375
1376        if (load_too_imbalanced(src_load, dst_load, env))
1377                goto unlock;
1378
1379        /*
1380         * One idle CPU per node is evaluated for a task numa move.
1381         * Call select_idle_sibling to maybe find a better one.
1382         */
1383        if (!cur)
1384                env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1385
1386assign:
1387        task_numa_assign(env, cur, imp);
1388unlock:
1389        rcu_read_unlock();
1390}
1391
1392static void task_numa_find_cpu(struct task_numa_env *env,
1393                                long taskimp, long groupimp)
1394{
1395        int cpu;
1396
1397        for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1398                /* Skip this CPU if the source task cannot migrate */
1399                if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1400                        continue;
1401
1402                env->dst_cpu = cpu;
1403                task_numa_compare(env, taskimp, groupimp);
1404        }
1405}
1406
1407/* Only move tasks to a NUMA node less busy than the current node. */
1408static bool numa_has_capacity(struct task_numa_env *env)
1409{
1410        struct numa_stats *src = &env->src_stats;
1411        struct numa_stats *dst = &env->dst_stats;
1412
1413        if (src->has_free_capacity && !dst->has_free_capacity)
1414                return false;
1415
1416        /*
1417         * Only consider a task move if the source has a higher load
1418         * than the destination, corrected for CPU capacity on each node.
1419         *
1420         *      src->load                dst->load
1421         * --------------------- vs ---------------------
1422         * src->compute_capacity    dst->compute_capacity
1423         */
1424        if (src->load * dst->compute_capacity * env->imbalance_pct >
1425
1426            dst->load * src->compute_capacity * 100)
1427                return true;
1428
1429        return false;
1430}
1431
1432static int task_numa_migrate(struct task_struct *p)
1433{
1434        struct task_numa_env env = {
1435                .p = p,
1436
1437                .src_cpu = task_cpu(p),
1438                .src_nid = task_node(p),
1439
1440                .imbalance_pct = 112,
1441
1442                .best_task = NULL,
1443                .best_imp = 0,
1444                .best_cpu = -1
1445        };
1446        struct sched_domain *sd;
1447        unsigned long taskweight, groupweight;
1448        int nid, ret, dist;
1449        long taskimp, groupimp;
1450
1451        /*
1452         * Pick the lowest SD_NUMA domain, as that would have the smallest
1453         * imbalance and would be the first to start moving tasks about.
1454         *
1455         * And we want to avoid any moving of tasks about, as that would create
1456         * random movement of tasks -- counter the numa conditions we're trying
1457         * to satisfy here.
1458         */
1459        rcu_read_lock();
1460        sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1461        if (sd)
1462                env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1463        rcu_read_unlock();
1464
1465        /*
1466         * Cpusets can break the scheduler domain tree into smaller
1467         * balance domains, some of which do not cross NUMA boundaries.
1468         * Tasks that are "trapped" in such domains cannot be migrated
1469         * elsewhere, so there is no point in (re)trying.
1470         */
1471        if (unlikely(!sd)) {
1472                p->numa_preferred_nid = task_node(p);
1473                return -EINVAL;
1474        }
1475
1476        env.dst_nid = p->numa_preferred_nid;
1477        dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1478        taskweight = task_weight(p, env.src_nid, dist);
1479        groupweight = group_weight(p, env.src_nid, dist);
1480        update_numa_stats(&env.src_stats, env.src_nid);
1481        taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1482        groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1483        update_numa_stats(&env.dst_stats, env.dst_nid);
1484
1485        /* Try to find a spot on the preferred nid. */
1486        if (numa_has_capacity(&env))
1487                task_numa_find_cpu(&env, taskimp, groupimp);
1488
1489        /*
1490         * Look at other nodes in these cases:
1491         * - there is no space available on the preferred_nid
1492         * - the task is part of a numa_group that is interleaved across
1493         *   multiple NUMA nodes; in order to better consolidate the group,
1494         *   we need to check other locations.
1495         */
1496        if (env.best_cpu == -1 || (p->numa_group &&
1497                        nodes_weight(p->numa_group->active_nodes) > 1)) {
1498                for_each_online_node(nid) {
1499                        if (nid == env.src_nid || nid == p->numa_preferred_nid)
1500                                continue;
1501
1502                        dist = node_distance(env.src_nid, env.dst_nid);
1503                        if (sched_numa_topology_type == NUMA_BACKPLANE &&
1504                                                dist != env.dist) {
1505                                taskweight = task_weight(p, env.src_nid, dist);
1506                                groupweight = group_weight(p, env.src_nid, dist);
1507                        }
1508
1509                        /* Only consider nodes where both task and groups benefit */
1510                        taskimp = task_weight(p, nid, dist) - taskweight;
1511                        groupimp = group_weight(p, nid, dist) - groupweight;
1512                        if (taskimp < 0 && groupimp < 0)
1513                                continue;
1514
1515                        env.dist = dist;
1516                        env.dst_nid = nid;
1517                        update_numa_stats(&env.dst_stats, env.dst_nid);
1518                        if (numa_has_capacity(&env))
1519                                task_numa_find_cpu(&env, taskimp, groupimp);
1520                }
1521        }
1522
1523        /*
1524         * If the task is part of a workload that spans multiple NUMA nodes,
1525         * and is migrating into one of the workload's active nodes, remember
1526         * this node as the task's preferred numa node, so the workload can
1527         * settle down.
1528         * A task that migrated to a second choice node will be better off
1529         * trying for a better one later. Do not set the preferred node here.
1530         */
1531        if (p->numa_group) {
1532                if (env.best_cpu == -1)
1533                        nid = env.src_nid;
1534                else
1535                        nid = env.dst_nid;
1536
1537                if (node_isset(nid, p->numa_group->active_nodes))
1538                        sched_setnuma(p, env.dst_nid);
1539        }
1540
1541        /* No better CPU than the current one was found. */
1542        if (env.best_cpu == -1)
1543                return -EAGAIN;
1544
1545        /*
1546         * Reset the scan period if the task is being rescheduled on an
1547         * alternative node to recheck if the tasks is now properly placed.
1548         */
1549        p->numa_scan_period = task_scan_min(p);
1550
1551        if (env.best_task == NULL) {
1552                ret = migrate_task_to(p, env.best_cpu);
1553                if (ret != 0)
1554                        trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1555                return ret;
1556        }
1557
1558        ret = migrate_swap(p, env.best_task);
1559        if (ret != 0)
1560                trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1561        put_task_struct(env.best_task);
1562        return ret;
1563}
1564
1565/* Attempt to migrate a task to a CPU on the preferred node. */
1566static void numa_migrate_preferred(struct task_struct *p)
1567{
1568        unsigned long interval = HZ;
1569
1570        /* This task has no NUMA fault statistics yet */
1571        if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1572                return;
1573
1574        /* Periodically retry migrating the task to the preferred node */
1575        interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1576        p->numa_migrate_retry = jiffies + interval;
1577
1578        /* Success if task is already running on preferred CPU */
1579        if (task_node(p) == p->numa_preferred_nid)
1580                return;
1581
1582        /* Otherwise, try migrate to a CPU on the preferred node */
1583        task_numa_migrate(p);
1584}
1585
1586/*
1587 * Find the nodes on which the workload is actively running. We do this by
1588 * tracking the nodes from which NUMA hinting faults are triggered. This can
1589 * be different from the set of nodes where the workload's memory is currently
1590 * located.
1591 *
1592 * The bitmask is used to make smarter decisions on when to do NUMA page
1593 * migrations, To prevent flip-flopping, and excessive page migrations, nodes
1594 * are added when they cause over 6/16 of the maximum number of faults, but
1595 * only removed when they drop below 3/16.
1596 */
1597static void update_numa_active_node_mask(struct numa_group *numa_group)
1598{
1599        unsigned long faults, max_faults = 0;
1600        int nid;
1601
1602        for_each_online_node(nid) {
1603                faults = group_faults_cpu(numa_group, nid);
1604                if (faults > max_faults)
1605                        max_faults = faults;
1606        }
1607
1608        for_each_online_node(nid) {
1609                faults = group_faults_cpu(numa_group, nid);
1610                if (!node_isset(nid, numa_group->active_nodes)) {
1611                        if (faults > max_faults * 6 / 16)
1612                                node_set(nid, numa_group->active_nodes);
1613                } else if (faults < max_faults * 3 / 16)
1614                        node_clear(nid, numa_group->active_nodes);
1615        }
1616}
1617
1618/*
1619 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1620 * increments. The more local the fault statistics are, the higher the scan
1621 * period will be for the next scan window. If local/(local+remote) ratio is
1622 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1623 * the scan period will decrease. Aim for 70% local accesses.
1624 */
1625#define NUMA_PERIOD_SLOTS 10
1626#define NUMA_PERIOD_THRESHOLD 7
1627
1628/*
1629 * Increase the scan period (slow down scanning) if the majority of
1630 * our memory is already on our local node, or if the majority of
1631 * the page accesses are shared with other processes.
1632 * Otherwise, decrease the scan period.
1633 */
1634static void update_task_scan_period(struct task_struct *p,
1635                        unsigned long shared, unsigned long private)
1636{
1637        unsigned int period_slot;
1638        int ratio;
1639        int diff;
1640
1641        unsigned long remote = p->numa_faults_locality[0];
1642        unsigned long local = p->numa_faults_locality[1];
1643
1644        /*
1645         * If there were no record hinting faults then either the task is
1646         * completely idle or all activity is areas that are not of interest
1647         * to automatic numa balancing. Related to that, if there were failed
1648         * migration then it implies we are migrating too quickly or the local
1649         * node is overloaded. In either case, scan slower
1650         */
1651        if (local + shared == 0 || p->numa_faults_locality[2]) {
1652                p->numa_scan_period = min(p->numa_scan_period_max,
1653                        p->numa_scan_period << 1);
1654
1655                p->mm->numa_next_scan = jiffies +
1656                        msecs_to_jiffies(p->numa_scan_period);
1657
1658                return;
1659        }
1660
1661        /*
1662         * Prepare to scale scan period relative to the current period.
1663         *       == NUMA_PERIOD_THRESHOLD scan period stays the same
1664         *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1665         *       >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1666         */
1667        period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1668        ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1669        if (ratio >= NUMA_PERIOD_THRESHOLD) {
1670                int slot = ratio - NUMA_PERIOD_THRESHOLD;
1671                if (!slot)
1672                        slot = 1;
1673                diff = slot * period_slot;
1674        } else {
1675                diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1676
1677                /*
1678                 * Scale scan rate increases based on sharing. There is an
1679                 * inverse relationship between the degree of sharing and
1680                 * the adjustment made to the scanning period. Broadly
1681                 * speaking the intent is that there is little point
1682                 * scanning faster if shared accesses dominate as it may
1683                 * simply bounce migrations uselessly
1684                 */
1685                ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1686                diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1687        }
1688
1689        p->numa_scan_period = clamp(p->numa_scan_period + diff,
1690                        task_scan_min(p), task_scan_max(p));
1691        memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1692}
1693
1694/*
1695 * Get the fraction of time the task has been running since the last
1696 * NUMA placement cycle. The scheduler keeps similar statistics, but
1697 * decays those on a 32ms period, which is orders of magnitude off
1698 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1699 * stats only if the task is so new there are no NUMA statistics yet.
1700 */
1701static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1702{
1703        u64 runtime, delta, now;
1704        /* Use the start of this time slice to avoid calculations. */
1705        now = p->se.exec_start;
1706        runtime = p->se.sum_exec_runtime;
1707
1708        if (p->last_task_numa_placement) {
1709                delta = runtime - p->last_sum_exec_runtime;
1710                *period = now - p->last_task_numa_placement;
1711        } else {
1712                delta = p->se.avg.load_sum / p->se.load.weight;
1713                *period = LOAD_AVG_MAX;
1714        }
1715
1716        p->last_sum_exec_runtime = runtime;
1717        p->last_task_numa_placement = now;
1718
1719        return delta;
1720}
1721
1722/*
1723 * Determine the preferred nid for a task in a numa_group. This needs to
1724 * be done in a way that produces consistent results with group_weight,
1725 * otherwise workloads might not converge.
1726 */
1727static int preferred_group_nid(struct task_struct *p, int nid)
1728{
1729        nodemask_t nodes;
1730        int dist;
1731
1732        /* Direct connections between all NUMA nodes. */
1733        if (sched_numa_topology_type == NUMA_DIRECT)
1734                return nid;
1735
1736        /*
1737         * On a system with glueless mesh NUMA topology, group_weight
1738         * scores nodes according to the number of NUMA hinting faults on
1739         * both the node itself, and on nearby nodes.
1740         */
1741        if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1742                unsigned long score, max_score = 0;
1743                int node, max_node = nid;
1744
1745                dist = sched_max_numa_distance;
1746
1747                for_each_online_node(node) {
1748                        score = group_weight(p, node, dist);
1749                        if (score > max_score) {
1750                                max_score = score;
1751                                max_node = node;
1752                        }
1753                }
1754                return max_node;
1755        }
1756
1757        /*
1758         * Finding the preferred nid in a system with NUMA backplane
1759         * interconnect topology is more involved. The goal is to locate
1760         * tasks from numa_groups near each other in the system, and
1761         * untangle workloads from different sides of the system. This requires
1762         * searching down the hierarchy of node groups, recursively searching
1763         * inside the highest scoring group of nodes. The nodemask tricks
1764         * keep the complexity of the search down.
1765         */
1766        nodes = node_online_map;
1767        for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1768                unsigned long max_faults = 0;
1769                nodemask_t max_group = NODE_MASK_NONE;
1770                int a, b;
1771
1772                /* Are there nodes at this distance from each other? */
1773                if (!find_numa_distance(dist))
1774                        continue;
1775
1776                for_each_node_mask(a, nodes) {
1777                        unsigned long faults = 0;
1778                        nodemask_t this_group;
1779                        nodes_clear(this_group);
1780
1781                        /* Sum group's NUMA faults; includes a==b case. */
1782                        for_each_node_mask(b, nodes) {
1783                                if (node_distance(a, b) < dist) {
1784                                        faults += group_faults(p, b);
1785                                        node_set(b, this_group);
1786                                        node_clear(b, nodes);
1787                                }
1788                        }
1789
1790                        /* Remember the top group. */
1791                        if (faults > max_faults) {
1792                                max_faults = faults;
1793                                max_group = this_group;
1794                                /*
1795                                 * subtle: at the smallest distance there is
1796                                 * just one node left in each "group", the
1797                                 * winner is the preferred nid.
1798                                 */
1799                                nid = a;
1800                        }
1801                }
1802                /* Next round, evaluate the nodes within max_group. */
1803                if (!max_faults)
1804                        break;
1805                nodes = max_group;
1806        }
1807        return nid;
1808}
1809
1810static void task_numa_placement(struct task_struct *p)
1811{
1812        int seq, nid, max_nid = -1, max_group_nid = -1;
1813        unsigned long max_faults = 0, max_group_faults = 0;
1814        unsigned long fault_types[2] = { 0, 0 };
1815        unsigned long total_faults;
1816        u64 runtime, period;
1817        spinlock_t *group_lock = NULL;
1818
1819        /*
1820         * The p->mm->numa_scan_seq field gets updated without
1821         * exclusive access. Use READ_ONCE() here to ensure
1822         * that the field is read in a single access:
1823         */
1824        seq = READ_ONCE(p->mm->numa_scan_seq);
1825        if (p->numa_scan_seq == seq)
1826                return;
1827        p->numa_scan_seq = seq;
1828        p->numa_scan_period_max = task_scan_max(p);
1829
1830        total_faults = p->numa_faults_locality[0] +
1831                       p->numa_faults_locality[1];
1832        runtime = numa_get_avg_runtime(p, &period);
1833
1834        /* If the task is part of a group prevent parallel updates to group stats */
1835        if (p->numa_group) {
1836                group_lock = &p->numa_group->lock;
1837                spin_lock_irq(group_lock);
1838        }
1839
1840        /* Find the node with the highest number of faults */
1841        for_each_online_node(nid) {
1842                /* Keep track of the offsets in numa_faults array */
1843                int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
1844                unsigned long faults = 0, group_faults = 0;
1845                int priv;
1846
1847                for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1848                        long diff, f_diff, f_weight;
1849
1850                        mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
1851                        membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
1852                        cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
1853                        cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
1854
1855                        /* Decay existing window, copy faults since last scan */
1856                        diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
1857                        fault_types[priv] += p->numa_faults[membuf_idx];
1858                        p->numa_faults[membuf_idx] = 0;
1859
1860                        /*
1861                         * Normalize the faults_from, so all tasks in a group
1862                         * count according to CPU use, instead of by the raw
1863                         * number of faults. Tasks with little runtime have
1864                         * little over-all impact on throughput, and thus their
1865                         * faults are less important.
1866                         */
1867                        f_weight = div64_u64(runtime << 16, period + 1);
1868                        f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
1869                                   (total_faults + 1);
1870                        f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
1871                        p->numa_faults[cpubuf_idx] = 0;
1872
1873                        p->numa_faults[mem_idx] += diff;
1874                        p->numa_faults[cpu_idx] += f_diff;
1875                        faults += p->numa_faults[mem_idx];
1876                        p->total_numa_faults += diff;
1877                        if (p->numa_group) {
1878                                /*
1879                                 * safe because we can only change our own group
1880                                 *
1881                                 * mem_idx represents the offset for a given
1882                                 * nid and priv in a specific region because it
1883                                 * is at the beginning of the numa_faults array.
1884                                 */
1885                                p->numa_group->faults[mem_idx] += diff;
1886                                p->numa_group->faults_cpu[mem_idx] += f_diff;
1887                                p->numa_group->total_faults += diff;
1888                                group_faults += p->numa_group->faults[mem_idx];
1889                        }
1890                }
1891
1892                if (faults > max_faults) {
1893                        max_faults = faults;
1894                        max_nid = nid;
1895                }
1896
1897                if (group_faults > max_group_faults) {
1898                        max_group_faults = group_faults;
1899                        max_group_nid = nid;
1900                }
1901        }
1902
1903        update_task_scan_period(p, fault_types[0], fault_types[1]);
1904
1905        if (p->numa_group) {
1906                update_numa_active_node_mask(p->numa_group);
1907                spin_unlock_irq(group_lock);
1908                max_nid = preferred_group_nid(p, max_group_nid);
1909        }
1910
1911        if (max_faults) {
1912                /* Set the new preferred node */
1913                if (max_nid != p->numa_preferred_nid)
1914                        sched_setnuma(p, max_nid);
1915
1916                if (task_node(p) != p->numa_preferred_nid)
1917                        numa_migrate_preferred(p);
1918        }
1919}
1920
1921static inline int get_numa_group(struct numa_group *grp)
1922{
1923        return atomic_inc_not_zero(&grp->refcount);
1924}
1925
1926static inline void put_numa_group(struct numa_group *grp)
1927{
1928        if (atomic_dec_and_test(&grp->refcount))
1929                kfree_rcu(grp, rcu);
1930}
1931
1932static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1933                        int *priv)
1934{
1935        struct numa_group *grp, *my_grp;
1936        struct task_struct *tsk;
1937        bool join = false;
1938        int cpu = cpupid_to_cpu(cpupid);
1939        int i;
1940
1941        if (unlikely(!p->numa_group)) {
1942                unsigned int size = sizeof(struct numa_group) +
1943                                    4*nr_node_ids*sizeof(unsigned long);
1944
1945                grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1946                if (!grp)
1947                        return;
1948
1949                atomic_set(&grp->refcount, 1);
1950                spin_lock_init(&grp->lock);
1951                grp->gid = p->pid;
1952                /* Second half of the array tracks nids where faults happen */
1953                grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1954                                                nr_node_ids;
1955
1956                node_set(task_node(current), grp->active_nodes);
1957
1958                for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1959                        grp->faults[i] = p->numa_faults[i];
1960
1961                grp->total_faults = p->total_numa_faults;
1962
1963                grp->nr_tasks++;
1964                rcu_assign_pointer(p->numa_group, grp);
1965        }
1966
1967        rcu_read_lock();
1968        tsk = READ_ONCE(cpu_rq(cpu)->curr);
1969
1970        if (!cpupid_match_pid(tsk, cpupid))
1971                goto no_join;
1972
1973        grp = rcu_dereference(tsk->numa_group);
1974        if (!grp)
1975                goto no_join;
1976
1977        my_grp = p->numa_group;
1978        if (grp == my_grp)
1979                goto no_join;
1980
1981        /*
1982         * Only join the other group if its bigger; if we're the bigger group,
1983         * the other task will join us.
1984         */
1985        if (my_grp->nr_tasks > grp->nr_tasks)
1986                goto no_join;
1987
1988        /*
1989         * Tie-break on the grp address.
1990         */
1991        if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1992                goto no_join;
1993
1994        /* Always join threads in the same process. */
1995        if (tsk->mm == current->mm)
1996                join = true;
1997
1998        /* Simple filter to avoid false positives due to PID collisions */
1999        if (flags & TNF_SHARED)
2000                join = true;
2001
2002        /* Update priv based on whether false sharing was detected */
2003        *priv = !join;
2004
2005        if (join && !get_numa_group(grp))
2006                goto no_join;
2007
2008        rcu_read_unlock();
2009
2010        if (!join)
2011                return;
2012
2013        BUG_ON(irqs_disabled());
2014        double_lock_irq(&my_grp->lock, &grp->lock);
2015
2016        for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2017                my_grp->faults[i] -= p->numa_faults[i];
2018                grp->faults[i] += p->numa_faults[i];
2019        }
2020        my_grp->total_faults -= p->total_numa_faults;
2021        grp->total_faults += p->total_numa_faults;
2022
2023        my_grp->nr_tasks--;
2024        grp->nr_tasks++;
2025
2026        spin_unlock(&my_grp->lock);
2027        spin_unlock_irq(&grp->lock);
2028
2029        rcu_assign_pointer(p->numa_group, grp);
2030
2031        put_numa_group(my_grp);
2032        return;
2033
2034no_join:
2035        rcu_read_unlock();
2036        return;
2037}
2038
2039void task_numa_free(struct task_struct *p)
2040{
2041        struct numa_group *grp = p->numa_group;
2042        void *numa_faults = p->numa_faults;
2043        unsigned long flags;
2044        int i;
2045
2046        if (grp) {
2047                spin_lock_irqsave(&grp->lock, flags);
2048                for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2049                        grp->faults[i] -= p->numa_faults[i];
2050                grp->total_faults -= p->total_numa_faults;
2051
2052                grp->nr_tasks--;
2053                spin_unlock_irqrestore(&grp->lock, flags);
2054                RCU_INIT_POINTER(p->numa_group, NULL);
2055                put_numa_group(grp);
2056        }
2057
2058        p->numa_faults = NULL;
2059        kfree(numa_faults);
2060}
2061
2062/*
2063 * Got a PROT_NONE fault for a page on @node.
2064 */
2065void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2066{
2067        struct task_struct *p = current;
2068        bool migrated = flags & TNF_MIGRATED;
2069        int cpu_node = task_node(current);
2070        int local = !!(flags & TNF_FAULT_LOCAL);
2071        int priv;
2072
2073        if (!static_branch_likely(&sched_numa_balancing))
2074                return;
2075
2076        /* for example, ksmd faulting in a user's mm */
2077        if (!p->mm)
2078                return;
2079
2080        /* Allocate buffer to track faults on a per-node basis */
2081        if (unlikely(!p->numa_faults)) {
2082                int size = sizeof(*p->numa_faults) *
2083                           NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2084
2085                p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2086                if (!p->numa_faults)
2087                        return;
2088
2089                p->total_numa_faults = 0;
2090                memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2091        }
2092
2093        /*
2094         * First accesses are treated as private, otherwise consider accesses
2095         * to be private if the accessing pid has not changed
2096         */
2097        if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2098                priv = 1;
2099        } else {
2100                priv = cpupid_match_pid(p, last_cpupid);
2101                if (!priv && !(flags & TNF_NO_GROUP))
2102                        task_numa_group(p, last_cpupid, flags, &priv);
2103        }
2104
2105        /*
2106         * If a workload spans multiple NUMA nodes, a shared fault that
2107         * occurs wholly within the set of nodes that the workload is
2108         * actively using should be counted as local. This allows the
2109         * scan rate to slow down when a workload has settled down.
2110         */
2111        if (!priv && !local && p->numa_group &&
2112                        node_isset(cpu_node, p->numa_group->active_nodes) &&
2113                        node_isset(mem_node, p->numa_group->active_nodes))
2114                local = 1;
2115
2116        task_numa_placement(p);
2117
2118        /*
2119         * Retry task to preferred node migration periodically, in case it
2120         * case it previously failed, or the scheduler moved us.
2121         */
2122        if (time_after(jiffies, p->numa_migrate_retry))
2123                numa_migrate_preferred(p);
2124
2125        if (migrated)
2126                p->numa_pages_migrated += pages;
2127        if (flags & TNF_MIGRATE_FAIL)
2128                p->numa_faults_locality[2] += pages;
2129
2130        p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2131        p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2132        p->numa_faults_locality[local] += pages;
2133}
2134
2135static void reset_ptenuma_scan(struct task_struct *p)
2136{
2137        /*
2138         * We only did a read acquisition of the mmap sem, so
2139         * p->mm->numa_scan_seq is written to without exclusive access
2140         * and the update is not guaranteed to be atomic. That's not
2141         * much of an issue though, since this is just used for
2142         * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2143         * expensive, to avoid any form of compiler optimizations:
2144         */
2145        WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2146        p->mm->numa_scan_offset = 0;
2147}
2148
2149/*
2150 * The expensive part of numa migration is done from task_work context.
2151 * Triggered from task_tick_numa().
2152 */
2153void task_numa_work(struct callback_head *work)
2154{
2155        unsigned long migrate, next_scan, now = jiffies;
2156        struct task_struct *p = current;
2157        struct mm_struct *mm = p->mm;
2158        struct vm_area_struct *vma;
2159        unsigned long start, end;
2160        unsigned long nr_pte_updates = 0;
2161        long pages, virtpages;
2162
2163        WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2164
2165        work->next = work; /* protect against double add */
2166        /*
2167         * Who cares about NUMA placement when they're dying.
2168         *
2169         * NOTE: make sure not to dereference p->mm before this check,
2170         * exit_task_work() happens _after_ exit_mm() so we could be called
2171         * without p->mm even though we still had it when we enqueued this
2172         * work.
2173         */
2174        if (p->flags & PF_EXITING)
2175                return;
2176
2177        if (!mm->numa_next_scan) {
2178                mm->numa_next_scan = now +
2179                        msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2180        }
2181
2182        /*
2183         * Enforce maximal scan/migration frequency..
2184         */
2185        migrate = mm->numa_next_scan;
2186        if (time_before(now, migrate))
2187                return;
2188
2189        if (p->numa_scan_period == 0) {
2190                p->numa_scan_period_max = task_scan_max(p);
2191                p->numa_scan_period = task_scan_min(p);
2192        }
2193
2194        next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2195        if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2196                return;
2197
2198        /*
2199         * Delay this task enough that another task of this mm will likely win
2200         * the next time around.
2201         */
2202        p->node_stamp += 2 * TICK_NSEC;
2203
2204        start = mm->numa_scan_offset;
2205        pages = sysctl_numa_balancing_scan_size;
2206        pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2207        virtpages = pages * 8;     /* Scan up to this much virtual space */
2208        if (!pages)
2209                return;
2210
2211
2212        down_read(&mm->mmap_sem);
2213        vma = find_vma(mm, start);
2214        if (!vma) {
2215                reset_ptenuma_scan(p);
2216                start = 0;
2217                vma = mm->mmap;
2218        }
2219        for (; vma; vma = vma->vm_next) {
2220                if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2221                        is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2222                        continue;
2223                }
2224
2225                /*
2226                 * Shared library pages mapped by multiple processes are not
2227                 * migrated as it is expected they are cache replicated. Avoid
2228                 * hinting faults in read-only file-backed mappings or the vdso
2229                 * as migrating the pages will be of marginal benefit.
2230                 */
2231                if (!vma->vm_mm ||
2232                    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2233                        continue;
2234
2235                /*
2236                 * Skip inaccessible VMAs to avoid any confusion between
2237                 * PROT_NONE and NUMA hinting ptes
2238                 */
2239                if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2240                        continue;
2241
2242                do {
2243                        start = max(start, vma->vm_start);
2244                        end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2245                        end = min(end, vma->vm_end);
2246                        nr_pte_updates = change_prot_numa(vma, start, end);
2247
2248                        /*
2249                         * Try to scan sysctl_numa_balancing_size worth of
2250                         * hpages that have at least one present PTE that
2251                         * is not already pte-numa. If the VMA contains
2252                         * areas that are unused or already full of prot_numa
2253                         * PTEs, scan up to virtpages, to skip through those
2254                         * areas faster.
2255                         */
2256                        if (nr_pte_updates)
2257                                pages -= (end - start) >> PAGE_SHIFT;
2258                        virtpages -= (end - start) >> PAGE_SHIFT;
2259
2260                        start = end;
2261                        if (pages <= 0 || virtpages <= 0)
2262                                goto out;
2263
2264                        cond_resched();
2265                } while (end != vma->vm_end);
2266        }
2267
2268out:
2269        /*
2270         * It is possible to reach the end of the VMA list but the last few
2271         * VMAs are not guaranteed to the vma_migratable. If they are not, we
2272         * would find the !migratable VMA on the next scan but not reset the
2273         * scanner to the start so check it now.
2274         */
2275        if (vma)
2276                mm->numa_scan_offset = start;
2277        else
2278                reset_ptenuma_scan(p);
2279        up_read(&mm->mmap_sem);
2280}
2281
2282/*
2283 * Drive the periodic memory faults..
2284 */
2285void task_tick_numa(struct rq *rq, struct task_struct *curr)
2286{
2287        struct callback_head *work = &curr->numa_work;
2288        u64 period, now;
2289
2290        /*
2291         * We don't care about NUMA placement if we don't have memory.
2292         */
2293        if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2294                return;
2295
2296        /*
2297         * Using runtime rather than walltime has the dual advantage that
2298         * we (mostly) drive the selection from busy threads and that the
2299         * task needs to have done some actual work before we bother with
2300         * NUMA placement.
2301         */
2302        now = curr->se.sum_exec_runtime;
2303        period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2304
2305        if (now > curr->node_stamp + period) {
2306                if (!curr->node_stamp)
2307                        curr->numa_scan_period = task_scan_min(curr);
2308                curr->node_stamp += period;
2309
2310                if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2311                        init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2312                        task_work_add(curr, work, true);
2313                }
2314        }
2315}
2316#else
2317static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2318{
2319}
2320
2321static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2322{
2323}
2324
2325static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2326{
2327}
2328#endif /* CONFIG_NUMA_BALANCING */
2329
2330static void
2331account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2332{
2333        update_load_add(&cfs_rq->load, se->load.weight);
2334        if (!parent_entity(se))
2335                update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2336#ifdef CONFIG_SMP
2337        if (entity_is_task(se)) {
2338                struct rq *rq = rq_of(cfs_rq);
2339
2340                account_numa_enqueue(rq, task_of(se));
2341                list_add(&se->group_node, &rq->cfs_tasks);
2342        }
2343#endif
2344        cfs_rq->nr_running++;
2345}
2346
2347static void
2348account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2349{
2350        update_load_sub(&cfs_rq->load, se->load.weight);
2351        if (!parent_entity(se))
2352                update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2353        if (entity_is_task(se)) {
2354                account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2355                list_del_init(&se->group_node);
2356        }
2357        cfs_rq->nr_running--;
2358}
2359
2360#ifdef CONFIG_FAIR_GROUP_SCHED
2361# ifdef CONFIG_SMP
2362static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
2363{
2364        long tg_weight;
2365
2366        /*
2367         * Use this CPU's real-time load instead of the last load contribution
2368         * as the updating of the contribution is delayed, and we will use the
2369         * the real-time load to calc the share. See update_tg_load_avg().
2370         */
2371        tg_weight = atomic_long_read(&tg->load_avg);
2372        tg_weight -= cfs_rq->tg_load_avg_contrib;
2373        tg_weight += cfs_rq->load.weight;
2374
2375        return tg_weight;
2376}
2377
2378static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2379{
2380        long tg_weight, load, shares;
2381
2382        tg_weight = calc_tg_weight(tg, cfs_rq);
2383        load = cfs_rq->load.weight;
2384
2385        shares = (tg->shares * load);
2386        if (tg_weight)
2387                shares /= tg_weight;
2388
2389        if (shares < MIN_SHARES)
2390                shares = MIN_SHARES;
2391        if (shares > tg->shares)
2392                shares = tg->shares;
2393
2394        return shares;
2395}
2396# else /* CONFIG_SMP */
2397static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2398{
2399        return tg->shares;
2400}
2401# endif /* CONFIG_SMP */
2402static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2403                            unsigned long weight)
2404{
2405        if (se->on_rq) {
2406                /* commit outstanding execution time */
2407                if (cfs_rq->curr == se)
2408                        update_curr(cfs_rq);
2409                account_entity_dequeue(cfs_rq, se);
2410        }
2411
2412        update_load_set(&se->load, weight);
2413
2414        if (se->on_rq)
2415                account_entity_enqueue(cfs_rq, se);
2416}
2417
2418static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2419
2420static void update_cfs_shares(struct cfs_rq *cfs_rq)
2421{
2422        struct task_group *tg;
2423        struct sched_entity *se;
2424        long shares;
2425
2426        tg = cfs_rq->tg;
2427        se = tg->se[cpu_of(rq_of(cfs_rq))];
2428        if (!se || throttled_hierarchy(cfs_rq))
2429                return;
2430#ifndef CONFIG_SMP
2431        if (likely(se->load.weight == tg->shares))
2432                return;
2433#endif
2434        shares = calc_cfs_shares(cfs_rq, tg);
2435
2436        reweight_entity(cfs_rq_of(se), se, shares);
2437}
2438#else /* CONFIG_FAIR_GROUP_SCHED */
2439static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2440{
2441}
2442#endif /* CONFIG_FAIR_GROUP_SCHED */
2443
2444#ifdef CONFIG_SMP
2445/* Precomputed fixed inverse multiplies for multiplication by y^n */
2446static const u32 runnable_avg_yN_inv[] = {
2447        0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2448        0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2449        0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2450        0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2451        0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2452        0x85aac367, 0x82cd8698,
2453};
2454
2455/*
2456 * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
2457 * over-estimates when re-combining.
2458 */
2459static const u32 runnable_avg_yN_sum[] = {
2460            0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2461         9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2462        17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2463};
2464
2465/*
2466 * Approximate:
2467 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
2468 */
2469static __always_inline u64 decay_load(u64 val, u64 n)
2470{
2471        unsigned int local_n;
2472
2473        if (!n)
2474                return val;
2475        else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2476                return 0;
2477
2478        /* after bounds checking we can collapse to 32-bit */
2479        local_n = n;
2480
2481        /*
2482         * As y^PERIOD = 1/2, we can combine
2483         *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2484         * With a look-up table which covers y^n (n<PERIOD)
2485         *
2486         * To achieve constant time decay_load.
2487         */
2488        if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2489                val >>= local_n / LOAD_AVG_PERIOD;
2490                local_n %= LOAD_AVG_PERIOD;
2491        }
2492
2493        val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2494        return val;
2495}
2496
2497/*
2498 * For updates fully spanning n periods, the contribution to runnable
2499 * average will be: \Sum 1024*y^n
2500 *
2501 * We can compute this reasonably efficiently by combining:
2502 *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
2503 */
2504static u32 __compute_runnable_contrib(u64 n)
2505{
2506        u32 contrib = 0;
2507
2508        if (likely(n <= LOAD_AVG_PERIOD))
2509                return runnable_avg_yN_sum[n];
2510        else if (unlikely(n >= LOAD_AVG_MAX_N))
2511                return LOAD_AVG_MAX;
2512
2513        /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
2514        do {
2515                contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
2516                contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
2517
2518                n -= LOAD_AVG_PERIOD;
2519        } while (n > LOAD_AVG_PERIOD);
2520
2521        contrib = decay_load(contrib, n);
2522        return contrib + runnable_avg_yN_sum[n];
2523}
2524
2525#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
2526#error "load tracking assumes 2^10 as unit"
2527#endif
2528
2529#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
2530
2531/*
2532 * We can represent the historical contribution to runnable average as the
2533 * coefficients of a geometric series.  To do this we sub-divide our runnable
2534 * history into segments of approximately 1ms (1024us); label the segment that
2535 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2536 *
2537 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
2538 *      p0            p1           p2
2539 *     (now)       (~1ms ago)  (~2ms ago)
2540 *
2541 * Let u_i denote the fraction of p_i that the entity was runnable.
2542 *
2543 * We then designate the fractions u_i as our co-efficients, yielding the
2544 * following representation of historical load:
2545 *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
2546 *
2547 * We choose y based on the with of a reasonably scheduling period, fixing:
2548 *   y^32 = 0.5
2549 *
2550 * This means that the contribution to load ~32ms ago (u_32) will be weighted
2551 * approximately half as much as the contribution to load within the last ms
2552 * (u_0).
2553 *
2554 * When a period "rolls over" and we have new u_0`, multiplying the previous
2555 * sum again by y is sufficient to update:
2556 *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2557 *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2558 */
2559static __always_inline int
2560__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2561                  unsigned long weight, int running, struct cfs_rq *cfs_rq)
2562{
2563        u64 delta, scaled_delta, periods;
2564        u32 contrib;
2565        unsigned int delta_w, scaled_delta_w, decayed = 0;
2566        unsigned long scale_freq, scale_cpu;
2567
2568        delta = now - sa->last_update_time;
2569        /*
2570         * This should only happen when time goes backwards, which it
2571         * unfortunately does during sched clock init when we swap over to TSC.
2572         */
2573        if ((s64)delta < 0) {
2574                sa->last_update_time = now;
2575                return 0;
2576        }
2577
2578        /*
2579         * Use 1024ns as the unit of measurement since it's a reasonable
2580         * approximation of 1us and fast to compute.
2581         */
2582        delta >>= 10;
2583        if (!delta)
2584                return 0;
2585        sa->last_update_time = now;
2586
2587        scale_freq = arch_scale_freq_capacity(NULL, cpu);
2588        scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2589
2590        /* delta_w is the amount already accumulated against our next period */
2591        delta_w = sa->period_contrib;
2592        if (delta + delta_w >= 1024) {
2593                decayed = 1;
2594
2595                /* how much left for next period will start over, we don't know yet */
2596                sa->period_contrib = 0;
2597
2598                /*
2599                 * Now that we know we're crossing a period boundary, figure
2600                 * out how much from delta we need to complete the current
2601                 * period and accrue it.
2602                 */
2603                delta_w = 1024 - delta_w;
2604                scaled_delta_w = cap_scale(delta_w, scale_freq);
2605                if (weight) {
2606                        sa->load_sum += weight * scaled_delta_w;
2607                        if (cfs_rq) {
2608                                cfs_rq->runnable_load_sum +=
2609                                                weight * scaled_delta_w;
2610                        }
2611                }
2612                if (running)
2613                        sa->util_sum += scaled_delta_w * scale_cpu;
2614
2615                delta -= delta_w;
2616
2617                /* Figure out how many additional periods this update spans */
2618                periods = delta / 1024;
2619                delta %= 1024;
2620
2621                sa->load_sum = decay_load(sa->load_sum, periods + 1);
2622                if (cfs_rq) {
2623                        cfs_rq->runnable_load_sum =
2624                                decay_load(cfs_rq->runnable_load_sum, periods + 1);
2625                }
2626                sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
2627
2628                /* Efficiently calculate \sum (1..n_period) 1024*y^i */
2629                contrib = __compute_runnable_contrib(periods);
2630                contrib = cap_scale(contrib, scale_freq);
2631                if (weight) {
2632                        sa->load_sum += weight * contrib;
2633                        if (cfs_rq)
2634                                cfs_rq->runnable_load_sum += weight * contrib;
2635                }
2636                if (running)
2637                        sa->util_sum += contrib * scale_cpu;
2638        }
2639
2640        /* Remainder of delta accrued against u_0` */
2641        scaled_delta = cap_scale(delta, scale_freq);
2642        if (weight) {
2643                sa->load_sum += weight * scaled_delta;
2644                if (cfs_rq)
2645                        cfs_rq->runnable_load_sum += weight * scaled_delta;
2646        }
2647        if (running)
2648                sa->util_sum += scaled_delta * scale_cpu;
2649
2650        sa->period_contrib += delta;
2651
2652        if (decayed) {
2653                sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
2654                if (cfs_rq) {
2655                        cfs_rq->runnable_load_avg =
2656                                div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2657                }
2658                sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
2659        }
2660
2661        return decayed;
2662}
2663
2664#ifdef CONFIG_FAIR_GROUP_SCHED
2665/*
2666 * Updating tg's load_avg is necessary before update_cfs_share (which is done)
2667 * and effective_load (which is not done because it is too costly).
2668 */
2669static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2670{
2671        long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
2672
2673        if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
2674                atomic_long_add(delta, &cfs_rq->tg->load_avg);
2675                cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
2676        }
2677}
2678
2679#else /* CONFIG_FAIR_GROUP_SCHED */
2680static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2681#endif /* CONFIG_FAIR_GROUP_SCHED */
2682
2683static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2684
2685/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
2686static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
2687{
2688        struct sched_avg *sa = &cfs_rq->avg;
2689        int decayed, removed = 0;
2690
2691        if (atomic_long_read(&cfs_rq->removed_load_avg)) {
2692                s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
2693                sa->load_avg = max_t(long, sa->load_avg - r, 0);
2694                sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
2695                removed = 1;
2696        }
2697
2698        if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2699                long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
2700                sa->util_avg = max_t(long, sa->util_avg - r, 0);
2701                sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
2702        }
2703
2704        decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2705                scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
2706
2707#ifndef CONFIG_64BIT
2708        smp_wmb();
2709        cfs_rq->load_last_update_time_copy = sa->last_update_time;
2710#endif
2711
2712        return decayed || removed;
2713}
2714
2715/* Update task and its cfs_rq load average */
2716static inline void update_load_avg(struct sched_entity *se, int update_tg)
2717{
2718        struct cfs_rq *cfs_rq = cfs_rq_of(se);
2719        u64 now = cfs_rq_clock_task(cfs_rq);
2720        int cpu = cpu_of(rq_of(cfs_rq));
2721
2722        /*
2723         * Track task load average for carrying it to new CPU after migrated, and
2724         * track group sched_entity load average for task_h_load calc in migration
2725         */
2726        __update_load_avg(now, cpu, &se->avg,
2727                          se->on_rq * scale_load_down(se->load.weight),
2728                          cfs_rq->curr == se, NULL);
2729
2730        if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
2731                update_tg_load_avg(cfs_rq, 0);
2732}
2733
2734static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2735{
2736        if (!sched_feat(ATTACH_AGE_LOAD))
2737                goto skip_aging;
2738
2739        /*
2740         * If we got migrated (either between CPUs or between cgroups) we'll
2741         * have aged the average right before clearing @last_update_time.
2742         */
2743        if (se->avg.last_update_time) {
2744                __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2745                                  &se->avg, 0, 0, NULL);
2746
2747                /*
2748                 * XXX: we could have just aged the entire load away if we've been
2749                 * absent from the fair class for too long.
2750                 */
2751        }
2752
2753skip_aging:
2754        se->avg.last_update_time = cfs_rq->avg.last_update_time;
2755        cfs_rq->avg.load_avg += se->avg.load_avg;
2756        cfs_rq->avg.load_sum += se->avg.load_sum;
2757        cfs_rq->avg.util_avg += se->avg.util_avg;
2758        cfs_rq->avg.util_sum += se->avg.util_sum;
2759}
2760
2761static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2762{
2763        __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2764                          &se->avg, se->on_rq * scale_load_down(se->load.weight),
2765                          cfs_rq->curr == se, NULL);
2766
2767        cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
2768        cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
2769        cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
2770        cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
2771}
2772
2773/* Add the load generated by se into cfs_rq's load average */
2774static inline void
2775enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2776{
2777        struct sched_avg *sa = &se->avg;
2778        u64 now = cfs_rq_clock_task(cfs_rq);
2779        int migrated, decayed;
2780
2781        migrated = !sa->last_update_time;
2782        if (!migrated) {
2783                __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2784                        se->on_rq * scale_load_down(se->load.weight),
2785                        cfs_rq->curr == se, NULL);
2786        }
2787
2788        decayed = update_cfs_rq_load_avg(now, cfs_rq);
2789
2790        cfs_rq->runnable_load_avg += sa->load_avg;
2791        cfs_rq->runnable_load_sum += sa->load_sum;
2792
2793        if (migrated)
2794                attach_entity_load_avg(cfs_rq, se);
2795
2796        if (decayed || migrated)
2797                update_tg_load_avg(cfs_rq, 0);
2798}
2799
2800/* Remove the runnable load generated by se from cfs_rq's runnable load average */
2801static inline void
2802dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2803{
2804        update_load_avg(se, 1);
2805
2806        cfs_rq->runnable_load_avg =
2807                max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
2808        cfs_rq->runnable_load_sum =
2809                max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
2810}
2811
2812/*
2813 * Task first catches up with cfs_rq, and then subtract
2814 * itself from the cfs_rq (task must be off the queue now).
2815 */
2816void remove_entity_load_avg(struct sched_entity *se)
2817{
2818        struct cfs_rq *cfs_rq = cfs_rq_of(se);
2819        u64 last_update_time;
2820
2821#ifndef CONFIG_64BIT
2822        u64 last_update_time_copy;
2823
2824        do {
2825                last_update_time_copy = cfs_rq->load_last_update_time_copy;
2826                smp_rmb();
2827                last_update_time = cfs_rq->avg.last_update_time;
2828        } while (last_update_time != last_update_time_copy);
2829#else
2830        last_update_time = cfs_rq->avg.last_update_time;
2831#endif
2832
2833        __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
2834        atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
2835        atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
2836}
2837
2838/*
2839 * Update the rq's load with the elapsed running time before entering
2840 * idle. if the last scheduled task is not a CFS task, idle_enter will
2841 * be the only way to update the runnable statistic.
2842 */
2843void idle_enter_fair(struct rq *this_rq)
2844{
2845}
2846
2847/*
2848 * Update the rq's load with the elapsed idle time before a task is
2849 * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
2850 * be the only way to update the runnable statistic.
2851 */
2852void idle_exit_fair(struct rq *this_rq)
2853{
2854}
2855
2856static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
2857{
2858        return cfs_rq->runnable_load_avg;
2859}
2860
2861static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
2862{
2863        return cfs_rq->avg.load_avg;
2864}
2865
2866static int idle_balance(struct rq *this_rq);
2867
2868#else /* CONFIG_SMP */
2869
2870static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
2871static inline void
2872enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2873static inline void
2874dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2875static inline void remove_entity_load_avg(struct sched_entity *se) {}
2876
2877static inline void
2878attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2879static inline void
2880detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2881
2882static inline int idle_balance(struct rq *rq)
2883{
2884        return 0;
2885}
2886
2887#endif /* CONFIG_SMP */
2888
2889static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2890{
2891#ifdef CONFIG_SCHEDSTATS
2892        struct task_struct *tsk = NULL;
2893
2894        if (entity_is_task(se))
2895                tsk = task_of(se);
2896
2897        if (se->statistics.sleep_start) {
2898                u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
2899
2900                if ((s64)delta < 0)
2901                        delta = 0;
2902
2903                if (unlikely(delta > se->statistics.sleep_max))
2904                        se->statistics.sleep_max = delta;
2905
2906                se->statistics.sleep_start = 0;
2907                se->statistics.sum_sleep_runtime += delta;
2908
2909                if (tsk) {
2910                        account_scheduler_latency(tsk, delta >> 10, 1);
2911                        trace_sched_stat_sleep(tsk, delta);
2912                }
2913        }
2914        if (se->statistics.block_start) {
2915                u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
2916
2917                if ((s64)delta < 0)
2918                        delta = 0;
2919
2920                if (unlikely(delta > se->statistics.block_max))
2921                        se->statistics.block_max = delta;
2922
2923                se->statistics.block_start = 0;
2924                se->statistics.sum_sleep_runtime += delta;
2925
2926                if (tsk) {
2927                        if (tsk->in_iowait) {
2928                                se->statistics.iowait_sum += delta;
2929                                se->statistics.iowait_count++;
2930                                trace_sched_stat_iowait(tsk, delta);
2931                        }
2932
2933                        trace_sched_stat_blocked(tsk, delta);
2934
2935                        /*
2936                         * Blocking time is in units of nanosecs, so shift by
2937                         * 20 to get a milliseconds-range estimation of the
2938                         * amount of time that the task spent sleeping:
2939                         */
2940                        if (unlikely(prof_on == SLEEP_PROFILING)) {
2941                                profile_hits(SLEEP_PROFILING,
2942                                                (void *)get_wchan(tsk),
2943                                                delta >> 20);
2944                        }
2945                        account_scheduler_latency(tsk, delta >> 10, 0);
2946                }
2947        }
2948#endif
2949}
2950
2951static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
2952{
2953#ifdef CONFIG_SCHED_DEBUG
2954        s64 d = se->vruntime - cfs_rq->min_vruntime;
2955
2956        if (d < 0)
2957                d = -d;
2958
2959        if (d > 3*sysctl_sched_latency)
2960                schedstat_inc(cfs_rq, nr_spread_over);
2961#endif
2962}
2963
2964static void
2965place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
2966{
2967        u64 vruntime = cfs_rq->min_vruntime;
2968
2969        /*
2970         * The 'current' period is already promised to the current tasks,
2971         * however the extra weight of the new task will slow them down a
2972         * little, place the new task so that it fits in the slot that
2973         * stays open at the end.
2974         */
2975        if (initial && sched_feat(START_DEBIT))
2976                vruntime += sched_vslice(cfs_rq, se);
2977
2978        /* sleeps up to a single latency don't count. */
2979        if (!initial) {
2980                unsigned long thresh = sysctl_sched_latency;
2981
2982                /*
2983                 * Halve their sleep time's effect, to allow
2984                 * for a gentler effect of sleepers:
2985                 */
2986                if (sched_feat(GENTLE_FAIR_SLEEPERS))
2987                        thresh >>= 1;
2988
2989                vruntime -= thresh;
2990        }
2991
2992        /* ensure we never gain time by being placed backwards. */
2993        se->vruntime = max_vruntime(se->vruntime, vruntime);
2994}
2995
2996static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
2997
2998static void
2999enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3000{
3001        /*
3002         * Update the normalized vruntime before updating min_vruntime
3003         * through calling update_curr().
3004         */
3005        if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
3006                se->vruntime += cfs_rq->min_vruntime;
3007
3008        /*
3009         * Update run-time statistics of the 'current'.
3010         */
3011        update_curr(cfs_rq);
3012        enqueue_entity_load_avg(cfs_rq, se);
3013        account_entity_enqueue(cfs_rq, se);
3014        update_cfs_shares(cfs_rq);
3015
3016        if (flags & ENQUEUE_WAKEUP) {
3017                place_entity(cfs_rq, se, 0);
3018                enqueue_sleeper(cfs_rq, se);
3019        }
3020
3021        update_stats_enqueue(cfs_rq, se);
3022        check_spread(cfs_rq, se);
3023        if (se != cfs_rq->curr)
3024                __enqueue_entity(cfs_rq, se);
3025        se->on_rq = 1;
3026
3027        if (cfs_rq->nr_running == 1) {
3028                list_add_leaf_cfs_rq(cfs_rq);
3029                check_enqueue_throttle(cfs_rq);
3030        }
3031}
3032
3033static void __clear_buddies_last(struct sched_entity *se)
3034{
3035        for_each_sched_entity(se) {
3036                struct cfs_rq *cfs_rq = cfs_rq_of(se);
3037                if (cfs_rq->last != se)
3038                        break;
3039
3040                cfs_rq->last = NULL;
3041        }
3042}
3043
3044static void __clear_buddies_next(struct sched_entity *se)
3045{
3046        for_each_sched_entity(se) {
3047                struct cfs_rq *cfs_rq = cfs_rq_of(se);
3048                if (cfs_rq->next != se)
3049                        break;
3050
3051                cfs_rq->next = NULL;
3052        }
3053}
3054
3055static void __clear_buddies_skip(struct sched_entity *se)
3056{
3057        for_each_sched_entity(se) {
3058                struct cfs_rq *cfs_rq = cfs_rq_of(se);
3059                if (cfs_rq->skip != se)
3060                        break;
3061
3062                cfs_rq->skip = NULL;
3063        }
3064}
3065
3066static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3067{
3068        if (cfs_rq->last == se)
3069                __clear_buddies_last(se);
3070
3071        if (cfs_rq->next == se)
3072                __clear_buddies_next(se);
3073
3074        if (cfs_rq->skip == se)
3075                __clear_buddies_skip(se);
3076}
3077
3078static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3079
3080static void
3081dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3082{
3083        /*
3084         * Update run-time statistics of the 'current'.
3085         */
3086        update_curr(cfs_rq);
3087        dequeue_entity_load_avg(cfs_rq, se);
3088
3089        update_stats_dequeue(cfs_rq, se);
3090        if (flags & DEQUEUE_SLEEP) {
3091#ifdef CONFIG_SCHEDSTATS
3092                if (entity_is_task(se)) {
3093                        struct task_struct *tsk = task_of(se);
3094
3095                        if (tsk->state & TASK_INTERRUPTIBLE)
3096                                se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
3097                        if (tsk->state & TASK_UNINTERRUPTIBLE)
3098                                se->statistics.block_start = rq_clock(rq_of(cfs_rq));
3099                }
3100#endif
3101        }
3102
3103        clear_buddies(cfs_rq, se);
3104
3105        if (se != cfs_rq->curr)
3106                __dequeue_entity(cfs_rq, se);
3107        se->on_rq = 0;
3108        account_entity_dequeue(cfs_rq, se);
3109
3110        /*
3111         * Normalize the entity after updating the min_vruntime because the
3112         * update can refer to the ->curr item and we need to reflect this
3113         * movement in our normalized position.
3114         */
3115        if (!(flags & DEQUEUE_SLEEP))
3116                se->vruntime -= cfs_rq->min_vruntime;
3117
3118        /* return excess runtime on last dequeue */
3119        return_cfs_rq_runtime(cfs_rq);
3120
3121        update_min_vruntime(cfs_rq);
3122        update_cfs_shares(cfs_rq);
3123}
3124
3125/*
3126 * Preempt the current task with a newly woken task if needed:
3127 */
3128static void
3129check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3130{
3131        unsigned long ideal_runtime, delta_exec;
3132        struct sched_entity *se;
3133        s64 delta;
3134
3135        ideal_runtime = sched_slice(cfs_rq, curr);
3136        delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
3137        if (delta_exec > ideal_runtime) {
3138                resched_curr(rq_of(cfs_rq));
3139                /*
3140                 * The current task ran long enough, ensure it doesn't get
3141                 * re-elected due to buddy favours.
3142                 */
3143                clear_buddies(cfs_rq, curr);
3144                return;
3145        }
3146
3147        /*
3148         * Ensure that a task that missed wakeup preemption by a
3149         * narrow margin doesn't have to wait for a full slice.
3150         * This also mitigates buddy induced latencies under load.
3151         */
3152        if (delta_exec < sysctl_sched_min_granularity)
3153                return;
3154
3155        se = __pick_first_entity(cfs_rq);
3156        delta = curr->vruntime - se->vruntime;
3157
3158        if (delta < 0)
3159                return;
3160
3161        if (delta > ideal_runtime)
3162                resched_curr(rq_of(cfs_rq));
3163}
3164
3165static void
3166set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3167{
3168        /* 'current' is not kept within the tree. */
3169        if (se->on_rq) {
3170                /*
3171                 * Any task has to be enqueued before it get to execute on
3172                 * a CPU. So account for the time it spent waiting on the
3173                 * runqueue.
3174                 */
3175                update_stats_wait_end(cfs_rq, se);
3176                __dequeue_entity(cfs_rq, se);
3177                update_load_avg(se, 1);
3178        }
3179
3180        update_stats_curr_start(cfs_rq, se);
3181        cfs_rq->curr = se;
3182#ifdef CONFIG_SCHEDSTATS
3183        /*
3184         * Track our maximum slice length, if the CPU's load is at
3185         * least twice that of our own weight (i.e. dont track it
3186         * when there are only lesser-weight tasks around):
3187         */
3188        if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3189                se->statistics.slice_max = max(se->statistics.slice_max,
3190                        se->sum_exec_runtime - se->prev_sum_exec_runtime);
3191        }
3192#endif
3193        se->prev_sum_exec_runtime = se->sum_exec_runtime;
3194}
3195
3196static int
3197wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
3198
3199/*
3200 * Pick the next process, keeping these things in mind, in this order:
3201 * 1) keep things fair between processes/task groups
3202 * 2) pick the "next" process, since someone really wants that to run
3203 * 3) pick the "last" process, for cache locality
3204 * 4) do not run the "skip" process, if something else is available
3205 */
3206static struct sched_entity *
3207pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3208{
3209        struct sched_entity *left = __pick_first_entity(cfs_rq);
3210        struct sched_entity *se;
3211
3212        /*
3213         * If curr is set we have to see if its left of the leftmost entity
3214         * still in the tree, provided there was anything in the tree at all.
3215         */
3216        if (!left || (curr && entity_before(curr, left)))
3217                left = curr;
3218
3219        se = left; /* ideally we run the leftmost entity */
3220
3221        /*
3222         * Avoid running the skip buddy, if running something else can
3223         * be done without getting too unfair.
3224         */
3225        if (cfs_rq->skip == se) {
3226                struct sched_entity *second;
3227
3228                if (se == curr) {
3229                        second = __pick_first_entity(cfs_rq);
3230                } else {
3231                        second = __pick_next_entity(se);
3232                        if (!second || (curr && entity_before(curr, second)))
3233                                second = curr;
3234                }
3235
3236                if (second && wakeup_preempt_entity(second, left) < 1)
3237                        se = second;
3238        }
3239
3240        /*
3241         * Prefer last buddy, try to return the CPU to a preempted task.
3242         */
3243        if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3244                se = cfs_rq->last;
3245
3246        /*
3247         * Someone really wants this to run. If it's not unfair, run it.
3248         */
3249        if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3250                se = cfs_rq->next;
3251
3252        clear_buddies(cfs_rq, se);
3253
3254        return se;
3255}
3256
3257static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3258
3259static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3260{
3261        /*
3262         * If still on the runqueue then deactivate_task()
3263         * was not called and update_curr() has to be done:
3264         */
3265        if (prev->on_rq)
3266                update_curr(cfs_rq);
3267
3268        /* throttle cfs_rqs exceeding runtime */
3269        check_cfs_rq_runtime(cfs_rq);
3270
3271        check_spread(cfs_rq, prev);
3272        if (prev->on_rq) {
3273                update_stats_wait_start(cfs_rq, prev);
3274                /* Put 'current' back into the tree. */
3275                __enqueue_entity(cfs_rq, prev);
3276                /* in !on_rq case, update occurred at dequeue */
3277                update_load_avg(prev, 0);
3278        }
3279        cfs_rq->curr = NULL;
3280}
3281
3282static void
3283entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3284{
3285        /*
3286         * Update run-time statistics of the 'current'.
3287         */
3288        update_curr(cfs_rq);
3289
3290        /*
3291         * Ensure that runnable average is periodically updated.
3292         */
3293        update_load_avg(curr, 1);
3294        update_cfs_shares(cfs_rq);
3295
3296#ifdef CONFIG_SCHED_HRTICK
3297        /*
3298         * queued ticks are scheduled to match the slice, so don't bother
3299         * validating it and just reschedule.
3300         */
3301        if (queued) {
3302                resched_curr(rq_of(cfs_rq));
3303                return;
3304        }
3305        /*
3306         * don't let the period tick interfere with the hrtick preemption
3307         */
3308        if (!sched_feat(DOUBLE_TICK) &&
3309                        hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
3310                return;
3311#endif
3312
3313        if (cfs_rq->nr_running > 1)
3314                check_preempt_tick(cfs_rq, curr);
3315}
3316
3317
3318/**************************************************
3319 * CFS bandwidth control machinery
3320 */
3321
3322#ifdef CONFIG_CFS_BANDWIDTH
3323
3324#ifdef HAVE_JUMP_LABEL
3325static struct static_key __cfs_bandwidth_used;
3326
3327static inline bool cfs_bandwidth_used(void)
3328{
3329        return static_key_false(&__cfs_bandwidth_used);
3330}
3331
3332void cfs_bandwidth_usage_inc(void)
3333{
3334        static_key_slow_inc(&__cfs_bandwidth_used);
3335}
3336
3337void cfs_bandwidth_usage_dec(void)
3338{
3339        static_key_slow_dec(&__cfs_bandwidth_used);
3340}
3341#else /* HAVE_JUMP_LABEL */
3342static bool cfs_bandwidth_used(void)
3343{
3344        return true;
3345}
3346
3347void cfs_bandwidth_usage_inc(void) {}
3348void cfs_bandwidth_usage_dec(void) {}
3349#endif /* HAVE_JUMP_LABEL */
3350
3351/*
3352 * default period for cfs group bandwidth.
3353 * default: 0.1s, units: nanoseconds
3354 */
3355static inline u64 default_cfs_period(void)
3356{
3357        return 100000000ULL;
3358}
3359
3360static inline u64 sched_cfs_bandwidth_slice(void)
3361{
3362        return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
3363}
3364
3365/*
3366 * Replenish runtime according to assigned quota and update expiration time.
3367 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
3368 * additional synchronization around rq->lock.
3369 *
3370 * requires cfs_b->lock
3371 */
3372void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
3373{
3374        u64 now;
3375
3376        if (cfs_b->quota == RUNTIME_INF)
3377                return;
3378
3379        now = sched_clock_cpu(smp_processor_id());
3380        cfs_b->runtime = cfs_b->quota;
3381        cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
3382}
3383
3384static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3385{
3386        return &tg->cfs_bandwidth;
3387}
3388
3389/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
3390static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3391{
3392        if (unlikely(cfs_rq->throttle_count))
3393                return cfs_rq->throttled_clock_task;
3394
3395        return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3396}
3397
3398/* returns 0 on failure to allocate runtime */
3399static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3400{
3401        struct task_group *tg = cfs_rq->tg;
3402        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
3403        u64 amount = 0, min_amount, expires;
3404
3405        /* note: this is a positive sum as runtime_remaining <= 0 */
3406        min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
3407
3408        raw_spin_lock(&cfs_b->lock);
3409        if (cfs_b->quota == RUNTIME_INF)
3410                amount = min_amount;
3411        else {
3412                start_cfs_bandwidth(cfs_b);
3413
3414                if (cfs_b->runtime > 0) {
3415                        amount = min(cfs_b->runtime, min_amount);
3416                        cfs_b->runtime -= amount;
3417                        cfs_b->idle = 0;
3418                }
3419        }
3420        expires = cfs_b->runtime_expires;
3421        raw_spin_unlock(&cfs_b->lock);
3422
3423        cfs_rq->runtime_remaining += amount;
3424        /*
3425         * we may have advanced our local expiration to account for allowed
3426         * spread between our sched_clock and the one on which runtime was
3427         * issued.
3428         */
3429        if ((s64)(expires - cfs_rq->runtime_expires) > 0)
3430                cfs_rq->runtime_expires = expires;
3431
3432        return cfs_rq->runtime_remaining > 0;
3433}
3434
3435/*
3436 * Note: This depends on the synchronization provided by sched_clock and the
3437 * fact that rq->clock snapshots this value.
3438 */
3439static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3440{
3441        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3442
3443        /* if the deadline is ahead of our clock, nothing to do */
3444        if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
3445                return;
3446
3447        if (cfs_rq->runtime_remaining < 0)
3448                return;
3449
3450        /*
3451         * If the local deadline has passed we have to consider the
3452         * possibility that our sched_clock is 'fast' and the global deadline
3453         * has not truly expired.
3454         *
3455         * Fortunately we can check determine whether this the case by checking
3456         * whether the global deadline has advanced. It is valid to compare
3457         * cfs_b->runtime_expires without any locks since we only care about
3458         * exact equality, so a partial write will still work.
3459         */
3460
3461        if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
3462                /* extend local deadline, drift is bounded above by 2 ticks */
3463                cfs_rq->runtime_expires += TICK_NSEC;
3464        } else {
3465                /* global deadline is ahead, expiration has passed */
3466                cfs_rq->runtime_remaining = 0;
3467        }
3468}
3469
3470static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3471{
3472        /* dock delta_exec before expiring quota (as it could span periods) */
3473        cfs_rq->runtime_remaining -= delta_exec;
3474        expire_cfs_rq_runtime(cfs_rq);
3475
3476        if (likely(cfs_rq->runtime_remaining > 0))
3477                return;
3478
3479        /*
3480         * if we're unable to extend our runtime we resched so that the active
3481         * hierarchy can be throttled
3482         */
3483        if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3484                resched_curr(rq_of(cfs_rq));
3485}
3486
3487static __always_inline
3488void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3489{
3490        if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3491                return;
3492
3493        __account_cfs_rq_runtime(cfs_rq, delta_exec);
3494}
3495
3496static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3497{
3498        return cfs_bandwidth_used() && cfs_rq->throttled;
3499}
3500
3501/* check whether cfs_rq, or any parent, is throttled */
3502static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3503{
3504        return cfs_bandwidth_used() && cfs_rq->throttle_count;
3505}
3506
3507/*
3508 * Ensure that neither of the group entities corresponding to src_cpu or
3509 * dest_cpu are members of a throttled hierarchy when performing group
3510 * load-balance operations.
3511 */
3512static inline int throttled_lb_pair(struct task_group *tg,
3513                                    int src_cpu, int dest_cpu)
3514{
3515        struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3516
3517        src_cfs_rq = tg->cfs_rq[src_cpu];
3518        dest_cfs_rq = tg->cfs_rq[dest_cpu];
3519
3520        return throttled_hierarchy(src_cfs_rq) ||
3521               throttled_hierarchy(dest_cfs_rq);
3522}
3523
3524/* updated child weight may affect parent so we have to do this bottom up */
3525static int tg_unthrottle_up(struct task_group *tg, void *data)
3526{
3527        struct rq *rq = data;
3528        struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3529
3530        cfs_rq->throttle_count--;
3531#ifdef CONFIG_SMP
3532        if (!cfs_rq->throttle_count) {
3533                /* adjust cfs_rq_clock_task() */
3534                cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3535                                             cfs_rq->throttled_clock_task;
3536        }
3537#endif
3538
3539        return 0;
3540}
3541
3542static int tg_throttle_down(struct task_group *tg, void *data)
3543{
3544        struct rq *rq = data;
3545        struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3546
3547        /* group is entering throttled state, stop time */
3548        if (!cfs_rq->throttle_count)
3549                cfs_rq->throttled_clock_task = rq_clock_task(rq);
3550        cfs_rq->throttle_count++;
3551
3552        return 0;
3553}
3554
3555static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3556{
3557        struct rq *rq = rq_of(cfs_rq);
3558        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3559        struct sched_entity *se;
3560        long task_delta, dequeue = 1;
3561        bool empty;
3562
3563        se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3564
3565        /* freeze hierarchy runnable averages while throttled */
3566        rcu_read_lock();
3567        walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3568        rcu_read_unlock();
3569
3570        task_delta = cfs_rq->h_nr_running;
3571        for_each_sched_entity(se) {
3572                struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3573                /* throttled entity or throttle-on-deactivate */
3574                if (!se->on_rq)
3575                        break;
3576
3577                if (dequeue)
3578                        dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3579                qcfs_rq->h_nr_running -= task_delta;
3580
3581                if (qcfs_rq->load.weight)
3582                        dequeue = 0;
3583        }
3584
3585        if (!se)
3586                sub_nr_running(rq, task_delta);
3587
3588        cfs_rq->throttled = 1;
3589        cfs_rq->throttled_clock = rq_clock(rq);
3590        raw_spin_lock(&cfs_b->lock);
3591        empty = list_empty(&cfs_b->throttled_cfs_rq);
3592
3593        /*
3594         * Add to the _head_ of the list, so that an already-started
3595         * distribute_cfs_runtime will not see us
3596         */
3597        list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3598
3599        /*
3600         * If we're the first throttled task, make sure the bandwidth
3601         * timer is running.
3602         */
3603        if (empty)
3604                start_cfs_bandwidth(cfs_b);
3605
3606        raw_spin_unlock(&cfs_b->lock);
3607}
3608
3609void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3610{
3611        struct rq *rq = rq_of(cfs_rq);
3612        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3613        struct sched_entity *se;
3614        int enqueue = 1;
3615        long task_delta;
3616
3617        se = cfs_rq->tg->se[cpu_of(rq)];
3618
3619        cfs_rq->throttled = 0;
3620
3621        update_rq_clock(rq);
3622
3623        raw_spin_lock(&cfs_b->lock);
3624        cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
3625        list_del_rcu(&cfs_rq->throttled_list);
3626        raw_spin_unlock(&cfs_b->lock);
3627
3628        /* update hierarchical throttle state */
3629        walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3630
3631        if (!cfs_rq->load.weight)
3632                return;
3633
3634        task_delta = cfs_rq->h_nr_running;
3635        for_each_sched_entity(se) {
3636                if (se->on_rq)
3637                        enqueue = 0;
3638
3639                cfs_rq = cfs_rq_of(se);
3640                if (enqueue)
3641                        enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3642                cfs_rq->h_nr_running += task_delta;
3643
3644                if (cfs_rq_throttled(cfs_rq))
3645                        break;
3646        }
3647
3648        if (!se)
3649                add_nr_running(rq, task_delta);
3650
3651        /* determine whether we need to wake up potentially idle cpu */
3652        if (rq->curr == rq->idle && rq->cfs.nr_running)
3653                resched_curr(rq);
3654}
3655
3656static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3657                u64 remaining, u64 expires)
3658{
3659        struct cfs_rq *cfs_rq;
3660        u64 runtime;
3661        u64 starting_runtime = remaining;
3662
3663        rcu_read_lock();
3664        list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
3665                                throttled_list) {
3666                struct rq *rq = rq_of(cfs_rq);
3667
3668                raw_spin_lock(&rq->lock);
3669                if (!cfs_rq_throttled(cfs_rq))
3670                        goto next;
3671
3672                runtime = -cfs_rq->runtime_remaining + 1;
3673                if (runtime > remaining)
3674                        runtime = remaining;
3675                remaining -= runtime;
3676
3677                cfs_rq->runtime_remaining += runtime;
3678                cfs_rq->runtime_expires = expires;
3679
3680                /* we check whether we're throttled above */
3681                if (cfs_rq->runtime_remaining > 0)
3682                        unthrottle_cfs_rq(cfs_rq);
3683
3684next:
3685                raw_spin_unlock(&rq->lock);
3686
3687                if (!remaining)
3688                        break;
3689        }
3690        rcu_read_unlock();
3691
3692        return starting_runtime - remaining;
3693}
3694
3695/*
3696 * Responsible for refilling a task_group's bandwidth and unthrottling its
3697 * cfs_rqs as appropriate. If there has been no activity within the last
3698 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
3699 * used to track this state.
3700 */
3701static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3702{
3703        u64 runtime, runtime_expires;
3704        int throttled;
3705
3706        /* no need to continue the timer with no bandwidth constraint */
3707        if (cfs_b->quota == RUNTIME_INF)
3708                goto out_deactivate;
3709
3710        throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3711        cfs_b->nr_periods += overrun;
3712
3713        /*
3714         * idle depends on !throttled (for the case of a large deficit), and if
3715         * we're going inactive then everything else can be deferred
3716         */
3717        if (cfs_b->idle && !throttled)
3718                goto out_deactivate;
3719
3720        __refill_cfs_bandwidth_runtime(cfs_b);
3721
3722        if (!throttled) {
3723                /* mark as potentially idle for the upcoming period */
3724                cfs_b->idle = 1;
3725                return 0;
3726        }
3727
3728        /* account preceding periods in which throttling occurred */
3729        cfs_b->nr_throttled += overrun;
3730
3731        runtime_expires = cfs_b->runtime_expires;
3732
3733        /*
3734         * This check is repeated as we are holding onto the new bandwidth while
3735         * we unthrottle. This can potentially race with an unthrottled group
3736         * trying to acquire new bandwidth from the global pool. This can result
3737         * in us over-using our runtime if it is all used during this loop, but
3738         * only by limited amounts in that extreme case.
3739         */
3740        while (throttled && cfs_b->runtime > 0) {
3741                runtime = cfs_b->runtime;
3742                raw_spin_unlock(&cfs_b->lock);
3743                /* we can't nest cfs_b->lock while distributing bandwidth */
3744                runtime = distribute_cfs_runtime(cfs_b, runtime,
3745                                                 runtime_expires);
3746                raw_spin_lock(&cfs_b->lock);
3747
3748                throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3749
3750                cfs_b->runtime -= min(runtime, cfs_b->runtime);
3751        }
3752
3753        /*
3754         * While we are ensured activity in the period following an
3755         * unthrottle, this also covers the case in which the new bandwidth is
3756         * insufficient to cover the existing bandwidth deficit.  (Forcing the
3757         * timer to remain active while there are any throttled entities.)
3758         */
3759        cfs_b->idle = 0;
3760
3761        return 0;
3762
3763out_deactivate:
3764        return 1;
3765}
3766
3767/* a cfs_rq won't donate quota below this amount */
3768static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
3769/* minimum remaining period time to redistribute slack quota */
3770static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
3771/* how long we wait to gather additional slack before distributing */
3772static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
3773
3774/*
3775 * Are we near the end of the current quota period?
3776 *
3777 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3778 * hrtimer base being cleared by hrtimer_start. In the case of
3779 * migrate_hrtimers, base is never cleared, so we are fine.
3780 */
3781static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
3782{
3783        struct hrtimer *refresh_timer = &cfs_b->period_timer;
3784        u64 remaining;
3785
3786        /* if the call-back is running a quota refresh is already occurring */
3787        if (hrtimer_callback_running(refresh_timer))
3788                return 1;
3789
3790        /* is a quota refresh about to occur? */
3791        remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
3792        if (remaining < min_expire)
3793                return 1;
3794
3795        return 0;
3796}
3797
3798static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
3799{
3800        u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
3801
3802        /* if there's a quota refresh soon don't bother with slack */
3803        if (runtime_refresh_within(cfs_b, min_left))
3804                return;
3805
3806        hrtimer_start(&cfs_b->slack_timer,
3807                        ns_to_ktime(cfs_bandwidth_slack_period),
3808                        HRTIMER_MODE_REL);
3809}
3810
3811/* we know any runtime found here is valid as update_curr() precedes return */
3812static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3813{
3814        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3815        s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
3816
3817        if (slack_runtime <= 0)
3818                return;
3819
3820        raw_spin_lock(&cfs_b->lock);
3821        if (cfs_b->quota != RUNTIME_INF &&
3822            cfs_rq->runtime_expires == cfs_b->runtime_expires) {
3823                cfs_b->runtime += slack_runtime;
3824
3825                /* we are under rq->lock, defer unthrottling using a timer */
3826                if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
3827                    !list_empty(&cfs_b->throttled_cfs_rq))
3828                        start_cfs_slack_bandwidth(cfs_b);
3829        }
3830        raw_spin_unlock(&cfs_b->lock);
3831
3832        /* even if it's not valid for return we don't want to try again */
3833        cfs_rq->runtime_remaining -= slack_runtime;
3834}
3835
3836static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3837{
3838        if (!cfs_bandwidth_used())
3839                return;
3840
3841        if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
3842                return;
3843
3844        __return_cfs_rq_runtime(cfs_rq);
3845}
3846
3847/*
3848 * This is done with a timer (instead of inline with bandwidth return) since
3849 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
3850 */
3851static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3852{
3853        u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
3854        u64 expires;
3855
3856        /* confirm we're still not at a refresh boundary */
3857        raw_spin_lock(&cfs_b->lock);
3858        if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3859                raw_spin_unlock(&cfs_b->lock);
3860                return;
3861        }
3862
3863        if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
3864                runtime = cfs_b->runtime;
3865
3866        expires = cfs_b->runtime_expires;
3867        raw_spin_unlock(&cfs_b->lock);
3868
3869        if (!runtime)
3870                return;
3871
3872        runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
3873
3874        raw_spin_lock(&cfs_b->lock);
3875        if (expires == cfs_b->runtime_expires)
3876                cfs_b->runtime -= min(runtime, cfs_b->runtime);
3877        raw_spin_unlock(&cfs_b->lock);
3878}
3879
3880/*
3881 * When a group wakes up we want to make sure that its quota is not already
3882 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
3883 * runtime as update_curr() throttling can not not trigger until it's on-rq.
3884 */
3885static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3886{
3887        if (!cfs_bandwidth_used())
3888                return;
3889
3890        /* an active group must be handled by the update_curr()->put() path */
3891        if (!cfs_rq->runtime_enabled || cfs_rq->curr)
3892                return;
3893
3894        /* ensure the group is not already throttled */
3895        if (cfs_rq_throttled(cfs_rq))
3896                return;
3897
3898        /* update runtime allocation */
3899        account_cfs_rq_runtime(cfs_rq, 0);
3900        if (cfs_rq->runtime_remaining <= 0)
3901                throttle_cfs_rq(cfs_rq);
3902}
3903
3904/* conditionally throttle active cfs_rq's from put_prev_entity() */
3905static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3906{
3907        if (!cfs_bandwidth_used())
3908                return false;
3909
3910        if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3911                return false;
3912
3913        /*
3914         * it's possible for a throttled entity to be forced into a running
3915         * state (e.g. set_curr_task), in this case we're finished.
3916         */
3917        if (cfs_rq_throttled(cfs_rq))
3918                return true;
3919
3920        throttle_cfs_rq(cfs_rq);
3921        return true;
3922}
3923
3924static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
3925{
3926        struct cfs_bandwidth *cfs_b =
3927                container_of(timer, struct cfs_bandwidth, slack_timer);
3928
3929        do_sched_cfs_slack_timer(cfs_b);
3930
3931        return HRTIMER_NORESTART;
3932}
3933
3934static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3935{
3936        struct cfs_bandwidth *cfs_b =
3937                container_of(timer, struct cfs_bandwidth, period_timer);
3938        int overrun;
3939        int idle = 0;
3940
3941        raw_spin_lock(&cfs_b->lock);
3942        for (;;) {
3943                overrun = hrtimer_forward_now(timer, cfs_b->period);
3944                if (!overrun)
3945                        break;
3946
3947                idle = do_sched_cfs_period_timer(cfs_b, overrun);
3948        }
3949        if (idle)
3950                cfs_b->period_active = 0;
3951        raw_spin_unlock(&cfs_b->lock);
3952
3953        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
3954}
3955
3956void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3957{
3958        raw_spin_lock_init(&cfs_b->lock);
3959        cfs_b->runtime = 0;
3960        cfs_b->quota = RUNTIME_INF;
3961        cfs_b->period = ns_to_ktime(default_cfs_period());
3962
3963        INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
3964        hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
3965        cfs_b->period_timer.function = sched_cfs_period_timer;
3966        hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3967        cfs_b->slack_timer.function = sched_cfs_slack_timer;
3968}
3969
3970static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3971{
3972        cfs_rq->runtime_enabled = 0;
3973        INIT_LIST_HEAD(&cfs_rq->throttled_list);
3974}
3975
3976void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3977{
3978        lockdep_assert_held(&cfs_b->lock);
3979
3980        if (!cfs_b->period_active) {
3981                cfs_b->period_active = 1;
3982                hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
3983                hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
3984        }
3985}
3986
3987static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3988{
3989        /* init_cfs_bandwidth() was not called */
3990        if (!cfs_b->throttled_cfs_rq.next)
3991                return;
3992
3993        hrtimer_cancel(&cfs_b->period_timer);
3994        hrtimer_cancel(&cfs_b->slack_timer);
3995}
3996
3997static void __maybe_unused update_runtime_enabled(struct rq *rq)
3998{
3999        struct cfs_rq *cfs_rq;
4000
4001        for_each_leaf_cfs_rq(rq, cfs_rq) {
4002                struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
4003
4004                raw_spin_lock(&cfs_b->lock);
4005                cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4006                raw_spin_unlock(&cfs_b->lock);
4007        }
4008}
4009
4010static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4011{
4012        struct cfs_rq *cfs_rq;
4013
4014        for_each_leaf_cfs_rq(rq, cfs_rq) {
4015                if (!cfs_rq->runtime_enabled)
4016                        continue;
4017
4018                /*
4019                 * clock_task is not advancing so we just need to make sure
4020                 * there's some valid quota amount
4021                 */
4022                cfs_rq->runtime_remaining = 1;
4023                /*
4024                 * Offline rq is schedulable till cpu is completely disabled
4025                 * in take_cpu_down(), so we prevent new cfs throttling here.
4026                 */
4027                cfs_rq->runtime_enabled = 0;
4028
4029                if (cfs_rq_throttled(cfs_rq))
4030                        unthrottle_cfs_rq(cfs_rq);
4031        }
4032}
4033
4034#else /* CONFIG_CFS_BANDWIDTH */
4035static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4036{
4037        return rq_clock_task(rq_of(cfs_rq));
4038}
4039
4040static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
4041static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
4042static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
4043static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4044
4045static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4046{
4047        return 0;
4048}
4049
4050static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4051{
4052        return 0;
4053}
4054
4055static inline int throttled_lb_pair(struct task_group *tg,
4056                                    int src_cpu, int dest_cpu)
4057{
4058        return 0;
4059}
4060
4061void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4062
4063#ifdef CONFIG_FAIR_GROUP_SCHED
4064static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4065#endif
4066
4067static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4068{
4069        return NULL;
4070}
4071static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4072static inline void update_runtime_enabled(struct rq *rq) {}
4073static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
4074
4075#endif /* CONFIG_CFS_BANDWIDTH */
4076
4077/**************************************************
4078 * CFS operations on tasks:
4079 */
4080
4081#ifdef CONFIG_SCHED_HRTICK
4082static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4083{
4084        struct sched_entity *se = &p->se;
4085        struct cfs_rq *cfs_rq = cfs_rq_of(se);
4086
4087        WARN_ON(task_rq(p) != rq);
4088
4089        if (cfs_rq->nr_running > 1) {
4090                u64 slice = sched_slice(cfs_rq, se);
4091                u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4092                s64 delta = slice - ran;
4093
4094                if (delta < 0) {
4095                        if (rq->curr == p)
4096                                resched_curr(rq);
4097                        return;
4098                }
4099                hrtick_start(rq, delta);
4100        }
4101}
4102
4103/*
4104 * called from enqueue/dequeue and updates the hrtick when the
4105 * current task is from our class and nr_running is low enough
4106 * to matter.
4107 */
4108static void hrtick_update(struct rq *rq)
4109{
4110        struct task_struct *curr = rq->curr;
4111
4112        if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
4113                return;
4114
4115        if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
4116                hrtick_start_fair(rq, curr);
4117}
4118#else /* !CONFIG_SCHED_HRTICK */
4119static inline void
4120hrtick_start_fair(struct rq *rq, struct task_struct *p)
4121{
4122}
4123
4124static inline void hrtick_update(struct rq *rq)
4125{
4126}
4127#endif
4128
4129/*
4130 * The enqueue_task method is called before nr_running is
4131 * increased. Here we update the fair scheduling stats and
4132 * then put the task into the rbtree:
4133 */
4134static void
4135enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4136{
4137        struct cfs_rq *cfs_rq;
4138        struct sched_entity *se = &p->se;
4139
4140        for_each_sched_entity(se) {
4141                if (se->on_rq)
4142                        break;
4143                cfs_rq = cfs_rq_of(se);
4144                enqueue_entity(cfs_rq, se, flags);
4145
4146                /*
4147                 * end evaluation on encountering a throttled cfs_rq
4148                 *
4149                 * note: in the case of encountering a throttled cfs_rq we will
4150                 * post the final h_nr_running increment below.
4151                */
4152                if (cfs_rq_throttled(cfs_rq))
4153                        break;
4154                cfs_rq->h_nr_running++;
4155
4156                flags = ENQUEUE_WAKEUP;
4157        }
4158
4159        for_each_sched_entity(se) {
4160                cfs_rq = cfs_rq_of(se);
4161                cfs_rq->h_nr_running++;
4162
4163                if (cfs_rq_throttled(cfs_rq))
4164                        break;
4165
4166                update_load_avg(se, 1);
4167                update_cfs_shares(cfs_rq);
4168        }
4169
4170        if (!se)
4171                add_nr_running(rq, 1);
4172
4173        hrtick_update(rq);
4174}
4175
4176static void set_next_buddy(struct sched_entity *se);
4177
4178/*
4179 * The dequeue_task method is called before nr_running is
4180 * decreased. We remove the task from the rbtree and
4181 * update the fair scheduling stats:
4182 */
4183static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4184{
4185        struct cfs_rq *cfs_rq;
4186        struct sched_entity *se = &p->se;
4187        int task_sleep = flags & DEQUEUE_SLEEP;
4188
4189        for_each_sched_entity(se) {
4190                cfs_rq = cfs_rq_of(se);
4191                dequeue_entity(cfs_rq, se, flags);
4192
4193                /*
4194                 * end evaluation on encountering a throttled cfs_rq
4195                 *
4196                 * note: in the case of encountering a throttled cfs_rq we will
4197                 * post the final h_nr_running decrement below.
4198                */
4199                if (cfs_rq_throttled(cfs_rq))
4200                        break;
4201                cfs_rq->h_nr_running--;
4202
4203                /* Don't dequeue parent if it has other entities besides us */
4204                if (cfs_rq->load.weight) {
4205                        /*
4206                         * Bias pick_next to pick a task from this cfs_rq, as
4207                         * p is sleeping when it is within its sched_slice.
4208                         */
4209                        if (task_sleep && parent_entity(se))
4210                                set_next_buddy(parent_entity(se));
4211
4212                        /* avoid re-evaluating load for this entity */
4213                        se = parent_entity(se);
4214                        break;
4215                }
4216                flags |= DEQUEUE_SLEEP;
4217        }
4218
4219        for_each_sched_entity(se) {
4220                cfs_rq = cfs_rq_of(se);
4221                cfs_rq->h_nr_running--;
4222
4223                if (cfs_rq_throttled(cfs_rq))
4224                        break;
4225
4226                update_load_avg(se, 1);
4227                update_cfs_shares(cfs_rq);
4228        }
4229
4230        if (!se)
4231                sub_nr_running(rq, 1);
4232
4233        hrtick_update(rq);
4234}
4235
4236#ifdef CONFIG_SMP
4237
4238/*
4239 * per rq 'load' arrray crap; XXX kill this.
4240 */
4241
4242/*
4243 * The exact cpuload at various idx values, calculated at every tick would be
4244 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
4245 *
4246 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
4247 * on nth tick when cpu may be busy, then we have:
4248 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4249 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
4250 *
4251 * decay_load_missed() below does efficient calculation of
4252 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4253 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
4254 *
4255 * The calculation is approximated on a 128 point scale.
4256 * degrade_zero_ticks is the number of ticks after which load at any
4257 * particular idx is approximated to be zero.
4258 * degrade_factor is a precomputed table, a row for each load idx.
4259 * Each column corresponds to degradation factor for a power of two ticks,
4260 * based on 128 point scale.
4261 * Example:
4262 * row 2, col 3 (=12) says that the degradation at load idx 2 after
4263 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
4264 *
4265 * With this power of 2 load factors, we can degrade the load n times
4266 * by looking at 1 bits in n and doing as many mult/shift instead of
4267 * n mult/shifts needed by the exact degradation.
4268 */
4269#define DEGRADE_SHIFT           7
4270static const unsigned char
4271                degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4272static const unsigned char
4273                degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4274                                        {0, 0, 0, 0, 0, 0, 0, 0},
4275                                        {64, 32, 8, 0, 0, 0, 0, 0},
4276                                        {96, 72, 40, 12, 1, 0, 0},
4277                                        {112, 98, 75, 43, 15, 1, 0},
4278                                        {120, 112, 98, 76, 45, 16, 2} };
4279
4280/*
4281 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
4282 * would be when CPU is idle and so we just decay the old load without
4283 * adding any new load.
4284 */
4285static unsigned long
4286decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4287{
4288        int j = 0;
4289
4290        if (!missed_updates)
4291                return load;
4292
4293        if (missed_updates >= degrade_zero_ticks[idx])
4294                return 0;
4295
4296        if (idx == 1)
4297                return load >> missed_updates;
4298
4299        while (missed_updates) {
4300                if (missed_updates % 2)
4301                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
4302
4303                missed_updates >>= 1;
4304                j++;
4305        }
4306        return load;
4307}
4308
4309/*
4310 * Update rq->cpu_load[] statistics. This function is usually called every
4311 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
4312 * every tick. We fix it up based on jiffies.
4313 */
4314static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4315                              unsigned long pending_updates)
4316{
4317        int i, scale;
4318
4319        this_rq->nr_load_updates++;
4320
4321        /* Update our load: */
4322        this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
4323        for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
4324                unsigned long old_load, new_load;
4325
4326                /* scale is effectively 1 << i now, and >> i divides by scale */
4327
4328                old_load = this_rq->cpu_load[i];
4329                old_load = decay_load_missed(old_load, pending_updates - 1, i);
4330                new_load = this_load;
4331                /*
4332                 * Round up the averaging division if load is increasing. This
4333                 * prevents us from getting stuck on 9 if the load is 10, for
4334                 * example.
4335                 */
4336                if (new_load > old_load)
4337                        new_load += scale - 1;
4338
4339                this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
4340        }
4341
4342        sched_avg_update(this_rq);
4343}
4344
4345/* Used instead of source_load when we know the type == 0 */
4346static unsigned long weighted_cpuload(const int cpu)
4347{
4348        return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
4349}
4350
4351#ifdef CONFIG_NO_HZ_COMMON
4352/*
4353 * There is no sane way to deal with nohz on smp when using jiffies because the
4354 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
4355 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
4356 *
4357 * Therefore we cannot use the delta approach from the regular tick since that
4358 * would seriously skew the load calculation. However we'll make do for those
4359 * updates happening while idle (nohz_idle_balance) or coming out of idle
4360 * (tick_nohz_idle_exit).
4361 *
4362 * This means we might still be one tick off for nohz periods.
4363 */
4364
4365/*
4366 * Called from nohz_idle_balance() to update the load ratings before doing the
4367 * idle balance.
4368 */
4369static void update_idle_cpu_load(struct rq *this_rq)
4370{
4371        unsigned long curr_jiffies = READ_ONCE(jiffies);
4372        unsigned long load = weighted_cpuload(cpu_of(this_rq));
4373        unsigned long pending_updates;
4374
4375        /*
4376         * bail if there's load or we're actually up-to-date.
4377         */
4378        if (load || curr_jiffies == this_rq->last_load_update_tick)
4379                return;
4380
4381        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4382        this_rq->last_load_update_tick = curr_jiffies;
4383
4384        __update_cpu_load(this_rq, load, pending_updates);
4385}
4386
4387/*
4388 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
4389 */
4390void update_cpu_load_nohz(void)
4391{
4392        struct rq *this_rq = this_rq();
4393        unsigned long curr_jiffies = READ_ONCE(jiffies);
4394        unsigned long pending_updates;
4395
4396        if (curr_jiffies == this_rq->last_load_update_tick)
4397                return;
4398
4399        raw_spin_lock(&this_rq->lock);
4400        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4401        if (pending_updates) {
4402                this_rq->last_load_update_tick = curr_jiffies;
4403                /*
4404                 * We were idle, this means load 0, the current load might be
4405                 * !0 due to remote wakeups and the sort.
4406                 */
4407                __update_cpu_load(this_rq, 0, pending_updates);
4408        }
4409        raw_spin_unlock(&this_rq->lock);
4410}
4411#endif /* CONFIG_NO_HZ */
4412
4413/*
4414 * Called from scheduler_tick()
4415 */
4416void update_cpu_load_active(struct rq *this_rq)
4417{
4418        unsigned long load = weighted_cpuload(cpu_of(this_rq));
4419        /*
4420         * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
4421         */
4422        this_rq->last_load_update_tick = jiffies;
4423        __update_cpu_load(this_rq, load, 1);
4424}
4425
4426/*
4427 * Return a low guess at the load of a migration-source cpu weighted
4428 * according to the scheduling class and "nice" value.
4429 *
4430 * We want to under-estimate the load of migration sources, to
4431 * balance conservatively.
4432 */
4433static unsigned long source_load(int cpu, int type)
4434{
4435        struct rq *rq = cpu_rq(cpu);
4436        unsigned long total = weighted_cpuload(cpu);
4437
4438        if (type == 0 || !sched_feat(LB_BIAS))
4439                return total;
4440
4441        return min(rq->cpu_load[type-1], total);
4442}
4443
4444/*
4445 * Return a high guess at the load of a migration-target cpu weighted
4446 * according to the scheduling class and "nice" value.
4447 */
4448static unsigned long target_load(int cpu, int type)
4449{
4450        struct rq *rq = cpu_rq(cpu);
4451        unsigned long total = weighted_cpuload(cpu);
4452
4453        if (type == 0 || !sched_feat(LB_BIAS))
4454                return total;
4455
4456        return max(rq->cpu_load[type-1], total);
4457}
4458
4459static unsigned long capacity_of(int cpu)
4460{
4461        return cpu_rq(cpu)->cpu_capacity;
4462}
4463
4464static unsigned long capacity_orig_of(int cpu)
4465{
4466        return cpu_rq(cpu)->cpu_capacity_orig;
4467}
4468
4469static unsigned long cpu_avg_load_per_task(int cpu)
4470{
4471        struct rq *rq = cpu_rq(cpu);
4472        unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
4473        unsigned long load_avg = weighted_cpuload(cpu);
4474
4475        if (nr_running)
4476                return load_avg / nr_running;
4477
4478        return 0;
4479}
4480
4481static void record_wakee(struct task_struct *p)
4482{
4483        /*
4484         * Rough decay (wiping) for cost saving, don't worry
4485         * about the boundary, really active task won't care
4486         * about the loss.
4487         */
4488        if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
4489                current->wakee_flips >>= 1;
4490                current->wakee_flip_decay_ts = jiffies;
4491        }
4492
4493        if (current->last_wakee != p) {
4494                current->last_wakee = p;
4495                current->wakee_flips++;
4496        }
4497}
4498
4499static void task_waking_fair(struct task_struct *p)
4500{
4501        struct sched_entity *se = &p->se;
4502        struct cfs_rq *cfs_rq = cfs_rq_of(se);
4503        u64 min_vruntime;
4504
4505#ifndef CONFIG_64BIT
4506        u64 min_vruntime_copy;
4507
4508        do {
4509                min_vruntime_copy = cfs_rq->min_vruntime_copy;
4510                smp_rmb();
4511                min_vruntime = cfs_rq->min_vruntime;
4512        } while (min_vruntime != min_vruntime_copy);
4513#else
4514        min_vruntime = cfs_rq->min_vruntime;
4515#endif
4516
4517        se->vruntime -= min_vruntime;
4518        record_wakee(p);
4519}
4520
4521#ifdef CONFIG_FAIR_GROUP_SCHED
4522/*
4523 * effective_load() calculates the load change as seen from the root_task_group
4524 *
4525 * Adding load to a group doesn't make a group heavier, but can cause movement
4526 * of group shares between cpus. Assuming the shares were perfectly aligned one
4527 * can calculate the shift in shares.
4528 *
4529 * Calculate the effective load difference if @wl is added (subtracted) to @tg
4530 * on this @cpu and results in a total addition (subtraction) of @wg to the
4531 * total group weight.
4532 *
4533 * Given a runqueue weight distribution (rw_i) we can compute a shares
4534 * distribution (s_i) using:
4535 *
4536 *   s_i = rw_i / \Sum rw_j                                             (1)
4537 *
4538 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
4539 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
4540 * shares distribution (s_i):
4541 *
4542 *   rw_i = {   2,   4,   1,   0 }
4543 *   s_i  = { 2/7, 4/7, 1/7,   0 }
4544 *
4545 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
4546 * task used to run on and the CPU the waker is running on), we need to
4547 * compute the effect of waking a task on either CPU and, in case of a sync
4548 * wakeup, compute the effect of the current task going to sleep.
4549 *
4550 * So for a change of @wl to the local @cpu with an overall group weight change
4551 * of @wl we can compute the new shares distribution (s'_i) using:
4552 *
4553 *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)                            (2)
4554 *
4555 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
4556 * differences in waking a task to CPU 0. The additional task changes the
4557 * weight and shares distributions like:
4558 *
4559 *   rw'_i = {   3,   4,   1,   0 }
4560 *   s'_i  = { 3/8, 4/8, 1/8,   0 }
4561 *
4562 * We can then compute the difference in effective weight by using:
4563 *
4564 *   dw_i = S * (s'_i - s_i)                                            (3)
4565 *
4566 * Where 'S' is the group weight as seen by its parent.
4567 *
4568 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
4569 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
4570 * 4/7) times the weight of the group.
4571 */
4572static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4573{
4574        struct sched_entity *se = tg->se[cpu];
4575
4576        if (!tg->parent)        /* the trivial, non-cgroup case */
4577                return wl;
4578
4579        for_each_sched_entity(se) {
4580                long w, W;
4581
4582                tg = se->my_q->tg;
4583
4584                /*
4585                 * W = @wg + \Sum rw_j
4586                 */
4587                W = wg + calc_tg_weight(tg, se->my_q);
4588
4589                /*
4590                 * w = rw_i + @wl
4591                 */
4592                w = cfs_rq_load_avg(se->my_q) + wl;
4593
4594                /*
4595                 * wl = S * s'_i; see (2)
4596                 */
4597                if (W > 0 && w < W)
4598                        wl = (w * (long)tg->shares) / W;
4599                else
4600                        wl = tg->shares;
4601
4602                /*
4603                 * Per the above, wl is the new se->load.weight value; since
4604                 * those are clipped to [MIN_SHARES, ...) do so now. See
4605                 * calc_cfs_shares().
4606                 */
4607                if (wl < MIN_SHARES)
4608                        wl = MIN_SHARES;
4609
4610                /*
4611                 * wl = dw_i = S * (s'_i - s_i); see (3)
4612                 */
4613                wl -= se->avg.load_avg;
4614
4615                /*
4616                 * Recursively apply this logic to all parent groups to compute
4617                 * the final effective load change on the root group. Since
4618                 * only the @tg group gets extra weight, all parent groups can
4619                 * only redistribute existing shares. @wl is the shift in shares
4620                 * resulting from this level per the above.
4621                 */
4622                wg = 0;
4623        }
4624
4625        return wl;
4626}
4627#else
4628
4629static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4630{
4631        return wl;
4632}
4633
4634#endif
4635
4636/*
4637 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
4638 * A waker of many should wake a different task than the one last awakened
4639 * at a frequency roughly N times higher than one of its wakees.  In order
4640 * to determine whether we should let the load spread vs consolodating to
4641 * shared cache, we look for a minimum 'flip' frequency of llc_size in one
4642 * partner, and a factor of lls_size higher frequency in the other.  With
4643 * both conditions met, we can be relatively sure that the relationship is
4644 * non-monogamous, with partner count exceeding socket size.  Waker/wakee
4645 * being client/server, worker/dispatcher, interrupt source or whatever is
4646 * irrelevant, spread criteria is apparent partner count exceeds socket size.
4647 */
4648static int wake_wide(struct task_struct *p)
4649{
4650        unsigned int master = current->wakee_flips;
4651        unsigned int slave = p->wakee_flips;
4652        int factor = this_cpu_read(sd_llc_size);
4653
4654        if (master < slave)
4655                swap(master, slave);
4656        if (slave < factor || master < slave * factor)
4657                return 0;
4658        return 1;
4659}
4660
4661static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4662{
4663        s64 this_load, load;
4664        s64 this_eff_load, prev_eff_load;
4665        int idx, this_cpu, prev_cpu;
4666        struct task_group *tg;
4667        unsigned long weight;
4668        int balanced;
4669
4670        idx       = sd->wake_idx;
4671        this_cpu  = smp_processor_id();
4672        prev_cpu  = task_cpu(p);
4673        load      = source_load(prev_cpu, idx);
4674        this_load = target_load(this_cpu, idx);
4675
4676        /*
4677         * If sync wakeup then subtract the (maximum possible)
4678         * effect of the currently running task from the load
4679         * of the current CPU:
4680         */
4681        if (sync) {
4682                tg = task_group(current);
4683                weight = current->se.avg.load_avg;
4684
4685                this_load += effective_load(tg, this_cpu, -weight, -weight);
4686                load += effective_load(tg, prev_cpu, 0, -weight);
4687        }
4688
4689        tg = task_group(p);
4690        weight = p->se.avg.load_avg;
4691
4692        /*
4693         * In low-load situations, where prev_cpu is idle and this_cpu is idle
4694         * due to the sync cause above having dropped this_load to 0, we'll
4695         * always have an imbalance, but there's really nothing you can do
4696         * about that, so that's good too.
4697         *
4698         * Otherwise check if either cpus are near enough in load to allow this
4699         * task to be woken on this_cpu.
4700         */
4701        this_eff_load = 100;
4702        this_eff_load *= capacity_of(prev_cpu);
4703
4704        prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4705        prev_eff_load *= capacity_of(this_cpu);
4706
4707        if (this_load > 0) {
4708                this_eff_load *= this_load +
4709                        effective_load(tg, this_cpu, weight, weight);
4710
4711                prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4712        }
4713
4714        balanced = this_eff_load <= prev_eff_load;
4715
4716        schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4717
4718        if (!balanced)
4719                return 0;
4720
4721        schedstat_inc(sd, ttwu_move_affine);
4722        schedstat_inc(p, se.statistics.nr_wakeups_affine);
4723
4724        return 1;
4725}
4726
4727/*
4728 * find_idlest_group finds and returns the least busy CPU group within the
4729 * domain.
4730 */
4731static struct sched_group *
4732find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4733                  int this_cpu, int sd_flag)
4734{
4735        struct sched_group *idlest = NULL, *group = sd->groups;
4736        unsigned long min_load = ULONG_MAX, this_load = 0;
4737        int load_idx = sd->forkexec_idx;
4738        int imbalance = 100 + (sd->imbalance_pct-100)/2;
4739
4740        if (sd_flag & SD_BALANCE_WAKE)
4741                load_idx = sd->wake_idx;
4742
4743        do {
4744                unsigned long load, avg_load;
4745                int local_group;
4746                int i;
4747
4748                /* Skip over this group if it has no CPUs allowed */
4749                if (!cpumask_intersects(sched_group_cpus(group),
4750                                        tsk_cpus_allowed(p)))
4751                        continue;
4752
4753                local_group = cpumask_test_cpu(this_cpu,
4754                                               sched_group_cpus(group));
4755
4756                /* Tally up the load of all CPUs in the group */
4757                avg_load = 0;
4758
4759                for_each_cpu(i, sched_group_cpus(group)) {
4760                        /* Bias balancing toward cpus of our domain */
4761                        if (local_group)
4762                                load = source_load(i, load_idx);
4763                        else
4764                                load = target_load(i, load_idx);
4765
4766                        avg_load += load;
4767                }
4768
4769                /* Adjust by relative CPU capacity of the group */
4770                avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
4771
4772                if (local_group) {
4773                        this_load = avg_load;
4774                } else if (avg_load < min_load) {
4775                        min_load = avg_load;
4776                        idlest = group;
4777                }
4778        } while (group = group->next, group != sd->groups);
4779
4780        if (!idlest || 100*this_load < imbalance*min_load)
4781                return NULL;
4782        return idlest;
4783}
4784
4785/*
4786 * find_idlest_cpu - find the idlest cpu among the cpus in group.
4787 */
4788static int
4789find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4790{
4791        unsigned long load, min_load = ULONG_MAX;
4792        unsigned int min_exit_latency = UINT_MAX;
4793        u64 latest_idle_timestamp = 0;
4794        int least_loaded_cpu = this_cpu;
4795        int shallowest_idle_cpu = -1;
4796        int i;
4797
4798        /* Traverse only the allowed CPUs */
4799        for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4800                if (idle_cpu(i)) {
4801                        struct rq *rq = cpu_rq(i);
4802                        struct cpuidle_state *idle = idle_get_state(rq);
4803                        if (idle && idle->exit_latency < min_exit_latency) {
4804                                /*
4805                                 * We give priority to a CPU whose idle state
4806                                 * has the smallest exit latency irrespective
4807                                 * of any idle timestamp.
4808                                 */
4809                                min_exit_latency = idle->exit_latency;
4810                                latest_idle_timestamp = rq->idle_stamp;
4811                                shallowest_idle_cpu = i;
4812                        } else if ((!idle || idle->exit_latency == min_exit_latency) &&
4813                                   rq->idle_stamp > latest_idle_timestamp) {
4814                                /*
4815                                 * If equal or no active idle state, then
4816                                 * the most recently idled CPU might have
4817                                 * a warmer cache.
4818                                 */
4819                                latest_idle_timestamp = rq->idle_stamp;
4820                                shallowest_idle_cpu = i;
4821                        }
4822                } else if (shallowest_idle_cpu == -1) {
4823                        load = weighted_cpuload(i);
4824                        if (load < min_load || (load == min_load && i == this_cpu)) {
4825                                min_load = load;
4826                                least_loaded_cpu = i;
4827                        }
4828                }
4829        }
4830
4831        return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
4832}
4833
4834/*
4835 * Try and locate an idle CPU in the sched_domain.
4836 */
4837static int select_idle_sibling(struct task_struct *p, int target)
4838{
4839        struct sched_domain *sd;
4840        struct sched_group *sg;
4841        int i = task_cpu(p);
4842
4843        if (idle_cpu(target))
4844                return target;
4845
4846        /*
4847         * If the prevous cpu is cache affine and idle, don't be stupid.
4848         */
4849        if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
4850                return i;
4851
4852        /*
4853         * Otherwise, iterate the domains and find an elegible idle cpu.
4854         */
4855        sd = rcu_dereference(per_cpu(sd_llc, target));
4856        for_each_lower_domain(sd) {
4857                sg = sd->groups;
4858                do {
4859                        if (!cpumask_intersects(sched_group_cpus(sg),
4860                                                tsk_cpus_allowed(p)))
4861                                goto next;
4862
4863                        for_each_cpu(i, sched_group_cpus(sg)) {
4864                                if (i == target || !idle_cpu(i))
4865                                        goto next;
4866                        }
4867
4868                        target = cpumask_first_and(sched_group_cpus(sg),
4869                                        tsk_cpus_allowed(p));
4870                        goto done;
4871next:
4872                        sg = sg->next;
4873                } while (sg != sd->groups);
4874        }
4875done:
4876        return target;
4877}
4878
4879/*
4880 * cpu_util returns the amount of capacity of a CPU that is used by CFS
4881 * tasks. The unit of the return value must be the one of capacity so we can
4882 * compare the utilization with the capacity of the CPU that is available for
4883 * CFS task (ie cpu_capacity).
4884 *
4885 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
4886 * recent utilization of currently non-runnable tasks on a CPU. It represents
4887 * the amount of utilization of a CPU in the range [0..capacity_orig] where
4888 * capacity_orig is the cpu_capacity available at the highest frequency
4889 * (arch_scale_freq_capacity()).
4890 * The utilization of a CPU converges towards a sum equal to or less than the
4891 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
4892 * the running time on this CPU scaled by capacity_curr.
4893 *
4894 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
4895 * higher than capacity_orig because of unfortunate rounding in
4896 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
4897 * the average stabilizes with the new running time. We need to check that the
4898 * utilization stays within the range of [0..capacity_orig] and cap it if
4899 * necessary. Without utilization capping, a group could be seen as overloaded
4900 * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
4901 * available capacity. We allow utilization to overshoot capacity_curr (but not
4902 * capacity_orig) as it useful for predicting the capacity required after task
4903 * migrations (scheduler-driven DVFS).
4904 */
4905static int cpu_util(int cpu)
4906{
4907        unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
4908        unsigned long capacity = capacity_orig_of(cpu);
4909
4910        return (util >= capacity) ? capacity : util;
4911}
4912
4913/*
4914 * select_task_rq_fair: Select target runqueue for the waking task in domains
4915 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
4916 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
4917 *
4918 * Balances load by selecting the idlest cpu in the idlest group, or under
4919 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
4920 *
4921 * Returns the target cpu number.
4922 *
4923 * preempt must be disabled.
4924 */
4925static int
4926select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
4927{
4928        struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
4929        int cpu = smp_processor_id();
4930        int new_cpu = prev_cpu;
4931        int want_affine = 0;
4932        int sync = wake_flags & WF_SYNC;
4933
4934        if (sd_flag & SD_BALANCE_WAKE)
4935                want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
4936
4937        rcu_read_lock();
4938        for_each_domain(cpu, tmp) {
4939                if (!(tmp->flags & SD_LOAD_BALANCE))
4940                        break;
4941
4942                /*
4943                 * If both cpu and prev_cpu are part of this domain,
4944                 * cpu is a valid SD_WAKE_AFFINE target.
4945                 */
4946                if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
4947                    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
4948                        affine_sd = tmp;
4949                        break;
4950                }
4951
4952                if (tmp->flags & sd_flag)
4953                        sd = tmp;
4954                else if (!want_affine)
4955                        break;
4956        }
4957
4958        if (affine_sd) {
4959                sd = NULL; /* Prefer wake_affine over balance flags */
4960                if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4961                        new_cpu = cpu;
4962        }
4963
4964        if (!sd) {
4965                if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
4966                        new_cpu = select_idle_sibling(p, new_cpu);
4967
4968        } else while (sd) {
4969                struct sched_group *group;
4970                int weight;
4971
4972                if (!(sd->flags & sd_flag)) {
4973                        sd = sd->child;
4974                        continue;
4975                }
4976
4977                group = find_idlest_group(sd, p, cpu, sd_flag);
4978                if (!group) {
4979                        sd = sd->child;
4980                        continue;
4981                }
4982
4983                new_cpu = find_idlest_cpu(group, p, cpu);
4984                if (new_cpu == -1 || new_cpu == cpu) {
4985                        /* Now try balancing at a lower domain level of cpu */
4986                        sd = sd->child;
4987                        continue;
4988                }
4989
4990                /* Now try balancing at a lower domain level of new_cpu */
4991                cpu = new_cpu;
4992                weight = sd->span_weight;
4993                sd = NULL;
4994                for_each_domain(cpu, tmp) {
4995                        if (weight <= tmp->span_weight)
4996                                break;
4997                        if (tmp->flags & sd_flag)
4998                                sd = tmp;
4999                }
5000                /* while loop will break here if sd == NULL */
5001        }
5002        rcu_read_unlock();
5003
5004        return new_cpu;
5005}
5006
5007/*
5008 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
5009 * cfs_rq_of(p) references at time of call are still valid and identify the
5010 * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
5011 * other assumptions, including the state of rq->lock, should be made.
5012 */
5013static void migrate_task_rq_fair(struct task_struct *p)
5014{
5015        /*
5016         * We are supposed to update the task to "current" time, then its up to date
5017         * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
5018         * what current time is, so simply throw away the out-of-date time. This
5019         * will result in the wakee task is less decayed, but giving the wakee more
5020         * load sounds not bad.
5021         */
5022        remove_entity_load_avg(&p->se);
5023
5024        /* Tell new CPU we are migrated */
5025        p->se.avg.last_update_time = 0;
5026
5027        /* We have migrated, no longer consider this task hot */
5028        p->se.exec_start = 0;
5029}
5030
5031static void task_dead_fair(struct task_struct *p)
5032{
5033        remove_entity_load_avg(&p->se);
5034}
5035#endif /* CONFIG_SMP */
5036
5037static unsigned long
5038wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
5039{
5040        unsigned long gran = sysctl_sched_wakeup_granularity;
5041
5042        /*
5043         * Since its curr running now, convert the gran from real-time
5044         * to virtual-time in his units.
5045         *
5046         * By using 'se' instead of 'curr' we penalize light tasks, so
5047         * they get preempted easier. That is, if 'se' < 'curr' then
5048         * the resulting gran will be larger, therefore penalizing the
5049         * lighter, if otoh 'se' > 'curr' then the resulting gran will
5050         * be smaller, again penalizing the lighter task.
5051         *
5052         * This is especially important for buddies when the leftmost
5053         * task is higher priority than the buddy.
5054         */
5055        return calc_delta_fair(gran, se);
5056}
5057
5058/*
5059 * Should 'se' preempt 'curr'.
5060 *
5061 *             |s1
5062 *        |s2
5063 *   |s3
5064 *         g
5065 *      |<--->|c
5066 *
5067 *  w(c, s1) = -1
5068 *  w(c, s2) =  0
5069 *  w(c, s3) =  1
5070 *
5071 */
5072static int
5073wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
5074{
5075        s64 gran, vdiff = curr->vruntime - se->vruntime;
5076
5077        if (vdiff <= 0)
5078                return -1;
5079
5080        gran = wakeup_gran(curr, se);
5081        if (vdiff > gran)
5082                return 1;
5083
5084        return 0;
5085}
5086
5087static void set_last_buddy(struct sched_entity *se)
5088{
5089        if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5090                return;
5091
5092        for_each_sched_entity(se)
5093                cfs_rq_of(se)->last = se;
5094}
5095
5096static void set_next_buddy(struct sched_entity *se)
5097{
5098        if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5099                return;
5100
5101        for_each_sched_entity(se)
5102                cfs_rq_of(se)->next = se;
5103}
5104
5105static void set_skip_buddy(struct sched_entity *se)
5106{
5107        for_each_sched_entity(se)
5108                cfs_rq_of(se)->skip = se;
5109}
5110
5111/*
5112 * Preempt the current task with a newly woken task if needed:
5113 */
5114static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
5115{
5116        struct task_struct *curr = rq->curr;
5117        struct sched_entity *se = &curr->se, *pse = &p->se;
5118        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5119        int scale = cfs_rq->nr_running >= sched_nr_latency;
5120        int next_buddy_marked = 0;
5121
5122        if (unlikely(se == pse))
5123                return;
5124
5125        /*
5126         * This is possible from callers such as attach_tasks(), in which we
5127         * unconditionally check_prempt_curr() after an enqueue (which may have
5128         * lead to a throttle).  This both saves work and prevents false
5129         * next-buddy nomination below.
5130         */
5131        if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
5132                return;
5133
5134        if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
5135                set_next_buddy(pse);
5136                next_buddy_marked = 1;
5137        }
5138
5139        /*
5140         * We can come here with TIF_NEED_RESCHED already set from new task
5141         * wake up path.
5142         *
5143         * Note: this also catches the edge-case of curr being in a throttled
5144         * group (e.g. via set_curr_task), since update_curr() (in the
5145         * enqueue of curr) will have resulted in resched being set.  This
5146         * prevents us from potentially nominating it as a false LAST_BUDDY
5147         * below.
5148         */
5149        if (test_tsk_need_resched(curr))
5150                return;
5151
5152        /* Idle tasks are by definition preempted by non-idle tasks. */
5153        if (unlikely(curr->policy == SCHED_IDLE) &&
5154            likely(p->policy != SCHED_IDLE))
5155                goto preempt;
5156
5157        /*
5158         * Batch and idle tasks do not preempt non-idle tasks (their preemption
5159         * is driven by the tick):
5160         */
5161        if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
5162                return;
5163
5164        find_matching_se(&se, &pse);
5165        update_curr(cfs_rq_of(se));
5166        BUG_ON(!pse);
5167        if (wakeup_preempt_entity(se, pse) == 1) {
5168                /*
5169                 * Bias pick_next to pick the sched entity that is
5170                 * triggering this preemption.
5171                 */
5172                if (!next_buddy_marked)
5173                        set_next_buddy(pse);
5174                goto preempt;
5175        }
5176
5177        return;
5178
5179preempt:
5180        resched_curr(rq);
5181        /*
5182         * Only set the backward buddy when the current task is still
5183         * on the rq. This can happen when a wakeup gets interleaved
5184         * with schedule on the ->pre_schedule() or idle_balance()
5185         * point, either of which can * drop the rq lock.
5186         *
5187         * Also, during early boot the idle thread is in the fair class,
5188         * for obvious reasons its a bad idea to schedule back to it.
5189         */
5190        if (unlikely(!se->on_rq || curr == rq->idle))
5191                return;
5192
5193        if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
5194                set_last_buddy(se);
5195}
5196
5197static struct task_struct *
5198pick_next_task_fair(struct rq *rq, struct task_struct *prev)
5199{
5200        struct cfs_rq *cfs_rq = &rq->cfs;
5201        struct sched_entity *se;
5202        struct task_struct *p;
5203        int new_tasks;
5204
5205again:
5206#ifdef CONFIG_FAIR_GROUP_SCHED
5207        if (!cfs_rq->nr_running)
5208                goto idle;
5209
5210        if (prev->sched_class != &fair_sched_class)
5211                goto simple;
5212
5213        /*
5214         * Because of the set_next_buddy() in dequeue_task_fair() it is rather
5215         * likely that a next task is from the same cgroup as the current.
5216         *
5217         * Therefore attempt to avoid putting and setting the entire cgroup
5218         * hierarchy, only change the part that actually changes.
5219         */
5220
5221        do {
5222                struct sched_entity *curr = cfs_rq->curr;
5223
5224                /*
5225                 * Since we got here without doing put_prev_entity() we also
5226                 * have to consider cfs_rq->curr. If it is still a runnable
5227                 * entity, update_curr() will update its vruntime, otherwise
5228                 * forget we've ever seen it.
5229                 */
5230                if (curr) {
5231                        if (curr->on_rq)
5232                                update_curr(cfs_rq);
5233                        else
5234                                curr = NULL;
5235
5236                        /*
5237                         * This call to check_cfs_rq_runtime() will do the
5238                         * throttle and dequeue its entity in the parent(s).
5239                         * Therefore the 'simple' nr_running test will indeed
5240                         * be correct.
5241                         */
5242                        if (unlikely(check_cfs_rq_runtime(cfs_rq)))
5243                                goto simple;
5244                }
5245
5246                se = pick_next_entity(cfs_rq, curr);
5247                cfs_rq = group_cfs_rq(se);
5248        } while (cfs_rq);
5249
5250        p = task_of(se);
5251
5252        /*
5253         * Since we haven't yet done put_prev_entity and if the selected task
5254         * is a different task than we started out with, try and touch the
5255         * least amount of cfs_rqs.
5256         */
5257        if (prev != p) {
5258                struct sched_entity *pse = &prev->se;
5259
5260                while (!(cfs_rq = is_same_group(se, pse))) {
5261                        int se_depth = se->depth;
5262                        int pse_depth = pse->depth;
5263
5264                        if (se_depth <= pse_depth) {
5265                                put_prev_entity(cfs_rq_of(pse), pse);
5266                                pse = parent_entity(pse);
5267                        }
5268                        if (se_depth >= pse_depth) {
5269                                set_next_entity(cfs_rq_of(se), se);
5270                                se = parent_entity(se);
5271                        }
5272                }
5273
5274                put_prev_entity(cfs_rq, pse);
5275                set_next_entity(cfs_rq, se);
5276        }
5277
5278        if (hrtick_enabled(rq))
5279                hrtick_start_fair(rq, p);
5280
5281        return p;
5282simple:
5283        cfs_rq = &rq->cfs;
5284#endif
5285
5286        if (!cfs_rq->nr_running)
5287                goto idle;
5288
5289        put_prev_task(rq, prev);
5290
5291        do {
5292                se = pick_next_entity(cfs_rq, NULL);
5293                set_next_entity(cfs_rq, se);
5294                cfs_rq = group_cfs_rq(se);
5295        } while (cfs_rq);
5296
5297        p = task_of(se);
5298
5299        if (hrtick_enabled(rq))
5300                hrtick_start_fair(rq, p);
5301
5302        return p;
5303
5304idle:
5305        /*
5306         * This is OK, because current is on_cpu, which avoids it being picked
5307         * for load-balance and preemption/IRQs are still disabled avoiding
5308         * further scheduler activity on it and we're being very careful to
5309         * re-start the picking loop.
5310         */
5311        lockdep_unpin_lock(&rq->lock);
5312        new_tasks = idle_balance(rq);
5313        lockdep_pin_lock(&rq->lock);
5314        /*
5315         * Because idle_balance() releases (and re-acquires) rq->lock, it is
5316         * possible for any higher priority task to appear. In that case we
5317         * must re-start the pick_next_entity() loop.
5318         */
5319        if (new_tasks < 0)
5320                return RETRY_TASK;
5321
5322        if (new_tasks > 0)
5323                goto again;
5324
5325        return NULL;
5326}
5327
5328/*
5329 * Account for a descheduled task:
5330 */
5331static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
5332{
5333        struct sched_entity *se = &prev->se;
5334        struct cfs_rq *cfs_rq;
5335
5336        for_each_sched_entity(se) {
5337                cfs_rq = cfs_rq_of(se);
5338                put_prev_entity(cfs_rq, se);
5339        }
5340}
5341
5342/*
5343 * sched_yield() is very simple
5344 *
5345 * The magic of dealing with the ->skip buddy is in pick_next_entity.
5346 */
5347static void yield_task_fair(struct rq *rq)
5348{
5349        struct task_struct *curr = rq->curr;
5350        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5351        struct sched_entity *se = &curr->se;
5352
5353        /*
5354         * Are we the only task in the tree?
5355         */
5356        if (unlikely(rq->nr_running == 1))
5357                return;
5358
5359        clear_buddies(cfs_rq, se);
5360
5361        if (curr->policy != SCHED_BATCH) {
5362                update_rq_clock(rq);
5363                /*
5364                 * Update run-time statistics of the 'current'.
5365                 */
5366                update_curr(cfs_rq);
5367                /*
5368                 * Tell update_rq_clock() that we've just updated,
5369                 * so we don't do microscopic update in schedule()
5370                 * and double the fastpath cost.
5371                 */
5372                rq_clock_skip_update(rq, true);
5373        }
5374
5375        set_skip_buddy(se);
5376}
5377
5378static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
5379{
5380        struct sched_entity *se = &p->se;
5381
5382        /* throttled hierarchies are not runnable */
5383        if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
5384                return false;
5385
5386        /* Tell the scheduler that we'd really like pse to run next. */
5387        set_next_buddy(se);
5388
5389        yield_task_fair(rq);
5390
5391        return true;
5392}
5393
5394#ifdef CONFIG_SMP
5395/**************************************************
5396 * Fair scheduling class load-balancing methods.
5397 *
5398 * BASICS
5399 *
5400 * The purpose of load-balancing is to achieve the same basic fairness the
5401 * per-cpu scheduler provides, namely provide a proportional amount of compute
5402 * time to each task. This is expressed in the following equation:
5403 *
5404 *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
5405 *
5406 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
5407 * W_i,0 is defined as:
5408 *
5409 *   W_i,0 = \Sum_j w_i,j                                             (2)
5410 *
5411 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
5412 * is derived from the nice value as per prio_to_weight[].
5413 *
5414 * The weight average is an exponential decay average of the instantaneous
5415 * weight:
5416 *
5417 *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
5418 *
5419 * C_i is the compute capacity of cpu i, typically it is the
5420 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
5421 * can also include other factors [XXX].
5422 *
5423 * To achieve this balance we define a measure of imbalance which follows
5424 * directly from (1):
5425 *
5426 *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
5427 *
5428 * We them move tasks around to minimize the imbalance. In the continuous
5429 * function space it is obvious this converges, in the discrete case we get
5430 * a few fun cases generally called infeasible weight scenarios.
5431 *
5432 * [XXX expand on:
5433 *     - infeasible weights;
5434 *     - local vs global optima in the discrete case. ]
5435 *
5436 *
5437 * SCHED DOMAINS
5438 *
5439 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
5440 * for all i,j solution, we create a tree of cpus that follows the hardware
5441 * topology where each level pairs two lower groups (or better). This results
5442 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
5443 * tree to only the first of the previous level and we decrease the frequency
5444 * of load-balance at each level inv. proportional to the number of cpus in
5445 * the groups.
5446 *
5447 * This yields:
5448 *
5449 *     log_2 n     1     n
5450 *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
5451 *     i = 0      2^i   2^i
5452 *                               `- size of each group
5453 *         |         |     `- number of cpus doing load-balance
5454 *         |         `- freq
5455 *         `- sum over all levels
5456 *
5457 * Coupled with a limit on how many tasks we can migrate every balance pass,
5458 * this makes (5) the runtime complexity of the balancer.
5459 *
5460 * An important property here is that each CPU is still (indirectly) connected
5461 * to every other cpu in at most O(log n) steps:
5462 *
5463 * The adjacency matrix of the resulting graph is given by:
5464 *
5465 *             log_2 n     
5466 *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
5467 *             k = 0
5468 *
5469 * And you'll find that:
5470 *
5471 *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
5472 *
5473 * Showing there's indeed a path between every cpu in at most O(log n) steps.
5474 * The task movement gives a factor of O(m), giving a convergence complexity
5475 * of:
5476 *
5477 *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
5478 *
5479 *
5480 * WORK CONSERVING
5481 *
5482 * In order to avoid CPUs going idle while there's still work to do, new idle
5483 * balancing is more aggressive and has the newly idle cpu iterate up the domain
5484 * tree itself instead of relying on other CPUs to bring it work.
5485 *
5486 * This adds some complexity to both (5) and (8) but it reduces the total idle
5487 * time.
5488 *
5489 * [XXX more?]
5490 *
5491 *
5492 * CGROUPS
5493 *
5494 * Cgroups make a horror show out of (2), instead of a simple sum we get:
5495 *
5496 *                                s_k,i
5497 *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
5498 *                                 S_k
5499 *
5500 * Where
5501 *
5502 *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
5503 *
5504 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
5505 *
5506 * The big problem is S_k, its a global sum needed to compute a local (W_i)
5507 * property.
5508 *
5509 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
5510 *      rewrite all of this once again.]
5511 */ 
5512
5513static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5514
5515enum fbq_type { regular, remote, all };
5516
5517#define LBF_ALL_PINNED  0x01
5518#define LBF_NEED_BREAK  0x02
5519#define LBF_DST_PINNED  0x04
5520#define LBF_SOME_PINNED 0x08
5521
5522struct lb_env {
5523        struct sched_domain     *sd;
5524
5525        struct rq               *src_rq;
5526        int                     src_cpu;
5527
5528        int                     dst_cpu;
5529        struct rq               *dst_rq;
5530
5531        struct cpumask          *dst_grpmask;
5532        int                     new_dst_cpu;
5533        enum cpu_idle_type      idle;
5534        long                    imbalance;
5535        /* The set of CPUs under consideration for load-balancing */
5536        struct cpumask          *cpus;
5537
5538        unsigned int            flags;
5539
5540        unsigned int            loop;
5541        unsigned int            loop_break;
5542        unsigned int            loop_max;
5543
5544        enum fbq_type           fbq_type;
5545        struct list_head        tasks;
5546};
5547
5548/*
5549 * Is this task likely cache-hot:
5550 */
5551static int task_hot(struct task_struct *p, struct lb_env *env)
5552{
5553        s64 delta;
5554
5555        lockdep_assert_held(&env->src_rq->lock);
5556
5557        if (p->sched_class != &fair_sched_class)
5558                return 0;
5559
5560        if (unlikely(p->policy == SCHED_IDLE))
5561                return 0;
5562
5563        /*
5564         * Buddy candidates are cache hot:
5565         */
5566        if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
5567                        (&p->se == cfs_rq_of(&p->se)->next ||
5568                         &p->se == cfs_rq_of(&p->se)->last))
5569                return 1;
5570
5571        if (sysctl_sched_migration_cost == -1)
5572                return 1;
5573        if (sysctl_sched_migration_cost == 0)
5574                return 0;
5575
5576        delta = rq_clock_task(env->src_rq) - p->se.exec_start;
5577
5578        return delta < (s64)sysctl_sched_migration_cost;
5579}
5580
5581#ifdef CONFIG_NUMA_BALANCING
5582/*
5583 * Returns 1, if task migration degrades locality
5584 * Returns 0, if task migration improves locality i.e migration preferred.
5585 * Returns -1, if task migration is not affected by locality.
5586 */
5587static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5588{
5589        struct numa_group *numa_group = rcu_dereference(p->numa_group);
5590        unsigned long src_faults, dst_faults;
5591        int src_nid, dst_nid;
5592
5593        if (!static_branch_likely(&sched_numa_balancing))
5594                return -1;
5595
5596        if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5597                return -1;
5598
5599        src_nid = cpu_to_node(env->src_cpu);
5600        dst_nid = cpu_to_node(env->dst_cpu);
5601
5602        if (src_nid == dst_nid)
5603                return -1;
5604
5605        /* Migrating away from the preferred node is always bad. */
5606        if (src_nid == p->numa_preferred_nid) {
5607                if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
5608                        return 1;
5609                else
5610                        return -1;
5611        }
5612
5613        /* Encourage migration to the preferred node. */
5614        if (dst_nid == p->numa_preferred_nid)
5615                return 0;
5616
5617        if (numa_group) {
5618                src_faults = group_faults(p, src_nid);
5619                dst_faults = group_faults(p, dst_nid);
5620        } else {
5621                src_faults = task_faults(p, src_nid);
5622                dst_faults = task_faults(p, dst_nid);
5623        }
5624
5625        return dst_faults < src_faults;
5626}
5627
5628#else
5629static inline int migrate_degrades_locality(struct task_struct *p,
5630                                             struct lb_env *env)
5631{
5632        return -1;
5633}
5634#endif
5635
5636/*
5637 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
5638 */
5639static
5640int can_migrate_task(struct task_struct *p, struct lb_env *env)
5641{
5642        int tsk_cache_hot;
5643
5644        lockdep_assert_held(&env->src_rq->lock);
5645
5646        /*
5647         * We do not migrate tasks that are:
5648         * 1) throttled_lb_pair, or
5649         * 2) cannot be migrated to this CPU due to cpus_allowed, or
5650         * 3) running (obviously), or
5651         * 4) are cache-hot on their current CPU.
5652         */
5653        if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
5654                return 0;
5655
5656        if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
5657                int cpu;
5658
5659                schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
5660
5661                env->flags |= LBF_SOME_PINNED;
5662
5663                /*
5664                 * Remember if this task can be migrated to any other cpu in
5665                 * our sched_group. We may want to revisit it if we couldn't
5666                 * meet load balance goals by pulling other tasks on src_cpu.
5667                 *
5668                 * Also avoid computing new_dst_cpu if we have already computed
5669                 * one in current iteration.
5670                 */
5671                if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
5672                        return 0;
5673
5674                /* Prevent to re-select dst_cpu via env's cpus */
5675                for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
5676                        if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
5677                                env->flags |= LBF_DST_PINNED;
5678                                env->new_dst_cpu = cpu;
5679                                break;
5680                        }
5681                }
5682
5683                return 0;
5684        }
5685
5686        /* Record that we found atleast one task that could run on dst_cpu */
5687        env->flags &= ~LBF_ALL_PINNED;
5688
5689        if (task_running(env->src_rq, p)) {
5690                schedstat_inc(p, se.statistics.nr_failed_migrations_running);
5691                return 0;
5692        }
5693
5694        /*
5695         * Aggressive migration if:
5696         * 1) destination numa is preferred
5697         * 2) task is cache cold, or
5698         * 3) too many balance attempts have failed.
5699         */
5700        tsk_cache_hot = migrate_degrades_locality(p, env);
5701        if (tsk_cache_hot == -1)
5702                tsk_cache_hot = task_hot(p, env);
5703
5704        if (tsk_cache_hot <= 0 ||
5705            env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5706                if (tsk_cache_hot == 1) {
5707                        schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5708                        schedstat_inc(p, se.statistics.nr_forced_migrations);
5709                }
5710                return 1;
5711        }
5712
5713        schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
5714        return 0;
5715}
5716
5717/*
5718 * detach_task() -- detach the task for the migration specified in env
5719 */
5720static void detach_task(struct task_struct *p, struct lb_env *env)
5721{
5722        lockdep_assert_held(&env->src_rq->lock);
5723
5724        deactivate_task(env->src_rq, p, 0);
5725        p->on_rq = TASK_ON_RQ_MIGRATING;
5726        set_task_cpu(p, env->dst_cpu);
5727}
5728
5729/*
5730 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
5731 * part of active balancing operations within "domain".
5732 *
5733 * Returns a task if successful and NULL otherwise.
5734 */
5735static struct task_struct *detach_one_task(struct lb_env *env)
5736{
5737        struct task_struct *p, *n;
5738
5739        lockdep_assert_held(&env->src_rq->lock);
5740
5741        list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
5742                if (!can_migrate_task(p, env))
5743                        continue;
5744
5745                detach_task(p, env);
5746
5747                /*
5748                 * Right now, this is only the second place where
5749                 * lb_gained[env->idle] is updated (other is detach_tasks)
5750                 * so we can safely collect stats here rather than
5751                 * inside detach_tasks().
5752                 */
5753                schedstat_inc(env->sd, lb_gained[env->idle]);
5754                return p;
5755        }
5756        return NULL;
5757}
5758
5759static const unsigned int sched_nr_migrate_break = 32;
5760
5761/*
5762 * detach_tasks() -- tries to detach up to imbalance weighted load from
5763 * busiest_rq, as part of a balancing operation within domain "sd".
5764 *
5765 * Returns number of detached tasks if successful and 0 otherwise.
5766 */
5767static int detach_tasks(struct lb_env *env)
5768{
5769        struct list_head *tasks = &env->src_rq->cfs_tasks;
5770        struct task_struct *p;
5771        unsigned long load;
5772        int detached = 0;
5773
5774        lockdep_assert_held(&env->src_rq->lock);
5775
5776        if (env->imbalance <= 0)
5777                return 0;
5778
5779        while (!list_empty(tasks)) {
5780                /*
5781                 * We don't want to steal all, otherwise we may be treated likewise,
5782                 * which could at worst lead to a livelock crash.
5783                 */
5784                if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
5785                        break;
5786
5787                p = list_first_entry(tasks, struct task_struct, se.group_node);
5788
5789                env->loop++;
5790                /* We've more or less seen every task there is, call it quits */
5791                if (env->loop > env->loop_max)
5792                        break;
5793
5794                /* take a breather every nr_migrate tasks */
5795                if (env->loop > env->loop_break) {
5796                        env->loop_break += sched_nr_migrate_break;
5797                        env->flags |= LBF_NEED_BREAK;
5798                        break;
5799                }
5800
5801                if (!can_migrate_task(p, env))
5802                        goto next;
5803
5804                load = task_h_load(p);
5805
5806                if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
5807                        goto next;
5808
5809                if ((load / 2) > env->imbalance)
5810                        goto next;
5811
5812                detach_task(p, env);
5813                list_add(&p->se.group_node, &env->tasks);
5814
5815                detached++;
5816                env->imbalance -= load;
5817
5818#ifdef CONFIG_PREEMPT
5819                /*
5820                 * NEWIDLE balancing is a source of latency, so preemptible
5821                 * kernels will stop after the first task is detached to minimize
5822                 * the critical section.
5823                 */
5824                if (env->idle == CPU_NEWLY_IDLE)
5825                        break;
5826#endif
5827
5828                /*
5829                 * We only want to steal up to the prescribed amount of
5830                 * weighted load.
5831                 */
5832                if (env->imbalance <= 0)
5833                        break;
5834
5835                continue;
5836next:
5837                list_move_tail(&p->se.group_node, tasks);
5838        }
5839
5840        /*
5841         * Right now, this is one of only two places we collect this stat
5842         * so we can safely collect detach_one_task() stats here rather
5843         * than inside detach_one_task().
5844         */
5845        schedstat_add(env->sd, lb_gained[env->idle], detached);
5846
5847        return detached;
5848}
5849
5850/*
5851 * attach_task() -- attach the task detached by detach_task() to its new rq.
5852 */
5853static void attach_task(struct rq *rq, struct task_struct *p)
5854{
5855        lockdep_assert_held(&rq->lock);
5856
5857        BUG_ON(task_rq(p) != rq);
5858        p->on_rq = TASK_ON_RQ_QUEUED;
5859        activate_task(rq, p, 0);
5860        check_preempt_curr(rq, p, 0);
5861}
5862
5863/*
5864 * attach_one_task() -- attaches the task returned from detach_one_task() to
5865 * its new rq.
5866 */
5867static void attach_one_task(struct rq *rq, struct task_struct *p)
5868{
5869        raw_spin_lock(&rq->lock);
5870        attach_task(rq, p);
5871        raw_spin_unlock(&rq->lock);
5872}
5873
5874/*
5875 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
5876 * new rq.
5877 */
5878static void attach_tasks(struct lb_env *env)
5879{
5880        struct list_head *tasks = &env->tasks;
5881        struct task_struct *p;
5882
5883        raw_spin_lock(&env->dst_rq->lock);
5884
5885        while (!list_empty(tasks)) {
5886                p = list_first_entry(tasks, struct task_struct, se.group_node);
5887                list_del_init(&p->se.group_node);
5888
5889                attach_task(env->dst_rq, p);
5890        }
5891
5892        raw_spin_unlock(&env->dst_rq->lock);
5893}
5894
5895#ifdef CONFIG_FAIR_GROUP_SCHED
5896static void update_blocked_averages(int cpu)
5897{
5898        struct rq *rq = cpu_rq(cpu);
5899        struct cfs_rq *cfs_rq;
5900        unsigned long flags;
5901
5902        raw_spin_lock_irqsave(&rq->lock, flags);
5903        update_rq_clock(rq);
5904
5905        /*
5906         * Iterates the task_group tree in a bottom up fashion, see
5907         * list_add_leaf_cfs_rq() for details.
5908         */
5909        for_each_leaf_cfs_rq(rq, cfs_rq) {
5910                /* throttled entities do not contribute to load */
5911                if (throttled_hierarchy(cfs_rq))
5912                        continue;
5913
5914                if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
5915                        update_tg_load_avg(cfs_rq, 0);
5916        }
5917        raw_spin_unlock_irqrestore(&rq->lock, flags);
5918}
5919
5920/*
5921 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
5922 * This needs to be done in a top-down fashion because the load of a child
5923 * group is a fraction of its parents load.
5924 */
5925static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
5926{
5927        struct rq *rq = rq_of(cfs_rq);
5928        struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
5929        unsigned long now = jiffies;
5930        unsigned long load;
5931
5932        if (cfs_rq->last_h_load_update == now)
5933                return;
5934
5935        cfs_rq->h_load_next = NULL;
5936        for_each_sched_entity(se) {
5937                cfs_rq = cfs_rq_of(se);
5938                cfs_rq->h_load_next = se;
5939                if (cfs_rq->last_h_load_update == now)
5940                        break;
5941        }
5942
5943        if (!se) {
5944                cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
5945                cfs_rq->last_h_load_update = now;
5946        }
5947
5948        while ((se = cfs_rq->h_load_next) != NULL) {
5949                load = cfs_rq->h_load;
5950                load = div64_ul(load * se->avg.load_avg,
5951                        cfs_rq_load_avg(cfs_rq) + 1);
5952                cfs_rq = group_cfs_rq(se);
5953                cfs_rq->h_load = load;
5954                cfs_rq->last_h_load_update = now;
5955        }
5956}
5957
5958static unsigned long task_h_load(struct task_struct *p)
5959{
5960        struct cfs_rq *cfs_rq = task_cfs_rq(p);
5961
5962        update_cfs_rq_h_load(cfs_rq);
5963        return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
5964                        cfs_rq_load_avg(cfs_rq) + 1);
5965}
5966#else
5967static inline void update_blocked_averages(int cpu)
5968{
5969        struct rq *rq = cpu_rq(cpu);
5970        struct cfs_rq *cfs_rq = &rq->cfs;
5971        unsigned long flags;
5972
5973        raw_spin_lock_irqsave(&rq->lock, flags);
5974        update_rq_clock(rq);
5975        update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
5976        raw_spin_unlock_irqrestore(&rq->lock, flags);
5977}
5978
5979static unsigned long task_h_load(struct task_struct *p)
5980{
5981        return p->se.avg.load_avg;
5982}
5983#endif
5984
5985/********** Helpers for find_busiest_group ************************/
5986
5987enum group_type {
5988        group_other = 0,
5989        group_imbalanced,
5990        group_overloaded,
5991};
5992
5993/*
5994 * sg_lb_stats - stats of a sched_group required for load_balancing
5995 */
5996struct sg_lb_stats {
5997        unsigned long avg_load; /*Avg load across the CPUs of the group */
5998        unsigned long group_load; /* Total load over the CPUs of the group */
5999        unsigned long sum_weighted_load; /* Weighted load of group's tasks */
6000        unsigned long load_per_task;
6001        unsigned long group_capacity;
6002        unsigned long group_util; /* Total utilization of the group */
6003        unsigned int sum_nr_running; /* Nr tasks running in the group */
6004        unsigned int idle_cpus;
6005        unsigned int group_weight;
6006        enum group_type group_type;
6007        int group_no_capacity;
6008#ifdef CONFIG_NUMA_BALANCING
6009        unsigned int nr_numa_running;
6010        unsigned int nr_preferred_running;
6011#endif
6012};
6013
6014/*
6015 * sd_lb_stats - Structure to store the statistics of a sched_domain
6016 *               during load balancing.
6017 */
6018struct sd_lb_stats {
6019        struct sched_group *busiest;    /* Busiest group in this sd */
6020        struct sched_group *local;      /* Local group in this sd */
6021        unsigned long total_load;       /* Total load of all groups in sd */
6022        unsigned long total_capacity;   /* Total capacity of all groups in sd */
6023        unsigned long avg_load; /* Average load across all groups in sd */
6024
6025        struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
6026        struct sg_lb_stats local_stat;  /* Statistics of the local group */
6027};
6028
6029static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
6030{
6031        /*
6032         * Skimp on the clearing to avoid duplicate work. We can avoid clearing
6033         * local_stat because update_sg_lb_stats() does a full clear/assignment.
6034         * We must however clear busiest_stat::avg_load because
6035         * update_sd_pick_busiest() reads this before assignment.
6036         */
6037        *sds = (struct sd_lb_stats){
6038                .busiest = NULL,
6039                .local = NULL,
6040                .total_load = 0UL,
6041                .total_capacity = 0UL,
6042                .busiest_stat = {
6043                        .avg_load = 0UL,
6044                        .sum_nr_running = 0,
6045                        .group_type = group_other,
6046                },
6047        };
6048}
6049
6050/**
6051 * get_sd_load_idx - Obtain the load index for a given sched domain.
6052 * @sd: The sched_domain whose load_idx is to be obtained.
6053 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
6054 *
6055 * Return: The load index.
6056 */
6057static inline int get_sd_load_idx(struct sched_domain *sd,
6058                                        enum cpu_idle_type idle)
6059{
6060        int load_idx;
6061
6062        switch (idle) {
6063        case CPU_NOT_IDLE:
6064                load_idx = sd->busy_idx;
6065                break;
6066
6067        case CPU_NEWLY_IDLE:
6068                load_idx = sd->newidle_idx;
6069                break;
6070        default:
6071                load_idx = sd->idle_idx;
6072                break;
6073        }
6074
6075        return load_idx;
6076}
6077
6078static unsigned long scale_rt_capacity(int cpu)
6079{
6080        struct rq *rq = cpu_rq(cpu);
6081        u64 total, used, age_stamp, avg;
6082        s64 delta;
6083
6084        /*
6085         * Since we're reading these variables without serialization make sure
6086         * we read them once before doing sanity checks on them.
6087         */
6088        age_stamp = READ_ONCE(rq->age_stamp);
6089        avg = READ_ONCE(rq->rt_avg);
6090        delta = __rq_clock_broken(rq) - age_stamp;
6091
6092        if (unlikely(delta < 0))
6093                delta = 0;
6094
6095        total = sched_avg_period() + delta;
6096
6097        used = div_u64(avg, total);
6098
6099        if (likely(used < SCHED_CAPACITY_SCALE))
6100                return SCHED_CAPACITY_SCALE - used;
6101
6102        return 1;
6103}
6104
6105static void update_cpu_capacity(struct sched_domain *sd, int cpu)
6106{
6107        unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
6108        struct sched_group *sdg = sd->groups;
6109
6110        cpu_rq(cpu)->cpu_capacity_orig = capacity;
6111
6112        capacity *= scale_rt_capacity(cpu);
6113        capacity >>= SCHED_CAPACITY_SHIFT;
6114
6115        if (!capacity)
6116                capacity = 1;
6117
6118        cpu_rq(cpu)->cpu_capacity = capacity;
6119        sdg->sgc->capacity = capacity;
6120}
6121
6122void update_group_capacity(struct sched_domain *sd, int cpu)
6123{
6124        struct sched_domain *child = sd->child;
6125        struct sched_group *group, *sdg = sd->groups;
6126        unsigned long capacity;
6127        unsigned long interval;
6128
6129        interval = msecs_to_jiffies(sd->balance_interval);
6130        interval = clamp(interval, 1UL, max_load_balance_interval);
6131        sdg->sgc->next_update = jiffies + interval;
6132
6133        if (!child) {
6134                update_cpu_capacity(sd, cpu);
6135                return;
6136        }
6137
6138        capacity = 0;
6139
6140        if (child->flags & SD_OVERLAP) {
6141                /*
6142                 * SD_OVERLAP domains cannot assume that child groups
6143                 * span the current group.
6144                 */
6145
6146                for_each_cpu(cpu, sched_group_cpus(sdg)) {
6147                        struct sched_group_capacity *sgc;
6148                        struct rq *rq = cpu_rq(cpu);
6149
6150                        /*
6151                         * build_sched_domains() -> init_sched_groups_capacity()
6152                         * gets here before we've attached the domains to the
6153                         * runqueues.
6154                         *
6155                         * Use capacity_of(), which is set irrespective of domains
6156                         * in update_cpu_capacity().
6157                         *
6158                         * This avoids capacity from being 0 and
6159                         * causing divide-by-zero issues on boot.
6160                         */
6161                        if (unlikely(!rq->sd)) {
6162                                capacity += capacity_of(cpu);
6163                                continue;
6164                        }
6165
6166                        sgc = rq->sd->groups->sgc;
6167                        capacity += sgc->capacity;
6168                }
6169        } else  {
6170                /*
6171                 * !SD_OVERLAP domains can assume that child groups
6172                 * span the current group.
6173                 */ 
6174
6175                group = child->groups;
6176                do {
6177                        capacity += group->sgc->capacity;
6178                        group = group->next;
6179                } while (group != child->groups);
6180        }
6181
6182        sdg->sgc->capacity = capacity;
6183}
6184
6185/*
6186 * Check whether the capacity of the rq has been noticeably reduced by side
6187 * activity. The imbalance_pct is used for the threshold.
6188 * Return true is the capacity is reduced
6189 */
6190static inline int
6191check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
6192{
6193        return ((rq->cpu_capacity * sd->imbalance_pct) <
6194                                (rq->cpu_capacity_orig * 100));
6195}
6196
6197/*
6198 * Group imbalance indicates (and tries to solve) the problem where balancing
6199 * groups is inadequate due to tsk_cpus_allowed() constraints.
6200 *
6201 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
6202 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
6203 * Something like:
6204 *
6205 *      { 0 1 2 3 } { 4 5 6 7 }
6206 *              *     * * *
6207 *
6208 * If we were to balance group-wise we'd place two tasks in the first group and
6209 * two tasks in the second group. Clearly this is undesired as it will overload
6210 * cpu 3 and leave one of the cpus in the second group unused.
6211 *
6212 * The current solution to this issue is detecting the skew in the first group
6213 * by noticing the lower domain failed to reach balance and had difficulty
6214 * moving tasks due to affinity constraints.
6215 *
6216 * When this is so detected; this group becomes a candidate for busiest; see
6217 * update_sd_pick_busiest(). And calculate_imbalance() and
6218 * find_busiest_group() avoid some of the usual balance conditions to allow it
6219 * to create an effective group imbalance.
6220 *
6221 * This is a somewhat tricky proposition since the next run might not find the
6222 * group imbalance and decide the groups need to be balanced again. A most
6223 * subtle and fragile situation.
6224 */
6225
6226static inline int sg_imbalanced(struct sched_group *group)
6227{
6228        return group->sgc->imbalance;
6229}
6230
6231/*
6232 * group_has_capacity returns true if the group has spare capacity that could
6233 * be used by some tasks.
6234 * We consider that a group has spare capacity if the  * number of task is
6235 * smaller than the number of CPUs or if the utilization is lower than the
6236 * available capacity for CFS tasks.
6237 * For the latter, we use a threshold to stabilize the state, to take into
6238 * account the variance of the tasks' load and to return true if the available
6239 * capacity in meaningful for the load balancer.
6240 * As an example, an available capacity of 1% can appear but it doesn't make
6241 * any benefit for the load balance.
6242 */
6243static inline bool
6244group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
6245{
6246        if (sgs->sum_nr_running < sgs->group_weight)
6247                return true;
6248
6249        if ((sgs->group_capacity * 100) >
6250                        (sgs->group_util * env->sd->imbalance_pct))
6251                return true;
6252
6253        return false;
6254}
6255
6256/*
6257 *  group_is_overloaded returns true if the group has more tasks than it can
6258 *  handle.
6259 *  group_is_overloaded is not equals to !group_has_capacity because a group
6260 *  with the exact right number of tasks, has no more spare capacity but is not
6261 *  overloaded so both group_has_capacity and group_is_overloaded return
6262 *  false.
6263 */
6264static inline bool
6265group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
6266{
6267        if (sgs->sum_nr_running <= sgs->group_weight)
6268                return false;
6269
6270        if ((sgs->group_capacity * 100) <
6271                        (sgs->group_util * env->sd->imbalance_pct))
6272                return true;
6273
6274        return false;
6275}
6276
6277static inline enum
6278group_type group_classify(struct sched_group *group,
6279                          struct sg_lb_stats *sgs)
6280{
6281        if (sgs->group_no_capacity)
6282                return group_overloaded;
6283
6284        if (sg_imbalanced(group))
6285                return group_imbalanced;
6286
6287        return group_other;
6288}
6289
6290/**
6291 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
6292 * @env: The load balancing environment.
6293 * @group: sched_group whose statistics are to be updated.
6294 * @load_idx: Load index of sched_domain of this_cpu for load calc.
6295 * @local_group: Does group contain this_cpu.
6296 * @sgs: variable to hold the statistics for this group.
6297 * @overload: Indicate more than one runnable task for any CPU.
6298 */
6299static inline void update_sg_lb_stats(struct lb_env *env,
6300                        struct sched_group *group, int load_idx,
6301                        int local_group, struct sg_lb_stats *sgs,
6302                        bool *overload)
6303{
6304        unsigned long load;
6305        int i;
6306
6307        memset(sgs, 0, sizeof(*sgs));
6308
6309        for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6310                struct rq *rq = cpu_rq(i);
6311
6312                /* Bias balancing toward cpus of our domain */
6313                if (local_group)
6314                        load = target_load(i, load_idx);
6315                else
6316                        load = source_load(i, load_idx);
6317
6318                sgs->group_load += load;
6319                sgs->group_util += cpu_util(i);
6320                sgs->sum_nr_running += rq->cfs.h_nr_running;
6321
6322                if (rq->nr_running > 1)
6323                        *overload = true;
6324
6325#ifdef CONFIG_NUMA_BALANCING
6326                sgs->nr_numa_running += rq->nr_numa_running;
6327                sgs->nr_preferred_running += rq->nr_preferred_running;
6328#endif
6329                sgs->sum_weighted_load += weighted_cpuload(i);
6330                if (idle_cpu(i))
6331                        sgs->idle_cpus++;
6332        }
6333
6334        /* Adjust by relative CPU capacity of the group */
6335        sgs->group_capacity = group->sgc->capacity;
6336        sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
6337
6338        if (sgs->sum_nr_running)
6339                sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
6340
6341        sgs->group_weight = group->group_weight;
6342
6343        sgs->group_no_capacity = group_is_overloaded(env, sgs);
6344        sgs->group_type = group_classify(group, sgs);
6345}
6346
6347/**
6348 * update_sd_pick_busiest - return 1 on busiest group
6349 * @env: The load balancing environment.
6350 * @sds: sched_domain statistics
6351 * @sg: sched_group candidate to be checked for being the busiest
6352 * @sgs: sched_group statistics
6353 *
6354 * Determine if @sg is a busier group than the previously selected
6355 * busiest group.
6356 *
6357 * Return: %true if @sg is a busier group than the previously selected
6358 * busiest group. %false otherwise.
6359 */
6360static bool update_sd_pick_busiest(struct lb_env *env,
6361                                   struct sd_lb_stats *sds,
6362                                   struct sched_group *sg,
6363                                   struct sg_lb_stats *sgs)
6364{
6365        struct sg_lb_stats *busiest = &sds->busiest_stat;
6366
6367        if (sgs->group_type > busiest->group_type)
6368                return true;
6369
6370        if (sgs->group_type < busiest->group_type)
6371                return false;
6372
6373        if (sgs->avg_load <= busiest->avg_load)
6374                return false;
6375
6376        /* This is the busiest node in its class. */
6377        if (!(env->sd->flags & SD_ASYM_PACKING))
6378                return true;
6379
6380        /*
6381         * ASYM_PACKING needs to move all the work to the lowest
6382         * numbered CPUs in the group, therefore mark all groups
6383         * higher than ourself as busy.
6384         */
6385        if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
6386                if (!sds->busiest)
6387                        return true;
6388
6389                if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
6390                        return true;
6391        }
6392
6393        return false;
6394}
6395
6396#ifdef CONFIG_NUMA_BALANCING
6397static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6398{
6399        if (sgs->sum_nr_running > sgs->nr_numa_running)
6400                return regular;
6401        if (sgs->sum_nr_running > sgs->nr_preferred_running)
6402                return remote;
6403        return all;
6404}
6405
6406static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6407{
6408        if (rq->nr_running > rq->nr_numa_running)
6409                return regular;
6410        if (rq->nr_running > rq->nr_preferred_running)
6411                return remote;
6412        return all;
6413}
6414#else
6415static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6416{
6417        return all;
6418}
6419
6420static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6421{
6422        return regular;
6423}
6424#endif /* CONFIG_NUMA_BALANCING */
6425
6426/**
6427 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
6428 * @env: The load balancing environment.
6429 * @sds: variable to hold the statistics for this sched_domain.
6430 */
6431static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
6432{
6433        struct sched_domain *child = env->sd->child;
6434        struct sched_group *sg = env->sd->groups;
6435        struct sg_lb_stats tmp_sgs;
6436        int load_idx, prefer_sibling = 0;
6437        bool overload = false;
6438
6439        if (child && child->flags & SD_PREFER_SIBLING)
6440                prefer_sibling = 1;
6441
6442        load_idx = get_sd_load_idx(env->sd, env->idle);
6443
6444        do {
6445                struct sg_lb_stats *sgs = &tmp_sgs;
6446                int local_group;
6447
6448                local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
6449                if (local_group) {
6450                        sds->local = sg;
6451                        sgs = &sds->local_stat;
6452
6453                        if (env->idle != CPU_NEWLY_IDLE ||
6454                            time_after_eq(jiffies, sg->sgc->next_update))
6455                                update_group_capacity(env->sd, env->dst_cpu);
6456                }
6457
6458                update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
6459                                                &overload);
6460
6461                if (local_group)
6462                        goto next_group;
6463
6464                /*
6465                 * In case the child domain prefers tasks go to siblings
6466                 * first, lower the sg capacity so that we'll try
6467                 * and move all the excess tasks away. We lower the capacity
6468                 * of a group only if the local group has the capacity to fit
6469                 * these excess tasks. The extra check prevents the case where
6470                 * you always pull from the heaviest group when it is already
6471                 * under-utilized (possible with a large weight task outweighs
6472                 * the tasks on the system).
6473                 */
6474                if (prefer_sibling && sds->local &&
6475                    group_has_capacity(env, &sds->local_stat) &&
6476                    (sgs->sum_nr_running > 1)) {
6477                        sgs->group_no_capacity = 1;
6478                        sgs->group_type = group_classify(sg, sgs);
6479                }
6480
6481                if (update_sd_pick_busiest(env, sds, sg, sgs)) {
6482                        sds->busiest = sg;
6483                        sds->busiest_stat = *sgs;
6484                }
6485
6486next_group:
6487                /* Now, start updating sd_lb_stats */
6488                sds->total_load += sgs->group_load;
6489                sds->total_capacity += sgs->group_capacity;
6490
6491                sg = sg->next;
6492        } while (sg != env->sd->groups);
6493
6494        if (env->sd->flags & SD_NUMA)
6495                env->fbq_type = fbq_classify_group(&sds->busiest_stat);
6496
6497        if (!env->sd->parent) {
6498                /* update overload indicator if we are at root domain */
6499                if (env->dst_rq->rd->overload != overload)
6500                        env->dst_rq->rd->overload = overload;
6501        }
6502
6503}
6504
6505/**
6506 * check_asym_packing - Check to see if the group is packed into the
6507 *                      sched doman.
6508 *
6509 * This is primarily intended to used at the sibling level.  Some
6510 * cores like POWER7 prefer to use lower numbered SMT threads.  In the
6511 * case of POWER7, it can move to lower SMT modes only when higher
6512 * threads are idle.  When in lower SMT modes, the threads will
6513 * perform better since they share less core resources.  Hence when we
6514 * have idle threads, we want them to be the higher ones.
6515 *
6516 * This packing function is run on idle threads.  It checks to see if
6517 * the busiest CPU in this domain (core in the P7 case) has a higher
6518 * CPU number than the packing function is being run on.  Here we are
6519 * assuming lower CPU number will be equivalent to lower a SMT thread
6520 * number.
6521 *
6522 * Return: 1 when packing is required and a task should be moved to
6523 * this CPU.  The amount of the imbalance is returned in *imbalance.
6524 *
6525 * @env: The load balancing environment.
6526 * @sds: Statistics of the sched_domain which is to be packed
6527 */
6528static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
6529{
6530        int busiest_cpu;
6531
6532        if (!(env->sd->flags & SD_ASYM_PACKING))
6533                return 0;
6534
6535        if (!sds->busiest)
6536                return 0;
6537
6538        busiest_cpu = group_first_cpu(sds->busiest);
6539        if (env->dst_cpu > busiest_cpu)
6540                return 0;
6541
6542        env->imbalance = DIV_ROUND_CLOSEST(
6543                sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
6544                SCHED_CAPACITY_SCALE);
6545
6546        return 1;
6547}
6548
6549/**
6550 * fix_small_imbalance - Calculate the minor imbalance that exists
6551 *                      amongst the groups of a sched_domain, during
6552 *                      load balancing.
6553 * @env: The load balancing environment.
6554 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
6555 */
6556static inline
6557void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6558{
6559        unsigned long tmp, capa_now = 0, capa_move = 0;
6560        unsigned int imbn = 2;
6561        unsigned long scaled_busy_load_per_task;
6562        struct sg_lb_stats *local, *busiest;
6563
6564        local = &sds->local_stat;
6565        busiest = &sds->busiest_stat;
6566
6567        if (!local->sum_nr_running)
6568                local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
6569        else if (busiest->load_per_task > local->load_per_task)
6570                imbn = 1;
6571
6572        scaled_busy_load_per_task =
6573                (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6574                busiest->group_capacity;
6575
6576        if (busiest->avg_load + scaled_busy_load_per_task >=
6577            local->avg_load + (scaled_busy_load_per_task * imbn)) {
6578                env->imbalance = busiest->load_per_task;
6579                return;
6580        }
6581
6582        /*
6583         * OK, we don't have enough imbalance to justify moving tasks,
6584         * however we may be able to increase total CPU capacity used by
6585         * moving them.
6586         */
6587
6588        capa_now += busiest->group_capacity *
6589                        min(busiest->load_per_task, busiest->avg_load);
6590        capa_now += local->group_capacity *
6591                        min(local->load_per_task, local->avg_load);
6592        capa_now /= SCHED_CAPACITY_SCALE;
6593
6594        /* Amount of load we'd subtract */
6595        if (busiest->avg_load > scaled_busy_load_per_task) {
6596                capa_move += busiest->group_capacity *
6597                            min(busiest->load_per_task,
6598                                busiest->avg_load - scaled_busy_load_per_task);
6599        }
6600
6601        /* Amount of load we'd add */
6602        if (busiest->avg_load * busiest->group_capacity <
6603            busiest->load_per_task * SCHED_CAPACITY_SCALE) {
6604                tmp = (busiest->avg_load * busiest->group_capacity) /
6605                      local->group_capacity;
6606        } else {
6607                tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6608                      local->group_capacity;
6609        }
6610        capa_move += local->group_capacity *
6611                    min(local->load_per_task, local->avg_load + tmp);
6612        capa_move /= SCHED_CAPACITY_SCALE;
6613
6614        /* Move if we gain throughput */
6615        if (capa_move > capa_now)
6616                env->imbalance = busiest->load_per_task;
6617}
6618
6619/**
6620 * calculate_imbalance - Calculate the amount of imbalance present within the
6621 *                       groups of a given sched_domain during load balance.
6622 * @env: load balance environment
6623 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
6624 */
6625static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6626{
6627        unsigned long max_pull, load_above_capacity = ~0UL;
6628        struct sg_lb_stats *local, *busiest;
6629
6630        local = &sds->local_stat;
6631        busiest = &sds->busiest_stat;
6632
6633        if (busiest->group_type == group_imbalanced) {
6634                /*
6635                 * In the group_imb case we cannot rely on group-wide averages
6636                 * to ensure cpu-load equilibrium, look at wider averages. XXX
6637                 */
6638                busiest->load_per_task =
6639                        min(busiest->load_per_task, sds->avg_load);
6640        }
6641
6642        /*
6643         * In the presence of smp nice balancing, certain scenarios can have
6644         * max load less than avg load(as we skip the groups at or below
6645         * its cpu_capacity, while calculating max_load..)
6646         */
6647        if (busiest->avg_load <= sds->avg_load ||
6648            local->avg_load >= sds->avg_load) {
6649                env->imbalance = 0;
6650                return fix_small_imbalance(env, sds);
6651        }
6652
6653        /*
6654         * If there aren't any idle cpus, avoid creating some.
6655         */
6656        if (busiest->group_type == group_overloaded &&
6657            local->group_type   == group_overloaded) {
6658                load_above_capacity = busiest->sum_nr_running *
6659                                        SCHED_LOAD_SCALE;
6660                if (load_above_capacity > busiest->group_capacity)
6661                        load_above_capacity -= busiest->group_capacity;
6662                else
6663                        load_above_capacity = ~0UL;
6664        }
6665
6666        /*
6667         * We're trying to get all the cpus to the average_load, so we don't
6668         * want to push ourselves above the average load, nor do we wish to
6669         * reduce the max loaded cpu below the average load. At the same time,
6670         * we also don't want to reduce the group load below the group capacity
6671         * (so that we can implement power-savings policies etc). Thus we look
6672         * for the minimum possible imbalance.
6673         */
6674        max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
6675
6676        /* How much load to actually move to equalise the imbalance */
6677        env->imbalance = min(
6678                max_pull * busiest->group_capacity,
6679                (sds->avg_load - local->avg_load) * local->group_capacity
6680        ) / SCHED_CAPACITY_SCALE;
6681
6682        /*
6683         * if *imbalance is less than the average load per runnable task
6684         * there is no guarantee that any tasks will be moved so we'll have
6685         * a think about bumping its value to force at least one task to be
6686         * moved
6687         */
6688        if (env->imbalance < busiest->load_per_task)
6689                return fix_small_imbalance(env, sds);
6690}
6691
6692/******* find_busiest_group() helpers end here *********************/
6693
6694/**
6695 * find_busiest_group - Returns the busiest group within the sched_domain
6696 * if there is an imbalance. If there isn't an imbalance, and
6697 * the user has opted for power-savings, it returns a group whose
6698 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
6699 * such a group exists.
6700 *
6701 * Also calculates the amount of weighted load which should be moved
6702 * to restore balance.
6703 *
6704 * @env: The load balancing environment.
6705 *
6706 * Return:      - The busiest group if imbalance exists.
6707 *              - If no imbalance and user has opted for power-savings balance,
6708 *                 return the least loaded group whose CPUs can be
6709 *                 put to idle by rebalancing its tasks onto our group.
6710 */
6711static struct sched_group *find_busiest_group(struct lb_env *env)
6712{
6713        struct sg_lb_stats *local, *busiest;
6714        struct sd_lb_stats sds;
6715
6716        init_sd_lb_stats(&sds);
6717
6718        /*
6719         * Compute the various statistics relavent for load balancing at
6720         * this level.
6721         */
6722        update_sd_lb_stats(env, &sds);
6723        local = &sds.local_stat;
6724        busiest = &sds.busiest_stat;
6725
6726        /* ASYM feature bypasses nice load balance check */
6727        if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
6728            check_asym_packing(env, &sds))
6729                return sds.busiest;
6730
6731        /* There is no busy sibling group to pull tasks from */
6732        if (!sds.busiest || busiest->sum_nr_running == 0)
6733                goto out_balanced;
6734
6735        sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
6736                                                / sds.total_capacity;
6737
6738        /*
6739         * If the busiest group is imbalanced the below checks don't
6740         * work because they assume all things are equal, which typically
6741         * isn't true due to cpus_allowed constraints and the like.
6742         */
6743        if (busiest->group_type == group_imbalanced)
6744                goto force_balance;
6745
6746        /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
6747        if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
6748            busiest->group_no_capacity)
6749                goto force_balance;
6750
6751        /*
6752         * If the local group is busier than the selected busiest group
6753         * don't try and pull any tasks.
6754         */
6755        if (local->avg_load >= busiest->avg_load)
6756                goto out_balanced;
6757
6758        /*
6759         * Don't pull any tasks if this group is already above the domain
6760         * average load.
6761         */
6762        if (local->avg_load >= sds.avg_load)
6763                goto out_balanced;
6764
6765        if (env->idle == CPU_IDLE) {
6766                /*
6767                 * This cpu is idle. If the busiest group is not overloaded
6768                 * and there is no imbalance between this and busiest group
6769                 * wrt idle cpus, it is balanced. The imbalance becomes
6770                 * significant if the diff is greater than 1 otherwise we
6771                 * might end up to just move the imbalance on another group
6772                 */
6773                if ((busiest->group_type != group_overloaded) &&
6774                                (local->idle_cpus <= (busiest->idle_cpus + 1)))
6775                        goto out_balanced;
6776        } else {
6777                /*
6778                 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
6779                 * imbalance_pct to be conservative.
6780                 */
6781                if (100 * busiest->avg_load <=
6782                                env->sd->imbalance_pct * local->avg_load)
6783                        goto out_balanced;
6784        }
6785
6786force_balance:
6787        /* Looks like there is an imbalance. Compute it */
6788        calculate_imbalance(env, &sds);
6789        return sds.busiest;
6790
6791out_balanced:
6792        env->imbalance = 0;
6793        return NULL;
6794}
6795
6796/*
6797 * find_busiest_queue - find the busiest runqueue among the cpus in group.
6798 */
6799static struct rq *find_busiest_queue(struct lb_env *env,
6800                                     struct sched_group *group)
6801{
6802        struct rq *busiest = NULL, *rq;
6803        unsigned long busiest_load = 0, busiest_capacity = 1;
6804        int i;
6805
6806        for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6807                unsigned long capacity, wl;
6808                enum fbq_type rt;
6809
6810                rq = cpu_rq(i);
6811                rt = fbq_classify_rq(rq);
6812
6813                /*
6814                 * We classify groups/runqueues into three groups:
6815                 *  - regular: there are !numa tasks
6816                 *  - remote:  there are numa tasks that run on the 'wrong' node
6817                 *  - all:     there is no distinction
6818                 *
6819                 * In order to avoid migrating ideally placed numa tasks,
6820                 * ignore those when there's better options.
6821                 *
6822                 * If we ignore the actual busiest queue to migrate another
6823                 * task, the next balance pass can still reduce the busiest
6824                 * queue by moving tasks around inside the node.
6825                 *
6826                 * If we cannot move enough load due to this classification
6827                 * the next pass will adjust the group classification and
6828                 * allow migration of more tasks.
6829                 *
6830                 * Both cases only affect the total convergence complexity.
6831                 */
6832                if (rt > env->fbq_type)
6833                        continue;
6834
6835                capacity = capacity_of(i);
6836
6837                wl = weighted_cpuload(i);
6838
6839                /*
6840                 * When comparing with imbalance, use weighted_cpuload()
6841                 * which is not scaled with the cpu capacity.
6842                 */
6843
6844                if (rq->nr_running == 1 && wl > env->imbalance &&
6845                    !check_cpu_capacity(rq, env->sd))
6846                        continue;
6847
6848                /*
6849                 * For the load comparisons with the other cpu's, consider
6850                 * the weighted_cpuload() scaled with the cpu capacity, so
6851                 * that the load can be moved away from the cpu that is
6852                 * potentially running at a lower capacity.
6853                 *
6854                 * Thus we're looking for max(wl_i / capacity_i), crosswise
6855                 * multiplication to rid ourselves of the division works out
6856                 * to: wl_i * capacity_j > wl_j * capacity_i;  where j is
6857                 * our previous maximum.
6858                 */
6859                if (wl * busiest_capacity > busiest_load * capacity) {
6860                        busiest_load = wl;
6861                        busiest_capacity = capacity;
6862                        busiest = rq;
6863                }
6864        }
6865
6866        return busiest;
6867}
6868
6869/*
6870 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
6871 * so long as it is large enough.
6872 */
6873#define MAX_PINNED_INTERVAL     512
6874
6875/* Working cpumask for load_balance and load_balance_newidle. */
6876DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
6877
6878static int need_active_balance(struct lb_env *env)
6879{
6880        struct sched_domain *sd = env->sd;
6881
6882        if (env->idle == CPU_NEWLY_IDLE) {
6883
6884                /*
6885                 * ASYM_PACKING needs to force migrate tasks from busy but
6886                 * higher numbered CPUs in order to pack all tasks in the
6887                 * lowest numbered CPUs.
6888                 */
6889                if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
6890                        return 1;
6891        }
6892
6893        /*
6894         * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
6895         * It's worth migrating the task if the src_cpu's capacity is reduced
6896         * because of other sched_class or IRQs if more capacity stays
6897         * available on dst_cpu.
6898         */
6899        if ((env->idle != CPU_NOT_IDLE) &&
6900            (env->src_rq->cfs.h_nr_running == 1)) {
6901                if ((check_cpu_capacity(env->src_rq, sd)) &&
6902                    (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
6903                        return 1;
6904        }
6905
6906        return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
6907}
6908
6909static int active_load_balance_cpu_stop(void *data);
6910
6911static int should_we_balance(struct lb_env *env)
6912{
6913        struct sched_group *sg = env->sd->groups;
6914        struct cpumask *sg_cpus, *sg_mask;
6915        int cpu, balance_cpu = -1;
6916
6917        /*
6918         * In the newly idle case, we will allow all the cpu's
6919         * to do the newly idle load balance.
6920         */
6921        if (env->idle == CPU_NEWLY_IDLE)
6922                return 1;
6923
6924        sg_cpus = sched_group_cpus(sg);
6925        sg_mask = sched_group_mask(sg);
6926        /* Try to find first idle cpu */
6927        for_each_cpu_and(cpu, sg_cpus, env->cpus) {
6928                if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
6929                        continue;
6930
6931                balance_cpu = cpu;
6932                break;
6933        }
6934
6935        if (balance_cpu == -1)
6936                balance_cpu = group_balance_cpu(sg);
6937
6938        /*
6939         * First idle cpu or the first cpu(busiest) in this sched group
6940         * is eligible for doing load balancing at this and above domains.
6941         */
6942        return balance_cpu == env->dst_cpu;
6943}
6944
6945/*
6946 * Check this_cpu to ensure it is balanced within domain. Attempt to move
6947 * tasks if there is an imbalance.
6948 */
6949static int load_balance(int this_cpu, struct rq *this_rq,
6950                        struct sched_domain *sd, enum cpu_idle_type idle,
6951                        int *continue_balancing)
6952{
6953        int ld_moved, cur_ld_moved, active_balance = 0;
6954        struct sched_domain *sd_parent = sd->parent;
6955        struct sched_group *group;
6956        struct rq *busiest;
6957        unsigned long flags;
6958        struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
6959
6960        struct lb_env env = {
6961                .sd             = sd,
6962                .dst_cpu        = this_cpu,
6963                .dst_rq         = this_rq,
6964                .dst_grpmask    = sched_group_cpus(sd->groups),
6965                .idle           = idle,
6966                .loop_break     = sched_nr_migrate_break,
6967                .cpus           = cpus,
6968                .fbq_type       = all,
6969                .tasks          = LIST_HEAD_INIT(env.tasks),
6970        };
6971
6972        /*
6973         * For NEWLY_IDLE load_balancing, we don't need to consider
6974         * other cpus in our group
6975         */
6976        if (idle == CPU_NEWLY_IDLE)
6977                env.dst_grpmask = NULL;
6978
6979        cpumask_copy(cpus, cpu_active_mask);
6980
6981        schedstat_inc(sd, lb_count[idle]);
6982
6983redo:
6984        if (!should_we_balance(&env)) {
6985                *continue_balancing = 0;
6986                goto out_balanced;
6987        }
6988
6989        group = find_busiest_group(&env);
6990        if (!group) {
6991                schedstat_inc(sd, lb_nobusyg[idle]);
6992                goto out_balanced;
6993        }
6994
6995        busiest = find_busiest_queue(&env, group);
6996        if (!busiest) {
6997                schedstat_inc(sd, lb_nobusyq[idle]);
6998                goto out_balanced;
6999        }
7000
7001        BUG_ON(busiest == env.dst_rq);
7002
7003        schedstat_add(sd, lb_imbalance[idle], env.imbalance);
7004
7005        env.src_cpu = busiest->cpu;
7006        env.src_rq = busiest;
7007
7008        ld_moved = 0;
7009        if (busiest->nr_running > 1) {
7010                /*
7011                 * Attempt to move tasks. If find_busiest_group has found
7012                 * an imbalance but busiest->nr_running <= 1, the group is
7013                 * still unbalanced. ld_moved simply stays zero, so it is
7014                 * correctly treated as an imbalance.
7015                 */
7016                env.flags |= LBF_ALL_PINNED;
7017                env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
7018
7019more_balance:
7020                raw_spin_lock_irqsave(&busiest->lock, flags);
7021
7022                /*
7023                 * cur_ld_moved - load moved in current iteration
7024                 * ld_moved     - cumulative load moved across iterations
7025                 */
7026                cur_ld_moved = detach_tasks(&env);
7027
7028                /*
7029                 * We've detached some tasks from busiest_rq. Every
7030                 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
7031                 * unlock busiest->lock, and we are able to be sure
7032                 * that nobody can manipulate the tasks in parallel.
7033                 * See task_rq_lock() family for the details.
7034                 */
7035
7036                raw_spin_unlock(&busiest->lock);
7037
7038                if (cur_ld_moved) {
7039                        attach_tasks(&env);
7040                        ld_moved += cur_ld_moved;
7041                }
7042
7043                local_irq_restore(flags);
7044
7045                if (env.flags & LBF_NEED_BREAK) {
7046                        env.flags &= ~LBF_NEED_BREAK;
7047                        goto more_balance;
7048                }
7049
7050                /*
7051                 * Revisit (affine) tasks on src_cpu that couldn't be moved to
7052                 * us and move them to an alternate dst_cpu in our sched_group
7053                 * where they can run. The upper limit on how many times we
7054                 * iterate on same src_cpu is dependent on number of cpus in our
7055                 * sched_group.
7056                 *
7057                 * This changes load balance semantics a bit on who can move
7058                 * load to a given_cpu. In addition to the given_cpu itself
7059                 * (or a ilb_cpu acting on its behalf where given_cpu is
7060                 * nohz-idle), we now have balance_cpu in a position to move
7061                 * load to given_cpu. In rare situations, this may cause
7062                 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
7063                 * _independently_ and at _same_ time to move some load to
7064                 * given_cpu) causing exceess load to be moved to given_cpu.
7065                 * This however should not happen so much in practice and
7066                 * moreover subsequent load balance cycles should correct the
7067                 * excess load moved.
7068                 */
7069                if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
7070
7071                        /* Prevent to re-select dst_cpu via env's cpus */
7072                        cpumask_clear_cpu(env.dst_cpu, env.cpus);
7073
7074                        env.dst_rq       = cpu_rq(env.new_dst_cpu);
7075                        env.dst_cpu      = env.new_dst_cpu;
7076                        env.flags       &= ~LBF_DST_PINNED;
7077                        env.loop         = 0;
7078                        env.loop_break   = sched_nr_migrate_break;
7079
7080                        /*
7081                         * Go back to "more_balance" rather than "redo" since we
7082                         * need to continue with same src_cpu.
7083                         */
7084                        goto more_balance;
7085                }
7086
7087                /*
7088                 * We failed to reach balance because of affinity.
7089                 */
7090                if (sd_parent) {
7091                        int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7092
7093                        if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
7094                                *group_imbalance = 1;
7095                }
7096
7097                /* All tasks on this runqueue were pinned by CPU affinity */
7098                if (unlikely(env.flags & LBF_ALL_PINNED)) {
7099                        cpumask_clear_cpu(cpu_of(busiest), cpus);
7100                        if (!cpumask_empty(cpus)) {
7101                                env.loop = 0;
7102                                env.loop_break = sched_nr_migrate_break;
7103                                goto redo;
7104                        }
7105                        goto out_all_pinned;
7106                }
7107        }
7108
7109        if (!ld_moved) {
7110                schedstat_inc(sd, lb_failed[idle]);
7111                /*
7112                 * Increment the failure counter only on periodic balance.
7113                 * We do not want newidle balance, which can be very
7114                 * frequent, pollute the failure counter causing
7115                 * excessive cache_hot migrations and active balances.
7116                 */
7117                if (idle != CPU_NEWLY_IDLE)
7118                        sd->nr_balance_failed++;
7119
7120                if (need_active_balance(&env)) {
7121                        raw_spin_lock_irqsave(&busiest->lock, flags);
7122
7123                        /* don't kick the active_load_balance_cpu_stop,
7124                         * if the curr task on busiest cpu can't be
7125                         * moved to this_cpu
7126                         */
7127                        if (!cpumask_test_cpu(this_cpu,
7128                                        tsk_cpus_allowed(busiest->curr))) {
7129                                raw_spin_unlock_irqrestore(&busiest->lock,
7130                                                            flags);
7131                                env.flags |= LBF_ALL_PINNED;
7132                                goto out_one_pinned;
7133                        }
7134
7135                        /*
7136                         * ->active_balance synchronizes accesses to
7137                         * ->active_balance_work.  Once set, it's cleared
7138                         * only after active load balance is finished.
7139                         */
7140                        if (!busiest->active_balance) {
7141                                busiest->active_balance = 1;
7142                                busiest->push_cpu = this_cpu;
7143                                active_balance = 1;
7144                        }
7145                        raw_spin_unlock_irqrestore(&busiest->lock, flags);
7146
7147                        if (active_balance) {
7148                                stop_one_cpu_nowait(cpu_of(busiest),
7149                                        active_load_balance_cpu_stop, busiest,
7150                                        &busiest->active_balance_work);
7151                        }
7152
7153                        /*
7154                         * We've kicked active balancing, reset the failure
7155                         * counter.
7156                         */
7157                        sd->nr_balance_failed = sd->cache_nice_tries+1;
7158                }
7159        } else
7160                sd->nr_balance_failed = 0;
7161
7162        if (likely(!active_balance)) {
7163                /* We were unbalanced, so reset the balancing interval */
7164                sd->balance_interval = sd->min_interval;
7165        } else {
7166                /*
7167                 * If we've begun active balancing, start to back off. This
7168                 * case may not be covered by the all_pinned logic if there
7169                 * is only 1 task on the busy runqueue (because we don't call
7170                 * detach_tasks).
7171                 */
7172                if (sd->balance_interval < sd->max_interval)
7173                        sd->balance_interval *= 2;
7174        }
7175
7176        goto out;
7177
7178out_balanced:
7179        /*
7180         * We reach balance although we may have faced some affinity
7181         * constraints. Clear the imbalance flag if it was set.
7182         */
7183        if (sd_parent) {
7184                int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7185
7186                if (*group_imbalance)
7187                        *group_imbalance = 0;
7188        }
7189
7190out_all_pinned:
7191        /*
7192         * We reach balance because all tasks are pinned at this level so
7193         * we can't migrate them. Let the imbalance flag set so parent level
7194         * can try to migrate them.
7195         */
7196        schedstat_inc(sd, lb_balanced[idle]);
7197
7198        sd->nr_balance_failed = 0;
7199
7200out_one_pinned:
7201        /* tune up the balancing interval */
7202        if (((env.flags & LBF_ALL_PINNED) &&
7203                        sd->balance_interval < MAX_PINNED_INTERVAL) ||
7204                        (sd->balance_interval < sd->max_interval))
7205                sd->balance_interval *= 2;
7206
7207        ld_moved = 0;
7208out:
7209        return ld_moved;
7210}
7211
7212static inline unsigned long
7213get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
7214{
7215        unsigned long interval = sd->balance_interval;
7216
7217        if (cpu_busy)
7218                interval *= sd->busy_factor;
7219
7220        /* scale ms to jiffies */
7221        interval = msecs_to_jiffies(interval);
7222        interval = clamp(interval, 1UL, max_load_balance_interval);
7223
7224        return interval;
7225}
7226
7227static inline void
7228update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
7229{
7230        unsigned long interval, next;
7231
7232        interval = get_sd_balance_interval(sd, cpu_busy);
7233        next = sd->last_balance + interval;
7234
7235        if (time_after(*next_balance, next))
7236                *next_balance = next;
7237}
7238
7239/*
7240 * idle_balance is called by schedule() if this_cpu is about to become
7241 * idle. Attempts to pull tasks from other CPUs.
7242 */
7243static int idle_balance(struct rq *this_rq)
7244{
7245        unsigned long next_balance = jiffies + HZ;
7246        int this_cpu = this_rq->cpu;
7247        struct sched_domain *sd;
7248        int pulled_task = 0;
7249        u64 curr_cost = 0;
7250
7251        idle_enter_fair(this_rq);
7252
7253        /*
7254         * We must set idle_stamp _before_ calling idle_balance(), such that we
7255         * measure the duration of idle_balance() as idle time.
7256         */
7257        this_rq->idle_stamp = rq_clock(this_rq);
7258
7259        if (this_rq->avg_idle < sysctl_sched_migration_cost ||
7260            !this_rq->rd->overload) {
7261                rcu_read_lock();
7262                sd = rcu_dereference_check_sched_domain(this_rq->sd);
7263                if (sd)
7264                        update_next_balance(sd, 0, &next_balance);
7265                rcu_read_unlock();
7266
7267                goto out;
7268        }
7269
7270        raw_spin_unlock(&this_rq->lock);
7271
7272        update_blocked_averages(this_cpu);
7273        rcu_read_lock();
7274        for_each_domain(this_cpu, sd) {
7275                int continue_balancing = 1;
7276                u64 t0, domain_cost;
7277
7278                if (!(sd->flags & SD_LOAD_BALANCE))
7279                        continue;
7280
7281                if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
7282                        update_next_balance(sd, 0, &next_balance);
7283                        break;
7284                }
7285
7286                if (sd->flags & SD_BALANCE_NEWIDLE) {
7287                        t0 = sched_clock_cpu(this_cpu);
7288
7289                        pulled_task = load_balance(this_cpu, this_rq,
7290                                                   sd, CPU_NEWLY_IDLE,
7291                                                   &continue_balancing);
7292
7293                        domain_cost = sched_clock_cpu(this_cpu) - t0;
7294                        if (domain_cost > sd->max_newidle_lb_cost)
7295                                sd->max_newidle_lb_cost = domain_cost;
7296
7297                        curr_cost += domain_cost;
7298                }
7299
7300                update_next_balance(sd, 0, &next_balance);
7301
7302                /*
7303                 * Stop searching for tasks to pull if there are
7304                 * now runnable tasks on this rq.
7305                 */
7306                if (pulled_task || this_rq->nr_running > 0)
7307                        break;
7308        }
7309        rcu_read_unlock();
7310
7311        raw_spin_lock(&this_rq->lock);
7312
7313        if (curr_cost > this_rq->max_idle_balance_cost)
7314                this_rq->max_idle_balance_cost = curr_cost;
7315
7316        /*
7317         * While browsing the domains, we released the rq lock, a task could
7318         * have been enqueued in the meantime. Since we're not going idle,
7319         * pretend we pulled a task.
7320         */
7321        if (this_rq->cfs.h_nr_running && !pulled_task)
7322                pulled_task = 1;
7323
7324out:
7325        /* Move the next balance forward */
7326        if (time_after(this_rq->next_balance, next_balance))
7327                this_rq->next_balance = next_balance;
7328
7329        /* Is there a task of a high priority class? */
7330        if (this_rq->nr_running != this_rq->cfs.h_nr_running)
7331                pulled_task = -1;
7332
7333        if (pulled_task) {
7334                idle_exit_fair(this_rq);
7335                this_rq->idle_stamp = 0;
7336        }
7337
7338        return pulled_task;
7339}
7340
7341/*
7342 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
7343 * running tasks off the busiest CPU onto idle CPUs. It requires at
7344 * least 1 task to be running on each physical CPU where possible, and
7345 * avoids physical / logical imbalances.
7346 */
7347static int active_load_balance_cpu_stop(void *data)
7348{
7349        struct rq *busiest_rq = data;
7350        int busiest_cpu = cpu_of(busiest_rq);
7351        int target_cpu = busiest_rq->push_cpu;
7352        struct rq *target_rq = cpu_rq(target_cpu);
7353        struct sched_domain *sd;
7354        struct task_struct *p = NULL;
7355
7356        raw_spin_lock_irq(&busiest_rq->lock);
7357
7358        /* make sure the requested cpu hasn't gone down in the meantime */
7359        if (unlikely(busiest_cpu != smp_processor_id() ||
7360                     !busiest_rq->active_balance))
7361                goto out_unlock;
7362
7363        /* Is there any task to move? */
7364        if (busiest_rq->nr_running <= 1)
7365                goto out_unlock;
7366
7367        /*
7368         * This condition is "impossible", if it occurs
7369         * we need to fix it. Originally reported by
7370         * Bjorn Helgaas on a 128-cpu setup.
7371         */
7372        BUG_ON(busiest_rq == target_rq);
7373
7374        /* Search for an sd spanning us and the target CPU. */
7375        rcu_read_lock();
7376        for_each_domain(target_cpu, sd) {
7377                if ((sd->flags & SD_LOAD_BALANCE) &&
7378                    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
7379                                break;
7380        }
7381
7382        if (likely(sd)) {
7383                struct lb_env env = {
7384                        .sd             = sd,
7385                        .dst_cpu        = target_cpu,
7386                        .dst_rq         = target_rq,
7387                        .src_cpu        = busiest_rq->cpu,
7388                        .src_rq         = busiest_rq,
7389                        .idle           = CPU_IDLE,
7390                };
7391
7392                schedstat_inc(sd, alb_count);
7393
7394                p = detach_one_task(&env);
7395                if (p)
7396                        schedstat_inc(sd, alb_pushed);
7397                else
7398                        schedstat_inc(sd, alb_failed);
7399        }
7400        rcu_read_unlock();
7401out_unlock:
7402        busiest_rq->active_balance = 0;
7403        raw_spin_unlock(&busiest_rq->lock);
7404
7405        if (p)
7406                attach_one_task(target_rq, p);
7407
7408        local_irq_enable();
7409
7410        return 0;
7411}
7412
7413static inline int on_null_domain(struct rq *rq)
7414{
7415        return unlikely(!rcu_dereference_sched(rq->sd));
7416}
7417
7418#ifdef CONFIG_NO_HZ_COMMON
7419/*
7420 * idle load balancing details
7421 * - When one of the busy CPUs notice that there may be an idle rebalancing
7422 *   needed, they will kick the idle load balancer, which then does idle
7423 *   load balancing for all the idle CPUs.
7424 */
7425static struct {
7426        cpumask_var_t idle_cpus_mask;
7427        atomic_t nr_cpus;
7428        unsigned long next_balance;     /* in jiffy units */
7429} nohz ____cacheline_aligned;
7430
7431static inline int find_new_ilb(void)
7432{
7433        int ilb = cpumask_first(nohz.idle_cpus_mask);
7434
7435        if (ilb < nr_cpu_ids && idle_cpu(ilb))
7436                return ilb;
7437
7438        return nr_cpu_ids;
7439}
7440
7441/*
7442 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
7443 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
7444 * CPU (if there is one).
7445 */
7446static void nohz_balancer_kick(void)
7447{
7448        int ilb_cpu;
7449
7450        nohz.next_balance++;
7451
7452        ilb_cpu = find_new_ilb();
7453
7454        if (ilb_cpu >= nr_cpu_ids)
7455                return;
7456
7457        if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
7458                return;
7459        /*
7460         * Use smp_send_reschedule() instead of resched_cpu().
7461         * This way we generate a sched IPI on the target cpu which
7462         * is idle. And the softirq performing nohz idle load balance
7463         * will be run before returning from the IPI.
7464         */
7465        smp_send_reschedule(ilb_cpu);
7466        return;
7467}
7468
7469static inline void nohz_balance_exit_idle(int cpu)
7470{
7471        if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
7472                /*
7473                 * Completely isolated CPUs don't ever set, so we must test.
7474                 */
7475                if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
7476                        cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
7477                        atomic_dec(&nohz.nr_cpus);
7478                }
7479                clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7480        }
7481}
7482
7483static inline void set_cpu_sd_state_busy(void)
7484{
7485        struct sched_domain *sd;
7486        int cpu = smp_processor_id();
7487
7488        rcu_read_lock();
7489        sd = rcu_dereference(per_cpu(sd_busy, cpu));
7490
7491        if (!sd || !sd->nohz_idle)
7492                goto unlock;
7493        sd->nohz_idle = 0;
7494
7495        atomic_inc(&sd->groups->sgc->nr_busy_cpus);
7496unlock:
7497        rcu_read_unlock();
7498}
7499
7500void set_cpu_sd_state_idle(void)
7501{
7502        struct sched_domain *sd;
7503        int cpu = smp_processor_id();
7504
7505        rcu_read_lock();
7506        sd = rcu_dereference(per_cpu(sd_busy, cpu));
7507
7508        if (!sd || sd->nohz_idle)
7509                goto unlock;
7510        sd->nohz_idle = 1;
7511
7512        atomic_dec(&sd->groups->sgc->nr_busy_cpus);
7513unlock:
7514        rcu_read_unlock();
7515}
7516
7517/*
7518 * This routine will record that the cpu is going idle with tick stopped.
7519 * This info will be used in performing idle load balancing in the future.
7520 */
7521void nohz_balance_enter_idle(int cpu)
7522{
7523        /*
7524         * If this cpu is going down, then nothing needs to be done.
7525         */
7526        if (!cpu_active(cpu))
7527                return;
7528
7529        if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
7530                return;
7531
7532        /*
7533         * If we're a completely isolated CPU, we don't play.
7534         */
7535        if (on_null_domain(cpu_rq(cpu)))
7536                return;
7537
7538        cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
7539        atomic_inc(&nohz.nr_cpus);
7540        set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7541}
7542
7543static int sched_ilb_notifier(struct notifier_block *nfb,
7544                                        unsigned long action, void *hcpu)
7545{
7546        switch (action & ~CPU_TASKS_FROZEN) {
7547        case CPU_DYING:
7548                nohz_balance_exit_idle(smp_processor_id());
7549                return NOTIFY_OK;
7550        default:
7551                return NOTIFY_DONE;
7552        }
7553}
7554#endif
7555
7556static DEFINE_SPINLOCK(balancing);
7557
7558/*
7559 * Scale the max load_balance interval with the number of CPUs in the system.
7560 * This trades load-balance latency on larger machines for less cross talk.
7561 */
7562void update_max_interval(void)
7563{
7564        max_load_balance_interval = HZ*num_online_cpus()/10;
7565}
7566
7567/*
7568 * It checks each scheduling domain to see if it is due to be balanced,
7569 * and initiates a balancing operation if so.
7570 *
7571 * Balancing parameters are set up in init_sched_domains.
7572 */
7573static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7574{
7575        int continue_balancing = 1;
7576        int cpu = rq->cpu;
7577        unsigned long interval;
7578        struct sched_domain *sd;
7579        /* Earliest time when we have to do rebalance again */
7580        unsigned long next_balance = jiffies + 60*HZ;
7581        int update_next_balance = 0;
7582        int need_serialize, need_decay = 0;
7583        u64 max_cost = 0;
7584
7585        update_blocked_averages(cpu);
7586
7587        rcu_read_lock();
7588        for_each_domain(cpu, sd) {
7589                /*
7590                 * Decay the newidle max times here because this is a regular
7591                 * visit to all the domains. Decay ~1% per second.
7592                 */
7593                if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
7594                        sd->max_newidle_lb_cost =
7595                                (sd->max_newidle_lb_cost * 253) / 256;
7596                        sd->next_decay_max_lb_cost = jiffies + HZ;
7597                        need_decay = 1;
7598                }
7599                max_cost += sd->max_newidle_lb_cost;
7600
7601                if (!(sd->flags & SD_LOAD_BALANCE))
7602                        continue;
7603
7604                /*
7605                 * Stop the load balance at this level. There is another
7606                 * CPU in our sched group which is doing load balancing more
7607                 * actively.
7608                 */
7609                if (!continue_balancing) {
7610                        if (need_decay)
7611                                continue;
7612                        break;
7613                }
7614
7615                interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7616
7617                need_serialize = sd->flags & SD_SERIALIZE;
7618                if (need_serialize) {
7619                        if (!spin_trylock(&balancing))
7620                                goto out;
7621                }
7622
7623                if (time_after_eq(jiffies, sd->last_balance + interval)) {
7624                        if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
7625                                /*
7626                                 * The LBF_DST_PINNED logic could have changed
7627                                 * env->dst_cpu, so we can't know our idle
7628                                 * state even if we migrated tasks. Update it.
7629                                 */
7630                                idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
7631                        }
7632                        sd->last_balance = jiffies;
7633                        interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7634                }
7635                if (need_serialize)
7636                        spin_unlock(&balancing);
7637out:
7638                if (time_after(next_balance, sd->last_balance + interval)) {
7639                        next_balance = sd->last_balance + interval;
7640                        update_next_balance = 1;
7641                }
7642        }
7643        if (need_decay) {
7644                /*
7645                 * Ensure the rq-wide value also decays but keep it at a
7646                 * reasonable floor to avoid funnies with rq->avg_idle.
7647                 */
7648                rq->max_idle_balance_cost =
7649                        max((u64)sysctl_sched_migration_cost, max_cost);
7650        }
7651        rcu_read_unlock();
7652
7653        /*
7654         * next_balance will be updated only when there is a need.
7655         * When the cpu is attached to null domain for ex, it will not be
7656         * updated.
7657         */
7658        if (likely(update_next_balance)) {
7659                rq->next_balance = next_balance;
7660
7661#ifdef CONFIG_NO_HZ_COMMON
7662                /*
7663                 * If this CPU has been elected to perform the nohz idle
7664                 * balance. Other idle CPUs have already rebalanced with
7665                 * nohz_idle_balance() and nohz.next_balance has been
7666                 * updated accordingly. This CPU is now running the idle load
7667                 * balance for itself and we need to update the
7668                 * nohz.next_balance accordingly.
7669                 */
7670                if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
7671                        nohz.next_balance = rq->next_balance;
7672#endif
7673        }
7674}
7675
7676#ifdef CONFIG_NO_HZ_COMMON
7677/*
7678 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
7679 * rebalancing for all the cpus for whom scheduler ticks are stopped.
7680 */
7681static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7682{
7683        int this_cpu = this_rq->cpu;
7684        struct rq *rq;
7685        int balance_cpu;
7686        /* Earliest time when we have to do rebalance again */
7687        unsigned long next_balance = jiffies + 60*HZ;
7688        int update_next_balance = 0;
7689
7690        if (idle != CPU_IDLE ||
7691            !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
7692                goto end;
7693
7694        for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
7695                if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
7696                        continue;
7697
7698                /*
7699                 * If this cpu gets work to do, stop the load balancing
7700                 * work being done for other cpus. Next load
7701                 * balancing owner will pick it up.
7702                 */
7703                if (need_resched())
7704                        break;
7705
7706                rq = cpu_rq(balance_cpu);
7707
7708                /*
7709                 * If time for next balance is due,
7710                 * do the balance.
7711                 */
7712                if (time_after_eq(jiffies, rq->next_balance)) {
7713                        raw_spin_lock_irq(&rq->lock);
7714                        update_rq_clock(rq);
7715                        update_idle_cpu_load(rq);
7716                        raw_spin_unlock_irq(&rq->lock);
7717                        rebalance_domains(rq, CPU_IDLE);
7718                }
7719
7720                if (time_after(next_balance, rq->next_balance)) {
7721                        next_balance = rq->next_balance;
7722                        update_next_balance = 1;
7723                }
7724        }
7725
7726        /*
7727         * next_balance will be updated only when there is a need.
7728         * When the CPU is attached to null domain for ex, it will not be
7729         * updated.
7730         */
7731        if (likely(update_next_balance))
7732                nohz.next_balance = next_balance;
7733end:
7734        clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
7735}
7736
7737/*
7738 * Current heuristic for kicking the idle load balancer in the presence
7739 * of an idle cpu in the system.
7740 *   - This rq has more than one task.
7741 *   - This rq has at least one CFS task and the capacity of the CPU is
7742 *     significantly reduced because of RT tasks or IRQs.
7743 *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
7744 *     multiple busy cpu.
7745 *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
7746 *     domain span are idle.
7747 */
7748static inline bool nohz_kick_needed(struct rq *rq)
7749{
7750        unsigned long now = jiffies;
7751        struct sched_domain *sd;
7752        struct sched_group_capacity *sgc;
7753        int nr_busy, cpu = rq->cpu;
7754        bool kick = false;
7755
7756        if (unlikely(rq->idle_balance))
7757                return false;
7758
7759       /*
7760        * We may be recently in ticked or tickless idle mode. At the first
7761        * busy tick after returning from idle, we will update the busy stats.
7762        */
7763        set_cpu_sd_state_busy();
7764        nohz_balance_exit_idle(cpu);
7765
7766        /*
7767         * None are in tickless mode and hence no need for NOHZ idle load
7768         * balancing.
7769         */
7770        if (likely(!atomic_read(&nohz.nr_cpus)))
7771                return false;
7772
7773        if (time_before(now, nohz.next_balance))
7774                return false;
7775
7776        if (rq->nr_running >= 2)
7777                return true;
7778
7779        rcu_read_lock();
7780        sd = rcu_dereference(per_cpu(sd_busy, cpu));
7781        if (sd) {
7782                sgc = sd->groups->sgc;
7783                nr_busy = atomic_read(&sgc->nr_busy_cpus);
7784
7785                if (nr_busy > 1) {
7786                        kick = true;
7787                        goto unlock;
7788                }
7789
7790        }
7791
7792        sd = rcu_dereference(rq->sd);
7793        if (sd) {
7794                if ((rq->cfs.h_nr_running >= 1) &&
7795                                check_cpu_capacity(rq, sd)) {
7796                        kick = true;
7797                        goto unlock;
7798                }
7799        }
7800
7801        sd = rcu_dereference(per_cpu(sd_asym, cpu));
7802        if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
7803                                  sched_domain_span(sd)) < cpu)) {
7804                kick = true;
7805                goto unlock;
7806        }
7807
7808unlock:
7809        rcu_read_unlock();
7810        return kick;
7811}
7812#else
7813static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
7814#endif
7815
7816/*
7817 * run_rebalance_domains is triggered when needed from the scheduler tick.
7818 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
7819 */
7820static void run_rebalance_domains(struct softirq_action *h)
7821{
7822        struct rq *this_rq = this_rq();
7823        enum cpu_idle_type idle = this_rq->idle_balance ?
7824                                                CPU_IDLE : CPU_NOT_IDLE;
7825
7826        /*
7827         * If this cpu has a pending nohz_balance_kick, then do the
7828         * balancing on behalf of the other idle cpus whose ticks are
7829         * stopped. Do nohz_idle_balance *before* rebalance_domains to
7830         * give the idle cpus a chance to load balance. Else we may
7831         * load balance only within the local sched_domain hierarchy
7832         * and abort nohz_idle_balance altogether if we pull some load.
7833         */
7834        nohz_idle_balance(this_rq, idle);
7835        rebalance_domains(this_rq, idle);
7836}
7837
7838/*
7839 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
7840 */
7841void trigger_load_balance(struct rq *rq)
7842{
7843        /* Don't need to rebalance while attached to NULL domain */
7844        if (unlikely(on_null_domain(rq)))
7845                return;
7846
7847        if (time_after_eq(jiffies, rq->next_balance))
7848                raise_softirq(SCHED_SOFTIRQ);
7849#ifdef CONFIG_NO_HZ_COMMON
7850        if (nohz_kick_needed(rq))
7851                nohz_balancer_kick();
7852#endif
7853}
7854
7855static void rq_online_fair(struct rq *rq)
7856{
7857        update_sysctl();
7858
7859        update_runtime_enabled(rq);
7860}
7861
7862static void rq_offline_fair(struct rq *rq)
7863{
7864        update_sysctl();
7865
7866        /* Ensure any throttled groups are reachable by pick_next_task */
7867        unthrottle_offline_cfs_rqs(rq);
7868}
7869
7870#endif /* CONFIG_SMP */
7871
7872/*
7873 * scheduler tick hitting a task of our scheduling class:
7874 */
7875static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
7876{
7877        struct cfs_rq *cfs_rq;
7878        struct sched_entity *se = &curr->se;
7879
7880        for_each_sched_entity(se) {
7881                cfs_rq = cfs_rq_of(se);
7882                entity_tick(cfs_rq, se, queued);
7883        }
7884
7885        if (static_branch_unlikely(&sched_numa_balancing))
7886                task_tick_numa(rq, curr);
7887}
7888
7889/*
7890 * called on fork with the child task as argument from the parent's context
7891 *  - child not yet on the tasklist
7892 *  - preemption disabled
7893 */
7894static void task_fork_fair(struct task_struct *p)
7895{
7896        struct cfs_rq *cfs_rq;
7897        struct sched_entity *se = &p->se, *curr;
7898        int this_cpu = smp_processor_id();
7899        struct rq *rq = this_rq();
7900        unsigned long flags;
7901
7902        raw_spin_lock_irqsave(&rq->lock, flags);
7903
7904        update_rq_clock(rq);
7905
7906        cfs_rq = task_cfs_rq(current);
7907        curr = cfs_rq->curr;
7908
7909        /*
7910         * Not only the cpu but also the task_group of the parent might have
7911         * been changed after parent->se.parent,cfs_rq were copied to
7912         * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
7913         * of child point to valid ones.
7914         */
7915        rcu_read_lock();
7916        __set_task_cpu(p, this_cpu);
7917        rcu_read_unlock();
7918
7919        update_curr(cfs_rq);
7920
7921        if (curr)
7922                se->vruntime = curr->vruntime;
7923        place_entity(cfs_rq, se, 1);
7924
7925        if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
7926                /*
7927                 * Upon rescheduling, sched_class::put_prev_task() will place
7928                 * 'current' within the tree based on its new key value.
7929                 */
7930                swap(curr->vruntime, se->vruntime);
7931                resched_curr(rq);
7932        }
7933
7934        se->vruntime -= cfs_rq->min_vruntime;
7935
7936        raw_spin_unlock_irqrestore(&rq->lock, flags);
7937}
7938
7939/*
7940 * Priority of the task has changed. Check to see if we preempt
7941 * the current task.
7942 */
7943static void
7944prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7945{
7946        if (!task_on_rq_queued(p))
7947                return;
7948
7949        /*
7950         * Reschedule if we are currently running on this runqueue and
7951         * our priority decreased, or if we are not currently running on
7952         * this runqueue and our priority is higher than the current's
7953         */
7954        if (rq->curr == p) {
7955                if (p->prio > oldprio)
7956                        resched_curr(rq);
7957        } else
7958                check_preempt_curr(rq, p, 0);
7959}
7960
7961static inline bool vruntime_normalized(struct task_struct *p)
7962{
7963        struct sched_entity *se = &p->se;
7964
7965        /*
7966         * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
7967         * the dequeue_entity(.flags=0) will already have normalized the
7968         * vruntime.
7969         */
7970        if (p->on_rq)
7971                return true;
7972
7973        /*
7974         * When !on_rq, vruntime of the task has usually NOT been normalized.
7975         * But there are some cases where it has already been normalized:
7976         *
7977         * - A forked child which is waiting for being woken up by
7978         *   wake_up_new_task().
7979         * - A task which has been woken up by try_to_wake_up() and
7980         *   waiting for actually being woken up by sched_ttwu_pending().
7981         */
7982        if (!se->sum_exec_runtime || p->state == TASK_WAKING)
7983                return true;
7984
7985        return false;
7986}
7987
7988static void detach_task_cfs_rq(struct task_struct *p)
7989{
7990        struct sched_entity *se = &p->se;
7991        struct cfs_rq *cfs_rq = cfs_rq_of(se);
7992
7993        if (!vruntime_normalized(p)) {
7994                /*
7995                 * Fix up our vruntime so that the current sleep doesn't
7996                 * cause 'unlimited' sleep bonus.
7997                 */
7998                place_entity(cfs_rq, se, 0);
7999                se->vruntime -= cfs_rq->min_vruntime;
8000        }
8001
8002        /* Catch up with the cfs_rq and remove our load when we leave */
8003        detach_entity_load_avg(cfs_rq, se);
8004}
8005
8006static void attach_task_cfs_rq(struct task_struct *p)
8007{
8008        struct sched_entity *se = &p->se;
8009        struct cfs_rq *cfs_rq = cfs_rq_of(se);
8010
8011#ifdef CONFIG_FAIR_GROUP_SCHED
8012        /*
8013         * Since the real-depth could have been changed (only FAIR
8014         * class maintain depth value), reset depth properly.
8015         */
8016        se->depth = se->parent ? se->parent->depth + 1 : 0;
8017#endif
8018
8019        /* Synchronize task with its cfs_rq */
8020        attach_entity_load_avg(cfs_rq, se);
8021
8022        if (!vruntime_normalized(p))
8023                se->vruntime += cfs_rq->min_vruntime;
8024}
8025
8026static void switched_from_fair(struct rq *rq, struct task_struct *p)
8027{
8028        detach_task_cfs_rq(p);
8029}
8030
8031static void switched_to_fair(struct rq *rq, struct task_struct *p)
8032{
8033        attach_task_cfs_rq(p);
8034
8035        if (task_on_rq_queued(p)) {
8036                /*
8037                 * We were most likely switched from sched_rt, so
8038                 * kick off the schedule if running, otherwise just see
8039                 * if we can still preempt the current task.
8040                 */
8041                if (rq->curr == p)
8042                        resched_curr(rq);
8043                else
8044                        check_preempt_curr(rq, p, 0);
8045        }
8046}
8047
8048/* Account for a task changing its policy or group.
8049 *
8050 * This routine is mostly called to set cfs_rq->curr field when a task
8051 * migrates between groups/classes.
8052 */
8053static void set_curr_task_fair(struct rq *rq)
8054{
8055        struct sched_entity *se = &rq->curr->se;
8056
8057        for_each_sched_entity(se) {
8058                struct cfs_rq *cfs_rq = cfs_rq_of(se);
8059
8060                set_next_entity(cfs_rq, se);
8061                /* ensure bandwidth has been allocated on our new cfs_rq */
8062                account_cfs_rq_runtime(cfs_rq, 0);
8063        }
8064}
8065
8066void init_cfs_rq(struct cfs_rq *cfs_rq)
8067{
8068        cfs_rq->tasks_timeline = RB_ROOT;
8069        cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8070#ifndef CONFIG_64BIT
8071        cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8072#endif
8073#ifdef CONFIG_SMP
8074        atomic_long_set(&cfs_rq->removed_load_avg, 0);
8075        atomic_long_set(&cfs_rq->removed_util_avg, 0);
8076#endif
8077}
8078
8079#ifdef CONFIG_FAIR_GROUP_SCHED
8080static void task_move_group_fair(struct task_struct *p)
8081{
8082        detach_task_cfs_rq(p);
8083        set_task_rq(p, task_cpu(p));
8084
8085#ifdef CONFIG_SMP
8086        /* Tell se's cfs_rq has been changed -- migrated */
8087        p->se.avg.last_update_time = 0;
8088#endif
8089        attach_task_cfs_rq(p);
8090}
8091
8092void free_fair_sched_group(struct task_group *tg)
8093{
8094        int i;
8095
8096        destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8097
8098        for_each_possible_cpu(i) {
8099                if (tg->cfs_rq)
8100                        kfree(tg->cfs_rq[i]);
8101                if (tg->se) {
8102                        if (tg->se[i])
8103                                remove_entity_load_avg(tg->se[i]);
8104                        kfree(tg->se[i]);
8105                }
8106        }
8107
8108        kfree(tg->cfs_rq);
8109        kfree(tg->se);
8110}
8111
8112int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8113{
8114        struct cfs_rq *cfs_rq;
8115        struct sched_entity *se;
8116        int i;
8117
8118        tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8119        if (!tg->cfs_rq)
8120                goto err;
8121        tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8122        if (!tg->se)
8123                goto err;
8124
8125        tg->shares = NICE_0_LOAD;
8126
8127        init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8128
8129        for_each_possible_cpu(i) {
8130                cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8131                                      GFP_KERNEL, cpu_to_node(i));
8132                if (!cfs_rq)
8133                        goto err;
8134
8135                se = kzalloc_node(sizeof(struct sched_entity),
8136                                  GFP_KERNEL, cpu_to_node(i));
8137                if (!se)
8138                        goto err_free_rq;
8139
8140                init_cfs_rq(cfs_rq);
8141                init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8142                init_entity_runnable_average(se);
8143        }
8144
8145        return 1;
8146
8147err_free_rq:
8148        kfree(cfs_rq);
8149err:
8150        return 0;
8151}
8152
8153void unregister_fair_sched_group(struct task_group *tg, int cpu)
8154{
8155        struct rq *rq = cpu_rq(cpu);
8156        unsigned long flags;
8157
8158        /*
8159        * Only empty task groups can be destroyed; so we can speculatively
8160        * check on_list without danger of it being re-added.
8161        */
8162        if (!tg->cfs_rq[cpu]->on_list)
8163                return;
8164
8165        raw_spin_lock_irqsave(&rq->lock, flags);
8166        list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8167        raw_spin_unlock_irqrestore(&rq->lock, flags);
8168}
8169
8170void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8171                        struct sched_entity *se, int cpu,
8172                        struct sched_entity *parent)
8173{
8174        struct rq *rq = cpu_rq(cpu);
8175
8176        cfs_rq->tg = tg;
8177        cfs_rq->rq = rq;
8178        init_cfs_rq_runtime(cfs_rq);
8179
8180        tg->cfs_rq[cpu] = cfs_rq;
8181        tg->se[cpu] = se;
8182
8183        /* se could be NULL for root_task_group */
8184        if (!se)
8185                return;
8186
8187        if (!parent) {
8188                se->cfs_rq = &rq->cfs;
8189                se->depth = 0;
8190        } else {
8191                se->cfs_rq = parent->my_q;
8192                se->depth = parent->depth + 1;
8193        }
8194
8195        se->my_q = cfs_rq;
8196        /* guarantee group entities always have weight */
8197        update_load_set(&se->load, NICE_0_LOAD);
8198        se->parent = parent;
8199}
8200
8201static DEFINE_MUTEX(shares_mutex);
8202
8203int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8204{
8205        int i;
8206        unsigned long flags;
8207
8208        /*
8209         * We can't change the weight of the root cgroup.
8210         */
8211        if (!tg->se[0])
8212                return -EINVAL;
8213
8214        shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8215
8216        mutex_lock(&shares_mutex);
8217        if (tg->shares == shares)
8218                goto done;
8219
8220        tg->shares = shares;
8221        for_each_possible_cpu(i) {
8222                struct rq *rq = cpu_rq(i);
8223                struct sched_entity *se;
8224
8225                se = tg->se[i];
8226                /* Propagate contribution to hierarchy */
8227                raw_spin_lock_irqsave(&rq->lock, flags);
8228
8229                /* Possible calls to update_curr() need rq clock */
8230                update_rq_clock(rq);
8231                for_each_sched_entity(se)
8232                        update_cfs_shares(group_cfs_rq(se));
8233                raw_spin_unlock_irqrestore(&rq->lock, flags);
8234        }
8235
8236done:
8237        mutex_unlock(&shares_mutex);
8238        return 0;
8239}
8240#else /* CONFIG_FAIR_GROUP_SCHED */
8241
8242void free_fair_sched_group(struct task_group *tg) { }
8243
8244int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8245{
8246        return 1;
8247}
8248
8249void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
8250
8251#endif /* CONFIG_FAIR_GROUP_SCHED */
8252
8253
8254static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
8255{
8256        struct sched_entity *se = &task->se;
8257        unsigned int rr_interval = 0;
8258
8259        /*
8260         * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
8261         * idle runqueue:
8262         */
8263        if (rq->cfs.load.weight)
8264                rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
8265
8266        return rr_interval;
8267}
8268
8269/*
8270 * All the scheduling class methods:
8271 */
8272const struct sched_class fair_sched_class = {
8273        .next                   = &idle_sched_class,
8274        .enqueue_task           = enqueue_task_fair,
8275        .dequeue_task           = dequeue_task_fair,
8276        .yield_task             = yield_task_fair,
8277        .yield_to_task          = yield_to_task_fair,
8278
8279        .check_preempt_curr     = check_preempt_wakeup,
8280
8281        .pick_next_task         = pick_next_task_fair,
8282        .put_prev_task          = put_prev_task_fair,
8283
8284#ifdef CONFIG_SMP
8285        .select_task_rq         = select_task_rq_fair,
8286        .migrate_task_rq        = migrate_task_rq_fair,
8287
8288        .rq_online              = rq_online_fair,
8289        .rq_offline             = rq_offline_fair,
8290
8291        .task_waking            = task_waking_fair,
8292        .task_dead              = task_dead_fair,
8293        .set_cpus_allowed       = set_cpus_allowed_common,
8294#endif
8295
8296        .set_curr_task          = set_curr_task_fair,
8297        .task_tick              = task_tick_fair,
8298        .task_fork              = task_fork_fair,
8299
8300        .prio_changed           = prio_changed_fair,
8301        .switched_from          = switched_from_fair,
8302        .switched_to            = switched_to_fair,
8303
8304        .get_rr_interval        = get_rr_interval_fair,
8305
8306        .update_curr            = update_curr_fair,
8307
8308#ifdef CONFIG_FAIR_GROUP_SCHED
8309        .task_move_group        = task_move_group_fair,
8310#endif
8311};
8312
8313#ifdef CONFIG_SCHED_DEBUG
8314void print_cfs_stats(struct seq_file *m, int cpu)
8315{
8316        struct cfs_rq *cfs_rq;
8317
8318        rcu_read_lock();
8319        for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
8320                print_cfs_rq(m, cpu, cfs_rq);
8321        rcu_read_unlock();
8322}
8323
8324#ifdef CONFIG_NUMA_BALANCING
8325void show_numa_stats(struct task_struct *p, struct seq_file *m)
8326{
8327        int node;
8328        unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
8329
8330        for_each_online_node(node) {
8331                if (p->numa_faults) {
8332                        tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
8333                        tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
8334                }
8335                if (p->numa_group) {
8336                        gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
8337                        gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
8338                }
8339                print_numa_stats(m, node, tsf, tpf, gsf, gpf);
8340        }
8341}
8342#endif /* CONFIG_NUMA_BALANCING */
8343#endif /* CONFIG_SCHED_DEBUG */
8344
8345__init void init_sched_fair_class(void)
8346{
8347#ifdef CONFIG_SMP
8348        open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8349
8350#ifdef CONFIG_NO_HZ_COMMON
8351        nohz.next_balance = jiffies;
8352        zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8353        cpu_notifier(sched_ilb_notifier, 0);
8354#endif
8355#endif /* SMP */
8356
8357}
8358