linux/kernel/sched/fair.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   4 *
   5 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   6 *
   7 *  Interactivity improvements by Mike Galbraith
   8 *  (C) 2007 Mike Galbraith <efault@gmx.de>
   9 *
  10 *  Various enhancements by Dmitry Adamushko.
  11 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  12 *
  13 *  Group scheduling enhancements by Srivatsa Vaddagiri
  14 *  Copyright IBM Corporation, 2007
  15 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  16 *
  17 *  Scaled math optimizations by Thomas Gleixner
  18 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  19 *
  20 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  21 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  22 */
  23#include "sched.h"
  24
  25#include <trace/events/sched.h>
  26
  27/*
  28 * Targeted preemption latency for CPU-bound tasks:
  29 *
  30 * NOTE: this latency value is not the same as the concept of
  31 * 'timeslice length' - timeslices in CFS are of variable length
   32 * and are not a persistent concept as in traditional, time-slice
   33 * based schedulers.
  34 *
  35 * (to see the precise effective timeslice length of your workload,
  36 *  run vmstat and monitor the context-switches (cs) field)
  37 *
  38 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  39 */
  40unsigned int sysctl_sched_latency                       = 6000000ULL;
  41static unsigned int normalized_sysctl_sched_latency     = 6000000ULL;
  42
  43/*
  44 * The initial- and re-scaling of tunables is configurable
  45 *
  46 * Options are:
  47 *
  48 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
   49 *   SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
   50 *   SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
   51 *
   52 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
  53 */
  54enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
  55
  56/*
  57 * Minimal preemption granularity for CPU-bound tasks:
  58 *
  59 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  60 */
  61unsigned int sysctl_sched_min_granularity                       = 750000ULL;
  62static unsigned int normalized_sysctl_sched_min_granularity     = 750000ULL;
  63
  64/*
  65 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
  66 */
  67static unsigned int sched_nr_latency = 8;
  68
  69/*
  70 * After fork, child runs first. If set to 0 (default) then
  71 * parent will (try to) run first.
  72 */
  73unsigned int sysctl_sched_child_runs_first __read_mostly;
  74
  75/*
  76 * SCHED_OTHER wake-up granularity.
  77 *
  78 * This option delays the preemption effects of decoupled workloads
  79 * and reduces their over-scheduling. Synchronous workloads will still
  80 * have immediate wakeup/sleep latencies.
  81 *
  82 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  83 */
  84unsigned int sysctl_sched_wakeup_granularity                    = 1000000UL;
  85static unsigned int normalized_sysctl_sched_wakeup_granularity  = 1000000UL;
  86
  87const_debug unsigned int sysctl_sched_migration_cost    = 500000UL;
  88
  89#ifdef CONFIG_SMP
  90/*
  91 * For asym packing, by default the lower numbered CPU has higher priority.
  92 */
  93int __weak arch_asym_cpu_priority(int cpu)
  94{
  95        return -cpu;
  96}
  97
  98/*
  99 * The margin used when comparing utilization with CPU capacity:
 100 * util * margin < capacity * 1024
 101 *
 102 * (default: ~20%)
 103 */
 104static unsigned int capacity_margin                     = 1280;
 105#endif
 106
 107#ifdef CONFIG_CFS_BANDWIDTH
 108/*
 109 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 110 * each time a cfs_rq requests quota.
 111 *
  112 * Note: if the slice exceeds the remaining runtime (either because it has
  113 * been consumed or because the quota is smaller than the slice), we only
  114 * issue the remaining available time.
 115 *
 116 * (default: 5 msec, units: microseconds)
 117 */
 118unsigned int sysctl_sched_cfs_bandwidth_slice           = 5000UL;
 119#endif
 120
 121static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 122{
 123        lw->weight += inc;
 124        lw->inv_weight = 0;
 125}
 126
 127static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 128{
 129        lw->weight -= dec;
 130        lw->inv_weight = 0;
 131}
 132
 133static inline void update_load_set(struct load_weight *lw, unsigned long w)
 134{
 135        lw->weight = w;
 136        lw->inv_weight = 0;
 137}
 138
 139/*
 140 * Increase the granularity value when there are more CPUs,
 141 * because with more CPUs the 'effective latency' as visible
 142 * to users decreases. But the relationship is not linear,
 143 * so pick a second-best guess by going with the log2 of the
 144 * number of CPUs.
 145 *
 146 * This idea comes from the SD scheduler of Con Kolivas:
 147 */
 148static unsigned int get_update_sysctl_factor(void)
 149{
 150        unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
 151        unsigned int factor;
 152
 153        switch (sysctl_sched_tunable_scaling) {
 154        case SCHED_TUNABLESCALING_NONE:
 155                factor = 1;
 156                break;
 157        case SCHED_TUNABLESCALING_LINEAR:
 158                factor = cpus;
 159                break;
 160        case SCHED_TUNABLESCALING_LOG:
 161        default:
 162                factor = 1 + ilog2(cpus);
 163                break;
 164        }
 165
 166        return factor;
 167}
 168
 169static void update_sysctl(void)
 170{
 171        unsigned int factor = get_update_sysctl_factor();
 172
 173#define SET_SYSCTL(name) \
 174        (sysctl_##name = (factor) * normalized_sysctl_##name)
 175        SET_SYSCTL(sched_min_granularity);
 176        SET_SYSCTL(sched_latency);
 177        SET_SYSCTL(sched_wakeup_granularity);
 178#undef SET_SYSCTL
 179}
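
/*
 * Worked example of the scaling above (illustrative only, using the default
 * SCHED_TUNABLESCALING_LOG policy): with 8 or more online CPUs (cpus is
 * clamped to 8), factor = 1 + ilog2(8) = 4, so update_sysctl() yields:
 *
 *   sched_min_granularity    = 4 *  750000 ns =  3 ms
 *   sched_latency            = 4 * 6000000 ns = 24 ms
 *   sched_wakeup_granularity = 4 * 1000000 ns =  4 ms
 *
 * A 2-CPU system would use factor = 1 + ilog2(2) = 2, i.e. 12 ms of
 * targeted latency.
 */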
 180
 181void sched_init_granularity(void)
 182{
 183        update_sysctl();
 184}
 185
 186#define WMULT_CONST     (~0U)
 187#define WMULT_SHIFT     32
 188
 189static void __update_inv_weight(struct load_weight *lw)
 190{
 191        unsigned long w;
 192
 193        if (likely(lw->inv_weight))
 194                return;
 195
 196        w = scale_load_down(lw->weight);
 197
 198        if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
 199                lw->inv_weight = 1;
 200        else if (unlikely(!w))
 201                lw->inv_weight = WMULT_CONST;
 202        else
 203                lw->inv_weight = WMULT_CONST / w;
 204}
 205
 206/*
 207 * delta_exec * weight / lw.weight
 208 *   OR
 209 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 210 *
 211 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
 212 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 213 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 214 *
  215 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
 216 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 217 */
 218static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 219{
 220        u64 fact = scale_load_down(weight);
 221        int shift = WMULT_SHIFT;
 222
 223        __update_inv_weight(lw);
 224
 225        if (unlikely(fact >> 32)) {
 226                while (fact >> 32) {
 227                        fact >>= 1;
 228                        shift--;
 229                }
 230        }
 231
 232        /* hint to use a 32x32->64 mul */
 233        fact = (u64)(u32)fact * lw->inv_weight;
 234
 235        while (fact >> 32) {
 236                fact >>= 1;
 237                shift--;
 238        }
 239
 240        return mul_u64_u32_shr(delta_exec, fact, shift);
 241}
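
/*
 * Illustrative example of the fixed-point math above (ignoring the extra
 * load-resolution scaling used on 64-bit): for delta_exec = 6000000 ns,
 * weight = 1024 (a nice-0 task) and lw->weight = 3072 (three nice-0 tasks):
 *
 *   inv_weight = 0xffffffff / 3072      = 1398101
 *   fact       = 1024 * 1398101         = 1431655424  (fits in 32 bits)
 *   result     = (6000000 * fact) >> 32 = 1999999 ns, i.e. ~2 ms
 *
 * which is roughly delta_exec * weight / lw->weight = 6 ms * 1024/3072.
 */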
 242
 243
 244const struct sched_class fair_sched_class;
 245
 246/**************************************************************
 247 * CFS operations on generic schedulable entities:
 248 */
 249
 250#ifdef CONFIG_FAIR_GROUP_SCHED
 251static inline struct task_struct *task_of(struct sched_entity *se)
 252{
 253        SCHED_WARN_ON(!entity_is_task(se));
 254        return container_of(se, struct task_struct, se);
 255}
 256
 257/* Walk up scheduling entities hierarchy */
 258#define for_each_sched_entity(se) \
 259                for (; se; se = se->parent)
 260
 261static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 262{
 263        return p->se.cfs_rq;
 264}
 265
 266/* runqueue on which this entity is (to be) queued */
 267static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 268{
 269        return se->cfs_rq;
 270}
 271
 272/* runqueue "owned" by this group */
 273static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 274{
 275        return grp->my_q;
 276}
 277
 278static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
 279{
 280        if (!path)
 281                return;
 282
 283        if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
 284                autogroup_path(cfs_rq->tg, path, len);
 285        else if (cfs_rq && cfs_rq->tg->css.cgroup)
 286                cgroup_path(cfs_rq->tg->css.cgroup, path, len);
 287        else
 288                strlcpy(path, "(null)", len);
 289}
 290
 291static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 292{
 293        struct rq *rq = rq_of(cfs_rq);
 294        int cpu = cpu_of(rq);
 295
 296        if (cfs_rq->on_list)
 297                return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
 298
 299        cfs_rq->on_list = 1;
 300
 301        /*
 302         * Ensure we either appear before our parent (if already
 303         * enqueued) or force our parent to appear after us when it is
 304         * enqueued. The fact that we always enqueue bottom-up
 305         * reduces this to two cases and a special case for the root
 306         * cfs_rq. Furthermore, it also means that we will always reset
 307         * tmp_alone_branch either when the branch is connected
 308         * to a tree or when we reach the top of the tree
 309         */
 310        if (cfs_rq->tg->parent &&
 311            cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
 312                /*
 313                 * If parent is already on the list, we add the child
 314                 * just before. Thanks to circular linked property of
 315                 * the list, this means to put the child at the tail
 316                 * of the list that starts by parent.
 317                 */
 318                list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 319                        &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
 320                /*
 321                 * The branch is now connected to its tree so we can
 322                 * reset tmp_alone_branch to the beginning of the
 323                 * list.
 324                 */
 325                rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 326                return true;
 327        }
 328
 329        if (!cfs_rq->tg->parent) {
 330                /*
 331                 * cfs rq without parent should be put
 332                 * at the tail of the list.
 333                 */
 334                list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 335                        &rq->leaf_cfs_rq_list);
 336                /*
  337                 * We have reached the top of the tree so we can reset
 338                 * tmp_alone_branch to the beginning of the list.
 339                 */
 340                rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 341                return true;
 342        }
 343
 344        /*
  345         * The parent has not been added yet, so we want to make
  346         * sure that it will be put after us.
  347         * tmp_alone_branch points to the beginning of the branch
  348         * where we will add the parent.
 349         */
 350        list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
 351        /*
  352         * Update tmp_alone_branch to point to the new beginning
  353         * of the branch.
 354         */
 355        rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
 356        return false;
 357}
 358
 359static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 360{
 361        if (cfs_rq->on_list) {
 362                struct rq *rq = rq_of(cfs_rq);
 363
 364                /*
 365                 * With cfs_rq being unthrottled/throttled during an enqueue,
  366                 * it can happen that tmp_alone_branch points to the leaf that
  367                 * we finally want to delete. In this case, tmp_alone_branch moves
  368                 * to the prev element, but it will point to rq->leaf_cfs_rq_list
 369                 * at the end of the enqueue.
 370                 */
 371                if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
 372                        rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
 373
 374                list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 375                cfs_rq->on_list = 0;
 376        }
 377}
 378
 379static inline void assert_list_leaf_cfs_rq(struct rq *rq)
 380{
 381        SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
 382}
 383
  384/* Iterate through all leaf cfs_rq's on a runqueue */
 385#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)                      \
 386        list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,    \
 387                                 leaf_cfs_rq_list)
 388
  389/* Do the two (enqueued) entities belong to the same group? */
 390static inline struct cfs_rq *
 391is_same_group(struct sched_entity *se, struct sched_entity *pse)
 392{
 393        if (se->cfs_rq == pse->cfs_rq)
 394                return se->cfs_rq;
 395
 396        return NULL;
 397}
 398
 399static inline struct sched_entity *parent_entity(struct sched_entity *se)
 400{
 401        return se->parent;
 402}
 403
 404static void
 405find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 406{
 407        int se_depth, pse_depth;
 408
 409        /*
  410         * A preemption test can only be made between sibling entities that are
  411         * in the same cfs_rq, i.e. that have a common parent. Walk up the
  412         * hierarchy of both tasks until we find ancestors that are siblings
  413         * with a common parent.
 414         */
 415
 416        /* First walk up until both entities are at same depth */
 417        se_depth = (*se)->depth;
 418        pse_depth = (*pse)->depth;
 419
 420        while (se_depth > pse_depth) {
 421                se_depth--;
 422                *se = parent_entity(*se);
 423        }
 424
 425        while (pse_depth > se_depth) {
 426                pse_depth--;
 427                *pse = parent_entity(*pse);
 428        }
 429
 430        while (!is_same_group(*se, *pse)) {
 431                *se = parent_entity(*se);
 432                *pse = parent_entity(*pse);
 433        }
 434}
 435
 436#else   /* !CONFIG_FAIR_GROUP_SCHED */
 437
 438static inline struct task_struct *task_of(struct sched_entity *se)
 439{
 440        return container_of(se, struct task_struct, se);
 441}
 442
 443#define for_each_sched_entity(se) \
 444                for (; se; se = NULL)
 445
 446static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 447{
 448        return &task_rq(p)->cfs;
 449}
 450
 451static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 452{
 453        struct task_struct *p = task_of(se);
 454        struct rq *rq = task_rq(p);
 455
 456        return &rq->cfs;
 457}
 458
 459/* runqueue "owned" by this group */
 460static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 461{
 462        return NULL;
 463}
 464
 465static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
 466{
 467        if (path)
 468                strlcpy(path, "(null)", len);
 469}
 470
 471static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 472{
 473        return true;
 474}
 475
 476static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 477{
 478}
 479
 480static inline void assert_list_leaf_cfs_rq(struct rq *rq)
 481{
 482}
 483
 484#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)      \
 485                for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 486
 487static inline struct sched_entity *parent_entity(struct sched_entity *se)
 488{
 489        return NULL;
 490}
 491
 492static inline void
 493find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 494{
 495}
 496
 497#endif  /* CONFIG_FAIR_GROUP_SCHED */
 498
 499static __always_inline
 500void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 501
 502/**************************************************************
 503 * Scheduling class tree data structure manipulation methods:
 504 */
 505
 506static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
 507{
 508        s64 delta = (s64)(vruntime - max_vruntime);
 509        if (delta > 0)
 510                max_vruntime = vruntime;
 511
 512        return max_vruntime;
 513}
 514
 515static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 516{
 517        s64 delta = (s64)(vruntime - min_vruntime);
 518        if (delta < 0)
 519                min_vruntime = vruntime;
 520
 521        return min_vruntime;
 522}
 523
 524static inline int entity_before(struct sched_entity *a,
 525                                struct sched_entity *b)
 526{
 527        return (s64)(a->vruntime - b->vruntime) < 0;
 528}
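
/*
 * Note on the (s64) casts above (illustrative): comparing via a signed delta
 * keeps the ordering correct even if the u64 vruntime eventually wraps.
 * E.g. if a->vruntime == (u64)-100 and b->vruntime == 50 (b has wrapped past
 * zero), then a->vruntime - b->vruntime == (u64)-150, which as an s64 is
 * -150 < 0, so entity_before(a, b) still reports a as running before b even
 * though a > b as an unsigned value.
 */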
 529
 530static void update_min_vruntime(struct cfs_rq *cfs_rq)
 531{
 532        struct sched_entity *curr = cfs_rq->curr;
 533        struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
 534
 535        u64 vruntime = cfs_rq->min_vruntime;
 536
 537        if (curr) {
 538                if (curr->on_rq)
 539                        vruntime = curr->vruntime;
 540                else
 541                        curr = NULL;
 542        }
 543
 544        if (leftmost) { /* non-empty tree */
 545                struct sched_entity *se;
 546                se = rb_entry(leftmost, struct sched_entity, run_node);
 547
 548                if (!curr)
 549                        vruntime = se->vruntime;
 550                else
 551                        vruntime = min_vruntime(vruntime, se->vruntime);
 552        }
 553
 554        /* ensure we never gain time by being placed backwards. */
 555        cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
 556#ifndef CONFIG_64BIT
 557        smp_wmb();
 558        cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 559#endif
 560}
 561
 562/*
 563 * Enqueue an entity into the rb-tree:
 564 */
 565static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 566{
 567        struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
 568        struct rb_node *parent = NULL;
 569        struct sched_entity *entry;
 570        bool leftmost = true;
 571
 572        /*
 573         * Find the right place in the rbtree:
 574         */
 575        while (*link) {
 576                parent = *link;
 577                entry = rb_entry(parent, struct sched_entity, run_node);
 578                /*
  579                 * We don't care about collisions. Nodes with
 580                 * the same key stay together.
 581                 */
 582                if (entity_before(se, entry)) {
 583                        link = &parent->rb_left;
 584                } else {
 585                        link = &parent->rb_right;
 586                        leftmost = false;
 587                }
 588        }
 589
 590        rb_link_node(&se->run_node, parent, link);
 591        rb_insert_color_cached(&se->run_node,
 592                               &cfs_rq->tasks_timeline, leftmost);
 593}
 594
 595static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 596{
 597        rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
 598}
 599
 600struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 601{
 602        struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
 603
 604        if (!left)
 605                return NULL;
 606
 607        return rb_entry(left, struct sched_entity, run_node);
 608}
 609
 610static struct sched_entity *__pick_next_entity(struct sched_entity *se)
 611{
 612        struct rb_node *next = rb_next(&se->run_node);
 613
 614        if (!next)
 615                return NULL;
 616
 617        return rb_entry(next, struct sched_entity, run_node);
 618}
 619
 620#ifdef CONFIG_SCHED_DEBUG
 621struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 622{
 623        struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
 624
 625        if (!last)
 626                return NULL;
 627
 628        return rb_entry(last, struct sched_entity, run_node);
 629}
 630
 631/**************************************************************
 632 * Scheduling class statistics methods:
 633 */
 634
 635int sched_proc_update_handler(struct ctl_table *table, int write,
 636                void __user *buffer, size_t *lenp,
 637                loff_t *ppos)
 638{
 639        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 640        unsigned int factor = get_update_sysctl_factor();
 641
 642        if (ret || !write)
 643                return ret;
 644
 645        sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 646                                        sysctl_sched_min_granularity);
 647
 648#define WRT_SYSCTL(name) \
 649        (normalized_sysctl_##name = sysctl_##name / (factor))
 650        WRT_SYSCTL(sched_min_granularity);
 651        WRT_SYSCTL(sched_latency);
 652        WRT_SYSCTL(sched_wakeup_granularity);
 653#undef WRT_SYSCTL
 654
 655        return 0;
 656}
 657#endif
 658
 659/*
 660 * delta /= w
 661 */
 662static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 663{
 664        if (unlikely(se->load.weight != NICE_0_LOAD))
 665                delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 666
 667        return delta;
 668}
 669
 670/*
 671 * The idea is to set a period in which each task runs once.
 672 *
  673 * When there are more than sched_nr_latency tasks we have to stretch
 674 * this period because otherwise the slices get too small.
 675 *
 676 * p = (nr <= nl) ? l : l*nr/nl
 677 */
 678static u64 __sched_period(unsigned long nr_running)
 679{
 680        if (unlikely(nr_running > sched_nr_latency))
 681                return nr_running * sysctl_sched_min_granularity;
 682        else
 683                return sysctl_sched_latency;
 684}
 685
 686/*
 687 * We calculate the wall-time slice from the period by taking a part
 688 * proportional to the weight.
 689 *
 690 * s = p*P[w/rw]
 691 */
 692static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 693{
 694        u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
 695
 696        for_each_sched_entity(se) {
 697                struct load_weight *load;
 698                struct load_weight lw;
 699
 700                cfs_rq = cfs_rq_of(se);
 701                load = &cfs_rq->load;
 702
 703                if (unlikely(!se->on_rq)) {
 704                        lw = cfs_rq->load;
 705
 706                        update_load_add(&lw, se->load.weight);
 707                        load = &lw;
 708                }
 709                slice = __calc_delta(slice, se->load.weight, load);
 710        }
 711        return slice;
 712}
 713
 714/*
 715 * We calculate the vruntime slice of a to-be-inserted task.
 716 *
 717 * vs = s/w
 718 */
 719static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 720{
 721        return calc_delta_fair(sched_slice(cfs_rq, se), se);
 722}
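
/*
 * Rough worked example for __sched_period()/sched_slice() (illustrative,
 * assuming the unscaled defaults, a single flat cfs_rq and nice-0 tasks of
 * weight 1024 each):
 *
 *   3 runnable tasks:  period = 6 ms,                slice = 6 ms * 1024/3072  = 2 ms
 *  12 runnable tasks:  period = 12 * 0.75 ms = 9 ms, slice = 9 ms * 1024/12288 = 0.75 ms
 *
 * For a nice-0 task sched_vslice() equals sched_slice(), since
 * calc_delta_fair() leaves NICE_0_LOAD-weighted deltas unchanged.
 */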
 723
 724#include "pelt.h"
 725#ifdef CONFIG_SMP
 726
 727static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 728static unsigned long task_h_load(struct task_struct *p);
 729static unsigned long capacity_of(int cpu);
 730
  731/* Give a new sched_entity initial runnable values so its load looks heavy until it stabilizes */
 732void init_entity_runnable_average(struct sched_entity *se)
 733{
 734        struct sched_avg *sa = &se->avg;
 735
 736        memset(sa, 0, sizeof(*sa));
 737
 738        /*
 739         * Tasks are initialized with full load to be seen as heavy tasks until
 740         * they get a chance to stabilize to their real load level.
 741         * Group entities are initialized with zero load to reflect the fact that
 742         * nothing has been attached to the task group yet.
 743         */
 744        if (entity_is_task(se))
 745                sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
 746
 747        se->runnable_weight = se->load.weight;
 748
  749        /* When this task is enqueued, it will contribute to its cfs_rq's load_avg */
 750}
 751
 752static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 753static void attach_entity_cfs_rq(struct sched_entity *se);
 754
 755/*
 756 * With new tasks being created, their initial util_avgs are extrapolated
 757 * based on the cfs_rq's current util_avg:
 758 *
 759 *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
 760 *
 761 * However, in many cases, the above util_avg does not give a desired
 762 * value. Moreover, the sum of the util_avgs may be divergent, such
 763 * as when the series is a harmonic series.
 764 *
 765 * To solve this problem, we also cap the util_avg of successive tasks to
  766 * only 1/2 of the remaining utilization budget:
 767 *
 768 *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
 769 *
 770 * where n denotes the nth task and cpu_scale the CPU capacity.
 771 *
  772 * For example, for a CPU with a capacity of 1024, the simplest series from
  773 * the beginning would look like:
 774 *
 775 *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
 776 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
 777 *
 778 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
 779 * if util_avg > util_avg_cap.
 780 */
 781void post_init_entity_util_avg(struct task_struct *p)
 782{
 783        struct sched_entity *se = &p->se;
 784        struct cfs_rq *cfs_rq = cfs_rq_of(se);
 785        struct sched_avg *sa = &se->avg;
 786        long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
 787        long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
 788
 789        if (cap > 0) {
 790                if (cfs_rq->avg.util_avg != 0) {
 791                        sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
 792                        sa->util_avg /= (cfs_rq->avg.load_avg + 1);
 793
 794                        if (sa->util_avg > cap)
 795                                sa->util_avg = cap;
 796                } else {
 797                        sa->util_avg = cap;
 798                }
 799        }
 800
 801        if (p->sched_class != &fair_sched_class) {
 802                /*
 803                 * For !fair tasks do:
 804                 *
 805                update_cfs_rq_load_avg(now, cfs_rq);
 806                attach_entity_load_avg(cfs_rq, se, 0);
 807                switched_from_fair(rq, p);
 808                 *
 809                 * such that the next switched_to_fair() has the
 810                 * expected state.
 811                 */
 812                se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
 813                return;
 814        }
 815
 816        attach_entity_cfs_rq(se);
 817}
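
/*
 * Illustrative example of the capping above (made-up numbers, ignoring the
 * 64-bit load-resolution scaling): on a CPU with cpu_scale = 1024 whose
 * cfs_rq already has util_avg = 600 and load_avg = 2048, a new nice-0 task
 * (weight 1024) would be extrapolated to
 *
 *   util_avg = 600 * 1024 / (2048 + 1) ~= 299
 *
 * but cap = (1024 - 600) / 2 = 212, so the task starts with util_avg = 212.
 */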
 818
 819#else /* !CONFIG_SMP */
 820void init_entity_runnable_average(struct sched_entity *se)
 821{
 822}
 823void post_init_entity_util_avg(struct task_struct *p)
 824{
 825}
 826static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 827{
 828}
 829#endif /* CONFIG_SMP */
 830
 831/*
 832 * Update the current task's runtime statistics.
 833 */
 834static void update_curr(struct cfs_rq *cfs_rq)
 835{
 836        struct sched_entity *curr = cfs_rq->curr;
 837        u64 now = rq_clock_task(rq_of(cfs_rq));
 838        u64 delta_exec;
 839
 840        if (unlikely(!curr))
 841                return;
 842
 843        delta_exec = now - curr->exec_start;
 844        if (unlikely((s64)delta_exec <= 0))
 845                return;
 846
 847        curr->exec_start = now;
 848
 849        schedstat_set(curr->statistics.exec_max,
 850                      max(delta_exec, curr->statistics.exec_max));
 851
 852        curr->sum_exec_runtime += delta_exec;
 853        schedstat_add(cfs_rq->exec_clock, delta_exec);
 854
 855        curr->vruntime += calc_delta_fair(delta_exec, curr);
 856        update_min_vruntime(cfs_rq);
 857
 858        if (entity_is_task(curr)) {
 859                struct task_struct *curtask = task_of(curr);
 860
 861                trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
 862                cgroup_account_cputime(curtask, delta_exec);
 863                account_group_exec_runtime(curtask, delta_exec);
 864        }
 865
 866        account_cfs_rq_runtime(cfs_rq, delta_exec);
 867}
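
/*
 * Illustrative example of the vruntime accounting above: a nice-0 task has
 * load.weight == NICE_0_LOAD, so calc_delta_fair() is a no-op and vruntime
 * advances at wall-clock rate. With the sched_prio_to_weight[] values,
 * 1 ms of runtime advances vruntime by roughly:
 *
 *   nice -5 (weight 3121):  1 ms * 1024/3121 ~= 0.33 ms
 *   nice +5 (weight  335):  1 ms * 1024/335  ~= 3.06 ms
 *
 * which is how heavier tasks earn proportionally more CPU time.
 */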
 868
 869static void update_curr_fair(struct rq *rq)
 870{
 871        update_curr(cfs_rq_of(&rq->curr->se));
 872}
 873
 874static inline void
 875update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 876{
 877        u64 wait_start, prev_wait_start;
 878
 879        if (!schedstat_enabled())
 880                return;
 881
 882        wait_start = rq_clock(rq_of(cfs_rq));
 883        prev_wait_start = schedstat_val(se->statistics.wait_start);
 884
 885        if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
 886            likely(wait_start > prev_wait_start))
 887                wait_start -= prev_wait_start;
 888
 889        __schedstat_set(se->statistics.wait_start, wait_start);
 890}
 891
 892static inline void
 893update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 894{
 895        struct task_struct *p;
 896        u64 delta;
 897
 898        if (!schedstat_enabled())
 899                return;
 900
 901        delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
 902
 903        if (entity_is_task(se)) {
 904                p = task_of(se);
 905                if (task_on_rq_migrating(p)) {
 906                        /*
 907                         * Preserve migrating task's wait time so wait_start
 908                         * time stamp can be adjusted to accumulate wait time
 909                         * prior to migration.
 910                         */
 911                        __schedstat_set(se->statistics.wait_start, delta);
 912                        return;
 913                }
 914                trace_sched_stat_wait(p, delta);
 915        }
 916
 917        __schedstat_set(se->statistics.wait_max,
 918                      max(schedstat_val(se->statistics.wait_max), delta));
 919        __schedstat_inc(se->statistics.wait_count);
 920        __schedstat_add(se->statistics.wait_sum, delta);
 921        __schedstat_set(se->statistics.wait_start, 0);
 922}
 923
 924static inline void
 925update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 926{
 927        struct task_struct *tsk = NULL;
 928        u64 sleep_start, block_start;
 929
 930        if (!schedstat_enabled())
 931                return;
 932
 933        sleep_start = schedstat_val(se->statistics.sleep_start);
 934        block_start = schedstat_val(se->statistics.block_start);
 935
 936        if (entity_is_task(se))
 937                tsk = task_of(se);
 938
 939        if (sleep_start) {
 940                u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
 941
 942                if ((s64)delta < 0)
 943                        delta = 0;
 944
 945                if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
 946                        __schedstat_set(se->statistics.sleep_max, delta);
 947
 948                __schedstat_set(se->statistics.sleep_start, 0);
 949                __schedstat_add(se->statistics.sum_sleep_runtime, delta);
 950
 951                if (tsk) {
 952                        account_scheduler_latency(tsk, delta >> 10, 1);
 953                        trace_sched_stat_sleep(tsk, delta);
 954                }
 955        }
 956        if (block_start) {
 957                u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
 958
 959                if ((s64)delta < 0)
 960                        delta = 0;
 961
 962                if (unlikely(delta > schedstat_val(se->statistics.block_max)))
 963                        __schedstat_set(se->statistics.block_max, delta);
 964
 965                __schedstat_set(se->statistics.block_start, 0);
 966                __schedstat_add(se->statistics.sum_sleep_runtime, delta);
 967
 968                if (tsk) {
 969                        if (tsk->in_iowait) {
 970                                __schedstat_add(se->statistics.iowait_sum, delta);
 971                                __schedstat_inc(se->statistics.iowait_count);
 972                                trace_sched_stat_iowait(tsk, delta);
 973                        }
 974
 975                        trace_sched_stat_blocked(tsk, delta);
 976
 977                        /*
 978                         * Blocking time is in units of nanosecs, so shift by
 979                         * 20 to get a milliseconds-range estimation of the
 980                         * amount of time that the task spent sleeping:
 981                         */
 982                        if (unlikely(prof_on == SLEEP_PROFILING)) {
 983                                profile_hits(SLEEP_PROFILING,
 984                                                (void *)get_wchan(tsk),
 985                                                delta >> 20);
 986                        }
 987                        account_scheduler_latency(tsk, delta >> 10, 0);
 988                }
 989        }
 990}
 991
 992/*
 993 * Task is being enqueued - update stats:
 994 */
 995static inline void
 996update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 997{
 998        if (!schedstat_enabled())
 999                return;
1000
1001        /*
1002         * Are we enqueueing a waiting task? (for current tasks
1003         * a dequeue/enqueue event is a NOP)
1004         */
1005        if (se != cfs_rq->curr)
1006                update_stats_wait_start(cfs_rq, se);
1007
1008        if (flags & ENQUEUE_WAKEUP)
1009                update_stats_enqueue_sleeper(cfs_rq, se);
1010}
1011
1012static inline void
1013update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1014{
1015
1016        if (!schedstat_enabled())
1017                return;
1018
1019        /*
1020         * Mark the end of the wait period if dequeueing a
1021         * waiting task:
1022         */
1023        if (se != cfs_rq->curr)
1024                update_stats_wait_end(cfs_rq, se);
1025
1026        if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1027                struct task_struct *tsk = task_of(se);
1028
1029                if (tsk->state & TASK_INTERRUPTIBLE)
1030                        __schedstat_set(se->statistics.sleep_start,
1031                                      rq_clock(rq_of(cfs_rq)));
1032                if (tsk->state & TASK_UNINTERRUPTIBLE)
1033                        __schedstat_set(se->statistics.block_start,
1034                                      rq_clock(rq_of(cfs_rq)));
1035        }
1036}
1037
1038/*
1039 * We are picking a new current task - update its stats:
1040 */
1041static inline void
1042update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1043{
1044        /*
1045         * We are starting a new run period:
1046         */
1047        se->exec_start = rq_clock_task(rq_of(cfs_rq));
1048}
1049
1050/**************************************************
1051 * Scheduling class queueing methods:
1052 */
1053
1054#ifdef CONFIG_NUMA_BALANCING
1055/*
 1056 * Approximate time to scan a task's full address space, in ms. The task
 1057 * scan period is calculated based on the task's virtual memory size and
1058 * numa_balancing_scan_size.
1059 */
1060unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1061unsigned int sysctl_numa_balancing_scan_period_max = 60000;
1062
1063/* Portion of address space to scan in MB */
1064unsigned int sysctl_numa_balancing_scan_size = 256;
1065
1066/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1067unsigned int sysctl_numa_balancing_scan_delay = 1000;
1068
1069struct numa_group {
1070        refcount_t refcount;
1071
1072        spinlock_t lock; /* nr_tasks, tasks */
1073        int nr_tasks;
1074        pid_t gid;
1075        int active_nodes;
1076
1077        struct rcu_head rcu;
1078        unsigned long total_faults;
1079        unsigned long max_faults_cpu;
1080        /*
1081         * Faults_cpu is used to decide whether memory should move
1082         * towards the CPU. As a consequence, these stats are weighted
1083         * more by CPU use than by memory faults.
1084         */
1085        unsigned long *faults_cpu;
1086        unsigned long faults[0];
1087};
1088
1089/*
1090 * For functions that can be called in multiple contexts that permit reading
1091 * ->numa_group (see struct task_struct for locking rules).
1092 */
1093static struct numa_group *deref_task_numa_group(struct task_struct *p)
1094{
1095        return rcu_dereference_check(p->numa_group, p == current ||
1096                (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
1097}
1098
1099static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1100{
1101        return rcu_dereference_protected(p->numa_group, p == current);
1102}
1103
1104static inline unsigned long group_faults_priv(struct numa_group *ng);
1105static inline unsigned long group_faults_shared(struct numa_group *ng);
1106
1107static unsigned int task_nr_scan_windows(struct task_struct *p)
1108{
1109        unsigned long rss = 0;
1110        unsigned long nr_scan_pages;
1111
1112        /*
1113         * Calculations based on RSS as non-present and empty pages are skipped
1114         * by the PTE scanner and NUMA hinting faults should be trapped based
1115         * on resident pages
1116         */
1117        nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1118        rss = get_mm_rss(p->mm);
1119        if (!rss)
1120                rss = nr_scan_pages;
1121
1122        rss = round_up(rss, nr_scan_pages);
1123        return rss / nr_scan_pages;
1124}
1125
 1126/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1127#define MAX_SCAN_WINDOW 2560
1128
1129static unsigned int task_scan_min(struct task_struct *p)
1130{
1131        unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1132        unsigned int scan, floor;
1133        unsigned int windows = 1;
1134
1135        if (scan_size < MAX_SCAN_WINDOW)
1136                windows = MAX_SCAN_WINDOW / scan_size;
1137        floor = 1000 / windows;
1138
1139        scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1140        return max_t(unsigned int, floor, scan);
1141}
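
/*
 * Illustrative example with the defaults (scan_size = 256 MB, 4 KiB pages):
 * windows = 2560/256 = 10, so floor = 1000/10 = 100 ms. A task with 1 GiB
 * of RSS has task_nr_scan_windows() = 4, so scan = 1000/4 = 250 ms and the
 * minimum scan period is max(100, 250) = 250 ms; a 16 GiB task would be
 * clamped to the 100 ms floor instead.
 */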
1142
1143static unsigned int task_scan_start(struct task_struct *p)
1144{
1145        unsigned long smin = task_scan_min(p);
1146        unsigned long period = smin;
1147        struct numa_group *ng;
1148
1149        /* Scale the maximum scan period with the amount of shared memory. */
1150        rcu_read_lock();
1151        ng = rcu_dereference(p->numa_group);
1152        if (ng) {
1153                unsigned long shared = group_faults_shared(ng);
1154                unsigned long private = group_faults_priv(ng);
1155
1156                period *= refcount_read(&ng->refcount);
1157                period *= shared + 1;
1158                period /= private + shared + 1;
1159        }
1160        rcu_read_unlock();
1161
1162        return max(smin, period);
1163}
1164
1165static unsigned int task_scan_max(struct task_struct *p)
1166{
1167        unsigned long smin = task_scan_min(p);
1168        unsigned long smax;
1169        struct numa_group *ng;
1170
1171        /* Watch for min being lower than max due to floor calculations */
1172        smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1173
1174        /* Scale the maximum scan period with the amount of shared memory. */
1175        ng = deref_curr_numa_group(p);
1176        if (ng) {
1177                unsigned long shared = group_faults_shared(ng);
1178                unsigned long private = group_faults_priv(ng);
1179                unsigned long period = smax;
1180
1181                period *= refcount_read(&ng->refcount);
1182                period *= shared + 1;
1183                period /= private + shared + 1;
1184
1185                smax = max(smax, period);
1186        }
1187
1188        return max(smin, smax);
1189}
1190
1191void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
1192{
1193        int mm_users = 0;
1194        struct mm_struct *mm = p->mm;
1195
1196        if (mm) {
1197                mm_users = atomic_read(&mm->mm_users);
1198                if (mm_users == 1) {
1199                        mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1200                        mm->numa_scan_seq = 0;
1201                }
1202        }
1203        p->node_stamp                   = 0;
1204        p->numa_scan_seq                = mm ? mm->numa_scan_seq : 0;
1205        p->numa_scan_period             = sysctl_numa_balancing_scan_delay;
1206        p->numa_work.next               = &p->numa_work;
1207        p->numa_faults                  = NULL;
1208        RCU_INIT_POINTER(p->numa_group, NULL);
1209        p->last_task_numa_placement     = 0;
1210        p->last_sum_exec_runtime        = 0;
1211
1212        /* New address space, reset the preferred nid */
1213        if (!(clone_flags & CLONE_VM)) {
1214                p->numa_preferred_nid = NUMA_NO_NODE;
1215                return;
1216        }
1217
1218        /*
 1219         * New thread: keep the existing numa_preferred_nid, which should already
 1220         * have been copied by arch_dup_task_struct(), but stagger when scans start.
1221         */
1222        if (mm) {
1223                unsigned int delay;
1224
1225                delay = min_t(unsigned int, task_scan_max(current),
1226                        current->numa_scan_period * mm_users * NSEC_PER_MSEC);
1227                delay += 2 * TICK_NSEC;
1228                p->node_stamp = delay;
1229        }
1230}
1231
1232static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1233{
1234        rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1235        rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1236}
1237
1238static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1239{
1240        rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1241        rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1242}
1243
1244/* Shared or private faults. */
1245#define NR_NUMA_HINT_FAULT_TYPES 2
1246
1247/* Memory and CPU locality */
1248#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1249
1250/* Averaged statistics, and temporary buffers. */
1251#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1252
1253pid_t task_numa_group_id(struct task_struct *p)
1254{
1255        struct numa_group *ng;
1256        pid_t gid = 0;
1257
1258        rcu_read_lock();
1259        ng = rcu_dereference(p->numa_group);
1260        if (ng)
1261                gid = ng->gid;
1262        rcu_read_unlock();
1263
1264        return gid;
1265}
1266
1267/*
1268 * The averaged statistics, shared & private, memory & CPU,
1269 * occupy the first half of the array. The second half of the
1270 * array is for current counters, which are averaged into the
1271 * first set by task_numa_placement.
1272 */
1273static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1274{
1275        return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1276}
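
/*
 * Illustrative layout (assuming nr_node_ids == 2): with
 * NR_NUMA_HINT_FAULT_TYPES == 2 each per-stat block is
 * [node0 shared, node0 private, node1 shared, node1 private], and the blocks
 * for the numa_faults_stats values follow each other, so e.g.:
 *
 *   task_faults_idx(NUMA_MEM, 1, 0) == 2
 *   task_faults_idx(NUMA_MEM, 1, 1) == 3
 *   task_faults_idx(NUMA_CPU, 0, 0) == 4
 *
 * and task_faults(p, 1) sums entries 2 and 3 of p->numa_faults[].
 */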
1277
1278static inline unsigned long task_faults(struct task_struct *p, int nid)
1279{
1280        if (!p->numa_faults)
1281                return 0;
1282
1283        return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1284                p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1285}
1286
1287static inline unsigned long group_faults(struct task_struct *p, int nid)
1288{
1289        struct numa_group *ng = deref_task_numa_group(p);
1290
1291        if (!ng)
1292                return 0;
1293
1294        return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1295                ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1296}
1297
1298static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1299{
1300        return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1301                group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1302}
1303
1304static inline unsigned long group_faults_priv(struct numa_group *ng)
1305{
1306        unsigned long faults = 0;
1307        int node;
1308
1309        for_each_online_node(node) {
1310                faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1311        }
1312
1313        return faults;
1314}
1315
1316static inline unsigned long group_faults_shared(struct numa_group *ng)
1317{
1318        unsigned long faults = 0;
1319        int node;
1320
1321        for_each_online_node(node) {
1322                faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1323        }
1324
1325        return faults;
1326}
1327
1328/*
1329 * A node triggering more than 1/3 as many NUMA faults as the maximum is
1330 * considered part of a numa group's pseudo-interleaving set. Migrations
1331 * between these nodes are slowed down, to allow things to settle down.
1332 */
1333#define ACTIVE_NODE_FRACTION 3
1334
1335static bool numa_is_active_node(int nid, struct numa_group *ng)
1336{
1337        return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1338}
1339
1340/* Handle placement on systems where not all nodes are directly connected. */
1341static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1342                                        int maxdist, bool task)
1343{
1344        unsigned long score = 0;
1345        int node;
1346
1347        /*
1348         * All nodes are directly connected, and the same distance
1349         * from each other. No need for fancy placement algorithms.
1350         */
1351        if (sched_numa_topology_type == NUMA_DIRECT)
1352                return 0;
1353
1354        /*
1355         * This code is called for each node, introducing N^2 complexity,
1356         * which should be ok given the number of nodes rarely exceeds 8.
1357         */
1358        for_each_online_node(node) {
1359                unsigned long faults;
1360                int dist = node_distance(nid, node);
1361
1362                /*
1363                 * The furthest away nodes in the system are not interesting
1364                 * for placement; nid was already counted.
1365                 */
1366                if (dist == sched_max_numa_distance || node == nid)
1367                        continue;
1368
1369                /*
1370                 * On systems with a backplane NUMA topology, compare groups
1371                 * of nodes, and move tasks towards the group with the most
1372                 * memory accesses. When comparing two nodes at distance
1373                 * "hoplimit", only nodes closer by than "hoplimit" are part
1374                 * of each group. Skip other nodes.
1375                 */
1376                if (sched_numa_topology_type == NUMA_BACKPLANE &&
1377                                        dist >= maxdist)
1378                        continue;
1379
1380                /* Add up the faults from nearby nodes. */
1381                if (task)
1382                        faults = task_faults(p, node);
1383                else
1384                        faults = group_faults(p, node);
1385
1386                /*
1387                 * On systems with a glueless mesh NUMA topology, there are
1388                 * no fixed "groups of nodes". Instead, nodes that are not
1389                 * directly connected bounce traffic through intermediate
1390                 * nodes; a numa_group can occupy any set of nodes.
1391                 * The further away a node is, the less the faults count.
1392                 * This seems to result in good task placement.
1393                 */
1394                if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1395                        faults *= (sched_max_numa_distance - dist);
1396                        faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1397                }
1398
1399                score += faults;
1400        }
1401
1402        return score;
1403}
1404
1405/*
1406 * These return the fraction of accesses done by a particular task, or
1407 * task group, on a particular numa node.  The group weight is given a
1408 * larger multiplier, in order to group tasks together that are almost
1409 * evenly spread out between numa nodes.
1410 */
1411static inline unsigned long task_weight(struct task_struct *p, int nid,
1412                                        int dist)
1413{
1414        unsigned long faults, total_faults;
1415
1416        if (!p->numa_faults)
1417                return 0;
1418
1419        total_faults = p->total_numa_faults;
1420
1421        if (!total_faults)
1422                return 0;
1423
1424        faults = task_faults(p, nid);
1425        faults += score_nearby_nodes(p, nid, dist, true);
1426
1427        return 1000 * faults / total_faults;
1428}
1429
1430static inline unsigned long group_weight(struct task_struct *p, int nid,
1431                                         int dist)
1432{
1433        struct numa_group *ng = deref_task_numa_group(p);
1434        unsigned long faults, total_faults;
1435
1436        if (!ng)
1437                return 0;
1438
1439        total_faults = ng->total_faults;
1440
1441        if (!total_faults)
1442                return 0;
1443
1444        faults = group_faults(p, nid);
1445        faults += score_nearby_nodes(p, nid, dist, false);
1446
1447        return 1000 * faults / total_faults;
1448}
1449
1450bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1451                                int src_nid, int dst_cpu)
1452{
1453        struct numa_group *ng = deref_curr_numa_group(p);
1454        int dst_nid = cpu_to_node(dst_cpu);
1455        int last_cpupid, this_cpupid;
1456
1457        this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1458        last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1459
1460        /*
1461         * Allow first faults or private faults to migrate immediately early in
1462         * the lifetime of a task. The magic number 4 is based on waiting for
1463         * two full passes of the "multi-stage node selection" test that is
1464         * executed below.
1465         */
1466        if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1467            (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1468                return true;
1469
1470        /*
1471         * Multi-stage node selection is used in conjunction with a periodic
1472         * migration fault to build a temporal task<->page relation. By using
1473         * a two-stage filter we remove short/unlikely relations.
1474         *
1475         * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1476         * a task's usage of a particular page (n_p) per total usage of this
1477         * page (n_t) (in a given time-span) to a probability.
1478         *
1479         * Our periodic faults will sample this probability and getting the
1480         * same result twice in a row, given these samples are fully
 1481         * independent, is then given by P(p)^2, provided our sample period
1482         * is sufficiently short compared to the usage pattern.
1483         *
 1484         * This quadratic squishes small probabilities, making it less likely we
1485         * act on an unlikely task<->page relation.
1486         */
1487        if (!cpupid_pid_unset(last_cpupid) &&
1488                                cpupid_to_nid(last_cpupid) != dst_nid)
1489                return false;
1490
1491        /* Always allow migrate on private faults */
1492        if (cpupid_match_pid(p, last_cpupid))
1493                return true;
1494
1495        /* A shared fault, but p->numa_group has not been set up yet. */
1496        if (!ng)
1497                return true;
1498
1499        /*
1500         * Destination node is much more heavily used than the source
1501         * node? Allow migration.
1502         */
1503        if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1504                                        ACTIVE_NODE_FRACTION)
1505                return true;
1506
1507        /*
1508         * Distribute memory according to CPU & memory use on each node,
1509         * with 3/4 hysteresis to avoid unnecessary memory migrations:
1510         *
1511         * faults_cpu(dst)   3   faults_cpu(src)
1512         * --------------- * - > ---------------
1513         * faults_mem(dst)   4   faults_mem(src)
1514         */
1515        return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1516               group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1517}
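
/*
 * Illustrative numbers for the 3/4 rule above: if the group's CPU faults are
 * 400 on dst and 100 on src, while its memory faults are 50 on src and 60 on
 * dst, then 400 * 50 * 3 = 60000 > 100 * 60 * 4 = 24000 and the page is
 * migrated. With perfectly symmetric usage (equal faults on both nodes) the
 * comparison becomes 3x vs 4x of the same product, so the page stays put,
 * which is the hysteresis that avoids ping-ponging pages between nodes.
 */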
1518
1519static unsigned long cpu_runnable_load(struct rq *rq);
1520
1521/* Cached statistics for all CPUs within a node */
1522struct numa_stats {
1523        unsigned long load;
1524
1525        /* Total compute capacity of CPUs on a node */
1526        unsigned long compute_capacity;
1527};
1528
1529/*
1530 * XXX borrowed from update_sg_lb_stats
1531 */
1532static void update_numa_stats(struct numa_stats *ns, int nid)
1533{
1534        int cpu;
1535
1536        memset(ns, 0, sizeof(*ns));
1537        for_each_cpu(cpu, cpumask_of_node(nid)) {
1538                struct rq *rq = cpu_rq(cpu);
1539
1540                ns->load += cpu_runnable_load(rq);
1541                ns->compute_capacity += capacity_of(cpu);
1542        }
1543
1544}
1545
1546struct task_numa_env {
1547        struct task_struct *p;
1548
1549        int src_cpu, src_nid;
1550        int dst_cpu, dst_nid;
1551
1552        struct numa_stats src_stats, dst_stats;
1553
1554        int imbalance_pct;
1555        int dist;
1556
1557        struct task_struct *best_task;
1558        long best_imp;
1559        int best_cpu;
1560};
1561
1562static void task_numa_assign(struct task_numa_env *env,
1563                             struct task_struct *p, long imp)
1564{
1565        struct rq *rq = cpu_rq(env->dst_cpu);
1566
 1567        /* Bail out if the run-queue is part of an active NUMA balance. */
1568        if (xchg(&rq->numa_migrate_on, 1))
1569                return;
1570
1571        /*
1572         * Clear previous best_cpu/rq numa-migrate flag, since task now
1573         * found a better CPU to move/swap.
1574         */
1575        if (env->best_cpu != -1) {
1576                rq = cpu_rq(env->best_cpu);
1577                WRITE_ONCE(rq->numa_migrate_on, 0);
1578        }
1579
1580        if (env->best_task)
1581                put_task_struct(env->best_task);
1582        if (p)
1583                get_task_struct(p);
1584
1585        env->best_task = p;
1586        env->best_imp = imp;
1587        env->best_cpu = env->dst_cpu;
1588}
1589
1590static bool load_too_imbalanced(long src_load, long dst_load,
1591                                struct task_numa_env *env)
1592{
1593        long imb, old_imb;
1594        long orig_src_load, orig_dst_load;
1595        long src_capacity, dst_capacity;
1596
1597        /*
1598         * The load is corrected for the CPU capacity available on each node.
1599         *
1600         * src_load        dst_load
1601         * ------------ vs ---------
1602         * src_capacity    dst_capacity
1603         */
1604        src_capacity = env->src_stats.compute_capacity;
1605        dst_capacity = env->dst_stats.compute_capacity;
1606
1607        imb = abs(dst_load * src_capacity - src_load * dst_capacity);
1608
1609        orig_src_load = env->src_stats.load;
1610        orig_dst_load = env->dst_stats.load;
1611
1612        old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
1613
1614        /* Would this change make things worse? */
1615        return (imb > old_imb);
1616}
1617
1618/*
1619 * Maximum NUMA importance can be 1998 (2*999);
1620 * SMALLIMP @ 30 would be close to 1998/64.
1621 * Used to deter task migration.
1622 */
1623#define SMALLIMP        30
1624
1625/*
1626 * This checks if the overall compute and NUMA accesses of the system would
1627 * be improved if the source task was migrated to the target dst_cpu, taking
1628 * into account that it might be best if the task running on the dst_cpu is
1629 * exchanged with the source task.
1630 */
1631static void task_numa_compare(struct task_numa_env *env,
1632                              long taskimp, long groupimp, bool maymove)
1633{
1634        struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
1635        struct rq *dst_rq = cpu_rq(env->dst_cpu);
1636        long imp = p_ng ? groupimp : taskimp;
1637        struct task_struct *cur;
1638        long src_load, dst_load;
1639        int dist = env->dist;
1640        long moveimp = imp;
1641        long load;
1642
1643        if (READ_ONCE(dst_rq->numa_migrate_on))
1644                return;
1645
1646        rcu_read_lock();
1647        cur = task_rcu_dereference(&dst_rq->curr);
1648        if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1649                cur = NULL;
1650
1651        /*
1652         * Because we have preemption enabled we can get migrated around and
1653         * end up selecting ourselves (current == env->p) as a swap candidate.
1654         */
1655        if (cur == env->p)
1656                goto unlock;
1657
1658        if (!cur) {
1659                if (maymove && moveimp >= env->best_imp)
1660                        goto assign;
1661                else
1662                        goto unlock;
1663        }
1664
1665        /*
1666         * "imp" is the fault differential for the source task between the
1667         * source and destination node. Calculate the total differential for
1668         * the source task and potential destination task. The more negative
1669         * the value is, the more remote accesses that would be expected to
1670         * be incurred if the tasks were swapped.
1671         */
1672        /* Skip this swap candidate if it cannot be moved to the source CPU */
1673        if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1674                goto unlock;
1675
1676        /*
1677         * If dst and source tasks are in the same NUMA group, or not
1678         * in any group then look only at task weights.
1679         */
1680        cur_ng = rcu_dereference(cur->numa_group);
1681        if (cur_ng == p_ng) {
1682                imp = taskimp + task_weight(cur, env->src_nid, dist) -
1683                      task_weight(cur, env->dst_nid, dist);
1684                /*
1685                 * Add some hysteresis to prevent swapping the
1686                 * tasks within a group over tiny differences.
1687                 */
1688                if (cur_ng)
1689                        imp -= imp / 16;
1690        } else {
1691                /*
1692                 * Compare the group weights. If a task is all by itself
1693                 * (not part of a group), use the task weight instead.
1694                 */
1695                if (cur_ng && p_ng)
1696                        imp += group_weight(cur, env->src_nid, dist) -
1697                               group_weight(cur, env->dst_nid, dist);
1698                else
1699                        imp += task_weight(cur, env->src_nid, dist) -
1700                               task_weight(cur, env->dst_nid, dist);
1701        }
1702
1703        if (maymove && moveimp > imp && moveimp > env->best_imp) {
1704                imp = moveimp;
1705                cur = NULL;
1706                goto assign;
1707        }
1708
1709        /*
1710         * If the NUMA importance is less than SMALLIMP,
1711         * task migration might only result in ping pong
1712         * of tasks and also hurt performance due to cache
1713         * misses.
1714         */
1715        if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
1716                goto unlock;
1717
1718        /*
1719         * In the overloaded case, try and keep the load balanced.
1720         */
1721        load = task_h_load(env->p) - task_h_load(cur);
1722        if (!load)
1723                goto assign;
1724
1725        dst_load = env->dst_stats.load + load;
1726        src_load = env->src_stats.load - load;
1727
1728        if (load_too_imbalanced(src_load, dst_load, env))
1729                goto unlock;
1730
1731assign:
1732        /*
1733         * One idle CPU per node is evaluated for a task numa move.
1734         * Call select_idle_sibling to maybe find a better one.
1735         */
1736        if (!cur) {
1737                /*
1738                 * select_idle_sibling() uses a per-CPU cpumask that
1739                 * can be used from IRQ context.
1740                 */
1741                local_irq_disable();
1742                env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1743                                                   env->dst_cpu);
1744                local_irq_enable();
1745        }
1746
1747        task_numa_assign(env, cur, imp);
1748unlock:
1749        rcu_read_unlock();
1750}
1751
1752static void task_numa_find_cpu(struct task_numa_env *env,
1753                                long taskimp, long groupimp)
1754{
1755        long src_load, dst_load, load;
1756        bool maymove = false;
1757        int cpu;
1758
1759        load = task_h_load(env->p);
1760        dst_load = env->dst_stats.load + load;
1761        src_load = env->src_stats.load - load;
1762
1763        /*
1764         * If the improvement from just moving env->p alone is better
1765         * than swapping tasks around, check if a move is possible.
1766         */
1767        maymove = !load_too_imbalanced(src_load, dst_load, env);
1768
1769        for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1770                /* Skip this CPU if the source task cannot migrate */
1771                if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
1772                        continue;
1773
1774                env->dst_cpu = cpu;
1775                task_numa_compare(env, taskimp, groupimp, maymove);
1776        }
1777}
1778
1779static int task_numa_migrate(struct task_struct *p)
1780{
1781        struct task_numa_env env = {
1782                .p = p,
1783
1784                .src_cpu = task_cpu(p),
1785                .src_nid = task_node(p),
1786
1787                .imbalance_pct = 112,
1788
1789                .best_task = NULL,
1790                .best_imp = 0,
1791                .best_cpu = -1,
1792        };
1793        unsigned long taskweight, groupweight;
1794        struct sched_domain *sd;
1795        long taskimp, groupimp;
1796        struct numa_group *ng;
1797        struct rq *best_rq;
1798        int nid, ret, dist;
1799
1800        /*
1801         * Pick the lowest SD_NUMA domain, as that would have the smallest
1802         * imbalance and would be the first to start moving tasks about.
1803         *
1804         * And we want to avoid any moving of tasks about, as that would create
1805         * random movement of tasks -- countering the numa conditions we're trying
1806         * to satisfy here.
1807         */
1808        rcu_read_lock();
1809        sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1810        if (sd)
1811                env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1812        rcu_read_unlock();
1813
1814        /*
1815         * Cpusets can break the scheduler domain tree into smaller
1816         * balance domains, some of which do not cross NUMA boundaries.
1817         * Tasks that are "trapped" in such domains cannot be migrated
1818         * elsewhere, so there is no point in (re)trying.
1819         */
1820        if (unlikely(!sd)) {
1821                sched_setnuma(p, task_node(p));
1822                return -EINVAL;
1823        }
1824
1825        env.dst_nid = p->numa_preferred_nid;
1826        dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1827        taskweight = task_weight(p, env.src_nid, dist);
1828        groupweight = group_weight(p, env.src_nid, dist);
1829        update_numa_stats(&env.src_stats, env.src_nid);
1830        taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1831        groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1832        update_numa_stats(&env.dst_stats, env.dst_nid);
1833
1834        /* Try to find a spot on the preferred nid. */
1835        task_numa_find_cpu(&env, taskimp, groupimp);
1836
1837        /*
1838         * Look at other nodes in these cases:
1839         * - there is no space available on the preferred_nid
1840         * - the task is part of a numa_group that is interleaved across
1841         *   multiple NUMA nodes; in order to better consolidate the group,
1842         *   we need to check other locations.
1843         */
1844        ng = deref_curr_numa_group(p);
1845        if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
1846                for_each_online_node(nid) {
1847                        if (nid == env.src_nid || nid == p->numa_preferred_nid)
1848                                continue;
1849
1850                        dist = node_distance(env.src_nid, env.dst_nid);
1851                        if (sched_numa_topology_type == NUMA_BACKPLANE &&
1852                                                dist != env.dist) {
1853                                taskweight = task_weight(p, env.src_nid, dist);
1854                                groupweight = group_weight(p, env.src_nid, dist);
1855                        }
1856
1857                        /* Only consider nodes where both task and groups benefit */
1858                        taskimp = task_weight(p, nid, dist) - taskweight;
1859                        groupimp = group_weight(p, nid, dist) - groupweight;
1860                        if (taskimp < 0 && groupimp < 0)
1861                                continue;
1862
1863                        env.dist = dist;
1864                        env.dst_nid = nid;
1865                        update_numa_stats(&env.dst_stats, env.dst_nid);
1866                        task_numa_find_cpu(&env, taskimp, groupimp);
1867                }
1868        }
1869
1870        /*
1871         * If the task is part of a workload that spans multiple NUMA nodes,
1872         * and is migrating into one of the workload's active nodes, remember
1873         * this node as the task's preferred numa node, so the workload can
1874         * settle down.
1875         * A task that migrated to a second choice node will be better off
1876         * trying for a better one later. Do not set the preferred node here.
1877         */
1878        if (ng) {
1879                if (env.best_cpu == -1)
1880                        nid = env.src_nid;
1881                else
1882                        nid = cpu_to_node(env.best_cpu);
1883
1884                if (nid != p->numa_preferred_nid)
1885                        sched_setnuma(p, nid);
1886        }
1887
1888        /* No better CPU than the current one was found. */
1889        if (env.best_cpu == -1)
1890                return -EAGAIN;
1891
1892        best_rq = cpu_rq(env.best_cpu);
1893        if (env.best_task == NULL) {
1894                ret = migrate_task_to(p, env.best_cpu);
1895                WRITE_ONCE(best_rq->numa_migrate_on, 0);
1896                if (ret != 0)
1897                        trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1898                return ret;
1899        }
1900
1901        ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
1902        WRITE_ONCE(best_rq->numa_migrate_on, 0);
1903
1904        if (ret != 0)
1905                trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1906        put_task_struct(env.best_task);
1907        return ret;
1908}
1909
1910/* Attempt to migrate a task to a CPU on the preferred node. */
1911static void numa_migrate_preferred(struct task_struct *p)
1912{
1913        unsigned long interval = HZ;
1914
1915        /* This task has no NUMA fault statistics yet */
1916        if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
1917                return;
1918
1919        /* Periodically retry migrating the task to the preferred node */
1920        interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1921        p->numa_migrate_retry = jiffies + interval;
1922
1923        /* Success if task is already running on preferred CPU */
1924        if (task_node(p) == p->numa_preferred_nid)
1925                return;
1926
1927        /* Otherwise, try migrate to a CPU on the preferred node */
1928        task_numa_migrate(p);
1929}
1930
1931/*
1932 * Find out how many nodes the workload is actively running on. Do this by
1933 * tracking the nodes from which NUMA hinting faults are triggered. This can
1934 * be different from the set of nodes where the workload's memory is currently
1935 * located.
1936 */
1937static void numa_group_count_active_nodes(struct numa_group *numa_group)
1938{
1939        unsigned long faults, max_faults = 0;
1940        int nid, active_nodes = 0;
1941
1942        for_each_online_node(nid) {
1943                faults = group_faults_cpu(numa_group, nid);
1944                if (faults > max_faults)
1945                        max_faults = faults;
1946        }
1947
1948        for_each_online_node(nid) {
1949                faults = group_faults_cpu(numa_group, nid);
1950                if (faults * ACTIVE_NODE_FRACTION > max_faults)
1951                        active_nodes++;
1952        }
1953
1954        numa_group->max_faults_cpu = max_faults;
1955        numa_group->active_nodes = active_nodes;
1956}
1957
1958/*
1959 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1960 * increments. The more local the fault statistics are, the higher the scan
1961 * period will be for the next scan window. If local/(local+remote) ratio is
1962 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1963 * the scan period will decrease. Aim for 70% local accesses.
1964 */
1965#define NUMA_PERIOD_SLOTS 10
1966#define NUMA_PERIOD_THRESHOLD 7
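/*
 * Worked example with illustrative numbers, assuming the private/shared
 * ratio stays below the threshold: 900 local and 100 remote faults give
 * lr_ratio = (900 * 10) / 1000 = 9 >= 7, so the scan period grows by
 * (9 - 7) period slots and scanning slows down. With 500 local and 500
 * remote faults lr_ratio drops to 5 and the period shrinks instead,
 * speeding the scan back up.
 */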
1967
1968/*
1969 * Increase the scan period (slow down scanning) if the majority of
1970 * our memory is already on our local node, or if the majority of
1971 * the page accesses are shared with other processes.
1972 * Otherwise, decrease the scan period.
1973 */
1974static void update_task_scan_period(struct task_struct *p,
1975                        unsigned long shared, unsigned long private)
1976{
1977        unsigned int period_slot;
1978        int lr_ratio, ps_ratio;
1979        int diff;
1980
1981        unsigned long remote = p->numa_faults_locality[0];
1982        unsigned long local = p->numa_faults_locality[1];
1983
1984        /*
1985         * If there were no recorded hinting faults then either the task is
1986         * completely idle or all activity is in areas that are not of interest
1987         * to automatic numa balancing. Related to that, if there were failed
1988         * migrations then it implies we are migrating too quickly or the local
1989         * node is overloaded. In either case, scan slower.
1990         */
1991        if (local + shared == 0 || p->numa_faults_locality[2]) {
1992                p->numa_scan_period = min(p->numa_scan_period_max,
1993                        p->numa_scan_period << 1);
1994
1995                p->mm->numa_next_scan = jiffies +
1996                        msecs_to_jiffies(p->numa_scan_period);
1997
1998                return;
1999        }
2000
2001        /*
2002         * Prepare to scale scan period relative to the current period.
2003         *       == NUMA_PERIOD_THRESHOLD scan period stays the same
2004         *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
2005         *       >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
2006         */
2007        period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
2008        lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
2009        ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
2010
2011        if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
2012                /*
2013                 * Most memory accesses are local. There is no need to
2014                 * do fast NUMA scanning, since memory is already local.
2015                 */
2016                int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
2017                if (!slot)
2018                        slot = 1;
2019                diff = slot * period_slot;
2020        } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
2021                /*
2022                 * Most memory accesses are shared with other tasks.
2023                 * There is no point in continuing fast NUMA scanning,
2024                 * since other tasks may just move the memory elsewhere.
2025                 */
2026                int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
2027                if (!slot)
2028                        slot = 1;
2029                diff = slot * period_slot;
2030        } else {
2031                /*
2032                 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
2033                 * yet they are not on the local NUMA node. Speed up
2034                 * NUMA scanning to get the memory moved over.
2035                 */
2036                int ratio = max(lr_ratio, ps_ratio);
2037                diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2038        }
2039
2040        p->numa_scan_period = clamp(p->numa_scan_period + diff,
2041                        task_scan_min(p), task_scan_max(p));
2042        memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2043}
2044
2045/*
2046 * Get the fraction of time the task has been running since the last
2047 * NUMA placement cycle. The scheduler keeps similar statistics, but
2048 * decays those on a 32ms period, which is orders of magnitude off
2049 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
2050 * stats only if the task is so new there are no NUMA statistics yet.
2051 */
2052static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2053{
2054        u64 runtime, delta, now;
2055        /* Use the start of this time slice to avoid calculations. */
2056        now = p->se.exec_start;
2057        runtime = p->se.sum_exec_runtime;
2058
2059        if (p->last_task_numa_placement) {
2060                delta = runtime - p->last_sum_exec_runtime;
2061                *period = now - p->last_task_numa_placement;
2062
2063                /* Avoid time going backwards, prevent potential divide error: */
2064                if (unlikely((s64)*period < 0))
2065                        *period = 0;
2066        } else {
2067                delta = p->se.avg.load_sum;
2068                *period = LOAD_AVG_MAX;
2069        }
2070
2071        p->last_sum_exec_runtime = runtime;
2072        p->last_task_numa_placement = now;
2073
2074        return delta;
2075}
2076
2077/*
2078 * Determine the preferred nid for a task in a numa_group. This needs to
2079 * be done in a way that produces consistent results with group_weight,
2080 * otherwise workloads might not converge.
2081 */
2082static int preferred_group_nid(struct task_struct *p, int nid)
2083{
2084        nodemask_t nodes;
2085        int dist;
2086
2087        /* Direct connections between all NUMA nodes. */
2088        if (sched_numa_topology_type == NUMA_DIRECT)
2089                return nid;
2090
2091        /*
2092         * On a system with glueless mesh NUMA topology, group_weight
2093         * scores nodes according to the number of NUMA hinting faults on
2094         * both the node itself, and on nearby nodes.
2095         */
2096        if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2097                unsigned long score, max_score = 0;
2098                int node, max_node = nid;
2099
2100                dist = sched_max_numa_distance;
2101
2102                for_each_online_node(node) {
2103                        score = group_weight(p, node, dist);
2104                        if (score > max_score) {
2105                                max_score = score;
2106                                max_node = node;
2107                        }
2108                }
2109                return max_node;
2110        }
2111
2112        /*
2113         * Finding the preferred nid in a system with NUMA backplane
2114         * interconnect topology is more involved. The goal is to locate
2115         * tasks from numa_groups near each other in the system, and
2116         * untangle workloads from different sides of the system. This requires
2117         * searching down the hierarchy of node groups, recursively searching
2118         * inside the highest scoring group of nodes. The nodemask tricks
2119         * keep the complexity of the search down.
2120         */
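        /*
         * Rough sketch of one pass on a hypothetical 8-node backplane
         * system: at the largest distance the online nodes split into
         * two 4-node halves; the half with more group faults becomes
         * the new search mask. The loop then repeats at the next
         * smaller distance inside that half, until only single nodes
         * remain and the last winner is returned as the preferred nid.
         */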
2121        nodes = node_online_map;
2122        for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2123                unsigned long max_faults = 0;
2124                nodemask_t max_group = NODE_MASK_NONE;
2125                int a, b;
2126
2127                /* Are there nodes at this distance from each other? */
2128                if (!find_numa_distance(dist))
2129                        continue;
2130
2131                for_each_node_mask(a, nodes) {
2132                        unsigned long faults = 0;
2133                        nodemask_t this_group;
2134                        nodes_clear(this_group);
2135
2136                        /* Sum group's NUMA faults; includes a==b case. */
2137                        for_each_node_mask(b, nodes) {
2138                                if (node_distance(a, b) < dist) {
2139                                        faults += group_faults(p, b);
2140                                        node_set(b, this_group);
2141                                        node_clear(b, nodes);
2142                                }
2143                        }
2144
2145                        /* Remember the top group. */
2146                        if (faults > max_faults) {
2147                                max_faults = faults;
2148                                max_group = this_group;
2149                                /*
2150                                 * subtle: at the smallest distance there is
2151                                 * just one node left in each "group", the
2152                                 * winner is the preferred nid.
2153                                 */
2154                                nid = a;
2155                        }
2156                }
2157                /* Next round, evaluate the nodes within max_group. */
2158                if (!max_faults)
2159                        break;
2160                nodes = max_group;
2161        }
2162        return nid;
2163}
2164
2165static void task_numa_placement(struct task_struct *p)
2166{
2167        int seq, nid, max_nid = NUMA_NO_NODE;
2168        unsigned long max_faults = 0;
2169        unsigned long fault_types[2] = { 0, 0 };
2170        unsigned long total_faults;
2171        u64 runtime, period;
2172        spinlock_t *group_lock = NULL;
2173        struct numa_group *ng;
2174
2175        /*
2176         * The p->mm->numa_scan_seq field gets updated without
2177         * exclusive access. Use READ_ONCE() here to ensure
2178         * that the field is read in a single access:
2179         */
2180        seq = READ_ONCE(p->mm->numa_scan_seq);
2181        if (p->numa_scan_seq == seq)
2182                return;
2183        p->numa_scan_seq = seq;
2184        p->numa_scan_period_max = task_scan_max(p);
2185
2186        total_faults = p->numa_faults_locality[0] +
2187                       p->numa_faults_locality[1];
2188        runtime = numa_get_avg_runtime(p, &period);
2189
2190        /* If the task is part of a group, prevent parallel updates to group stats */
2191        ng = deref_curr_numa_group(p);
2192        if (ng) {
2193                group_lock = &ng->lock;
2194                spin_lock_irq(group_lock);
2195        }
2196
2197        /* Find the node with the highest number of faults */
2198        for_each_online_node(nid) {
2199                /* Keep track of the offsets in numa_faults array */
2200                int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2201                unsigned long faults = 0, group_faults = 0;
2202                int priv;
2203
2204                for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2205                        long diff, f_diff, f_weight;
2206
2207                        mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2208                        membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2209                        cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2210                        cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2211
2212                        /* Decay existing window, copy faults since last scan */
2213                        diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2214                        fault_types[priv] += p->numa_faults[membuf_idx];
2215                        p->numa_faults[membuf_idx] = 0;
2216
2217                        /*
2218                         * Normalize the faults_from, so all tasks in a group
2219                         * count according to CPU use, instead of by the raw
2220                         * number of faults. Tasks with little runtime have
2221                         * little over-all impact on throughput, and thus their
2222                         * faults are less important.
2223                         */
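                        /*
                         * Illustrative numbers: a task that ran for
                         * half of the period gets
                         * f_weight ~= 65536 / 2 = 32768; with 64
                         * buffered CPU faults out of roughly 256 total
                         * faults this scales to about
                         * 32768 * 64 / 257 ~= 8160 before the old
                         * window is decayed and folded in below.
                         */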
2224                        f_weight = div64_u64(runtime << 16, period + 1);
2225                        f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2226                                   (total_faults + 1);
2227                        f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2228                        p->numa_faults[cpubuf_idx] = 0;
2229
2230                        p->numa_faults[mem_idx] += diff;
2231                        p->numa_faults[cpu_idx] += f_diff;
2232                        faults += p->numa_faults[mem_idx];
2233                        p->total_numa_faults += diff;
2234                        if (ng) {
2235                                /*
2236                                 * safe because we can only change our own group
2237                                 *
2238                                 * mem_idx represents the offset for a given
2239                                 * nid and priv in a specific region because it
2240                                 * is at the beginning of the numa_faults array.
2241                                 */
2242                                ng->faults[mem_idx] += diff;
2243                                ng->faults_cpu[mem_idx] += f_diff;
2244                                ng->total_faults += diff;
2245                                group_faults += ng->faults[mem_idx];
2246                        }
2247                }
2248
2249                if (!ng) {
2250                        if (faults > max_faults) {
2251                                max_faults = faults;
2252                                max_nid = nid;
2253                        }
2254                } else if (group_faults > max_faults) {
2255                        max_faults = group_faults;
2256                        max_nid = nid;
2257                }
2258        }
2259
2260        if (ng) {
2261                numa_group_count_active_nodes(ng);
2262                spin_unlock_irq(group_lock);
2263                max_nid = preferred_group_nid(p, max_nid);
2264        }
2265
2266        if (max_faults) {
2267                /* Set the new preferred node */
2268                if (max_nid != p->numa_preferred_nid)
2269                        sched_setnuma(p, max_nid);
2270        }
2271
2272        update_task_scan_period(p, fault_types[0], fault_types[1]);
2273}
2274
2275static inline int get_numa_group(struct numa_group *grp)
2276{
2277        return refcount_inc_not_zero(&grp->refcount);
2278}
2279
2280static inline void put_numa_group(struct numa_group *grp)
2281{
2282        if (refcount_dec_and_test(&grp->refcount))
2283                kfree_rcu(grp, rcu);
2284}
2285
2286static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2287                        int *priv)
2288{
2289        struct numa_group *grp, *my_grp;
2290        struct task_struct *tsk;
2291        bool join = false;
2292        int cpu = cpupid_to_cpu(cpupid);
2293        int i;
2294
2295        if (unlikely(!deref_curr_numa_group(p))) {
2296                unsigned int size = sizeof(struct numa_group) +
2297                                    4*nr_node_ids*sizeof(unsigned long);
2298
2299                grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2300                if (!grp)
2301                        return;
2302
2303                refcount_set(&grp->refcount, 1);
2304                grp->active_nodes = 1;
2305                grp->max_faults_cpu = 0;
2306                spin_lock_init(&grp->lock);
2307                grp->gid = p->pid;
2308                /* Second half of the array tracks nids where faults happen */
2309                grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2310                                                nr_node_ids;
2311
2312                for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2313                        grp->faults[i] = p->numa_faults[i];
2314
2315                grp->total_faults = p->total_numa_faults;
2316
2317                grp->nr_tasks++;
2318                rcu_assign_pointer(p->numa_group, grp);
2319        }
2320
2321        rcu_read_lock();
2322        tsk = READ_ONCE(cpu_rq(cpu)->curr);
2323
2324        if (!cpupid_match_pid(tsk, cpupid))
2325                goto no_join;
2326
2327        grp = rcu_dereference(tsk->numa_group);
2328        if (!grp)
2329                goto no_join;
2330
2331        my_grp = deref_curr_numa_group(p);
2332        if (grp == my_grp)
2333                goto no_join;
2334
2335        /*
2336         * Only join the other group if its bigger; if we're the bigger group,
2337         * the other task will join us.
2338         */
2339        if (my_grp->nr_tasks > grp->nr_tasks)
2340                goto no_join;
2341
2342        /*
2343         * Tie-break on the grp address.
2344         */
2345        if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2346                goto no_join;
2347
2348        /* Always join threads in the same process. */
2349        if (tsk->mm == current->mm)
2350                join = true;
2351
2352        /* Simple filter to avoid false positives due to PID collisions */
2353        if (flags & TNF_SHARED)
2354                join = true;
2355
2356        /* Update priv based on whether false sharing was detected */
2357        *priv = !join;
2358
2359        if (join && !get_numa_group(grp))
2360                goto no_join;
2361
2362        rcu_read_unlock();
2363
2364        if (!join)
2365                return;
2366
2367        BUG_ON(irqs_disabled());
2368        double_lock_irq(&my_grp->lock, &grp->lock);
2369
2370        for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2371                my_grp->faults[i] -= p->numa_faults[i];
2372                grp->faults[i] += p->numa_faults[i];
2373        }
2374        my_grp->total_faults -= p->total_numa_faults;
2375        grp->total_faults += p->total_numa_faults;
2376
2377        my_grp->nr_tasks--;
2378        grp->nr_tasks++;
2379
2380        spin_unlock(&my_grp->lock);
2381        spin_unlock_irq(&grp->lock);
2382
2383        rcu_assign_pointer(p->numa_group, grp);
2384
2385        put_numa_group(my_grp);
2386        return;
2387
2388no_join:
2389        rcu_read_unlock();
2390        return;
2391}
2392
2393/*
2394 * Get rid of NUMA statistics associated with a task (either current or dead).
2395 * If @final is set, the task is dead and has reached refcount zero, so we can
2396 * safely free all relevant data structures. Otherwise, there might be
2397 * concurrent reads from places like load balancing and procfs, and we should
2398 * reset the data back to default state without freeing ->numa_faults.
2399 */
2400void task_numa_free(struct task_struct *p, bool final)
2401{
2402        /* safe: p either is current or is being freed by current */
2403        struct numa_group *grp = rcu_dereference_raw(p->numa_group);
2404        unsigned long *numa_faults = p->numa_faults;
2405        unsigned long flags;
2406        int i;
2407
2408        if (!numa_faults)
2409                return;
2410
2411        if (grp) {
2412                spin_lock_irqsave(&grp->lock, flags);
2413                for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2414                        grp->faults[i] -= p->numa_faults[i];
2415                grp->total_faults -= p->total_numa_faults;
2416
2417                grp->nr_tasks--;
2418                spin_unlock_irqrestore(&grp->lock, flags);
2419                RCU_INIT_POINTER(p->numa_group, NULL);
2420                put_numa_group(grp);
2421        }
2422
2423        if (final) {
2424                p->numa_faults = NULL;
2425                kfree(numa_faults);
2426        } else {
2427                p->total_numa_faults = 0;
2428                for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2429                        numa_faults[i] = 0;
2430        }
2431}
2432
2433/*
2434 * Got a PROT_NONE fault for a page on @node.
2435 */
2436void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2437{
2438        struct task_struct *p = current;
2439        bool migrated = flags & TNF_MIGRATED;
2440        int cpu_node = task_node(current);
2441        int local = !!(flags & TNF_FAULT_LOCAL);
2442        struct numa_group *ng;
2443        int priv;
2444
2445        if (!static_branch_likely(&sched_numa_balancing))
2446                return;
2447
2448        /* for example, ksmd faulting in a user's mm */
2449        if (!p->mm)
2450                return;
2451
2452        /* Allocate buffer to track faults on a per-node basis */
2453        if (unlikely(!p->numa_faults)) {
2454                int size = sizeof(*p->numa_faults) *
2455                           NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2456
2457                p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2458                if (!p->numa_faults)
2459                        return;
2460
2461                p->total_numa_faults = 0;
2462                memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2463        }
2464
2465        /*
2466         * First accesses are treated as private, otherwise consider accesses
2467         * to be private if the accessing pid has not changed
2468         */
2469        if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2470                priv = 1;
2471        } else {
2472                priv = cpupid_match_pid(p, last_cpupid);
2473                if (!priv && !(flags & TNF_NO_GROUP))
2474                        task_numa_group(p, last_cpupid, flags, &priv);
2475        }
2476
2477        /*
2478         * If a workload spans multiple NUMA nodes, a shared fault that
2479         * occurs wholly within the set of nodes that the workload is
2480         * actively using should be counted as local. This allows the
2481         * scan rate to slow down when a workload has settled down.
2482         */
2483        ng = deref_curr_numa_group(p);
2484        if (!priv && !local && ng && ng->active_nodes > 1 &&
2485                                numa_is_active_node(cpu_node, ng) &&
2486                                numa_is_active_node(mem_node, ng))
2487                local = 1;
2488
2489        /*
2490         * Retry to migrate task to preferred node periodically, in case it
2491         * previously failed, or the scheduler moved us.
2492         */
2493        if (time_after(jiffies, p->numa_migrate_retry)) {
2494                task_numa_placement(p);
2495                numa_migrate_preferred(p);
2496        }
2497
2498        if (migrated)
2499                p->numa_pages_migrated += pages;
2500        if (flags & TNF_MIGRATE_FAIL)
2501                p->numa_faults_locality[2] += pages;
2502
2503        p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2504        p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2505        p->numa_faults_locality[local] += pages;
2506}
2507
2508static void reset_ptenuma_scan(struct task_struct *p)
2509{
2510        /*
2511         * We only did a read acquisition of the mmap sem, so
2512         * p->mm->numa_scan_seq is written to without exclusive access
2513         * and the update is not guaranteed to be atomic. That's not
2514         * much of an issue though, since this is just used for
2515         * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2516         * expensive, to avoid any form of compiler optimizations:
2517         */
2518        WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2519        p->mm->numa_scan_offset = 0;
2520}
2521
2522/*
2523 * The expensive part of numa migration is done from task_work context.
2524 * Triggered from task_tick_numa().
2525 */
2526void task_numa_work(struct callback_head *work)
2527{
2528        unsigned long migrate, next_scan, now = jiffies;
2529        struct task_struct *p = current;
2530        struct mm_struct *mm = p->mm;
2531        u64 runtime = p->se.sum_exec_runtime;
2532        struct vm_area_struct *vma;
2533        unsigned long start, end;
2534        unsigned long nr_pte_updates = 0;
2535        long pages, virtpages;
2536
2537        SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2538
2539        work->next = work; /* protect against double add */
2540        /*
2541         * Who cares about NUMA placement when they're dying.
2542         *
2543         * NOTE: make sure not to dereference p->mm before this check,
2544         * exit_task_work() happens _after_ exit_mm() so we could be called
2545         * without p->mm even though we still had it when we enqueued this
2546         * work.
2547         */
2548        if (p->flags & PF_EXITING)
2549                return;
2550
2551        if (!mm->numa_next_scan) {
2552                mm->numa_next_scan = now +
2553                        msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2554        }
2555
2556        /*
2557         * Enforce maximal scan/migration frequency..
2558         */
2559        migrate = mm->numa_next_scan;
2560        if (time_before(now, migrate))
2561                return;
2562
2563        if (p->numa_scan_period == 0) {
2564                p->numa_scan_period_max = task_scan_max(p);
2565                p->numa_scan_period = task_scan_start(p);
2566        }
2567
2568        next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2569        if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2570                return;
2571
2572        /*
2573         * Delay this task enough that another task of this mm will likely win
2574         * the next time around.
2575         */
2576        p->node_stamp += 2 * TICK_NSEC;
2577
2578        start = mm->numa_scan_offset;
2579        pages = sysctl_numa_balancing_scan_size;
2580        pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2581        virtpages = pages * 8;     /* Scan up to this much virtual space */
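        /*
         * E.g. with the default 256MB scan size and 4K pages this is
         * 256 << (20 - 12) = 65536 pages, and up to 8 times that
         * (524288 pages) of virtual address space per pass.
         */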
2582        if (!pages)
2583                return;
2584
2585
2586        if (!down_read_trylock(&mm->mmap_sem))
2587                return;
2588        vma = find_vma(mm, start);
2589        if (!vma) {
2590                reset_ptenuma_scan(p);
2591                start = 0;
2592                vma = mm->mmap;
2593        }
2594        for (; vma; vma = vma->vm_next) {
2595                if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2596                        is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2597                        continue;
2598                }
2599
2600                /*
2601                 * Shared library pages mapped by multiple processes are not
2602                 * migrated as it is expected they are cache replicated. Avoid
2603                 * hinting faults in read-only file-backed mappings or the vdso
2604                 * as migrating the pages will be of marginal benefit.
2605                 */
2606                if (!vma->vm_mm ||
2607                    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2608                        continue;
2609
2610                /*
2611                 * Skip inaccessible VMAs to avoid any confusion between
2612                 * PROT_NONE and NUMA hinting ptes
2613                 */
2614                if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2615                        continue;
2616
2617                do {
2618                        start = max(start, vma->vm_start);
2619                        end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2620                        end = min(end, vma->vm_end);
2621                        nr_pte_updates = change_prot_numa(vma, start, end);
2622
2623                        /*
2624                         * Try to scan sysctl_numa_balancing_scan_size worth of
2625                         * hpages that have at least one present PTE that
2626                         * is not already pte-numa. If the VMA contains
2627                         * areas that are unused or already full of prot_numa
2628                         * PTEs, scan up to virtpages, to skip through those
2629                         * areas faster.
2630                         */
2631                        if (nr_pte_updates)
2632                                pages -= (end - start) >> PAGE_SHIFT;
2633                        virtpages -= (end - start) >> PAGE_SHIFT;
2634
2635                        start = end;
2636                        if (pages <= 0 || virtpages <= 0)
2637                                goto out;
2638
2639                        cond_resched();
2640                } while (end != vma->vm_end);
2641        }
2642
2643out:
2644        /*
2645         * It is possible to reach the end of the VMA list but the last few
2646         * VMAs are not guaranteed to be vma_migratable. If they are not, we
2647         * would find the !migratable VMA on the next scan but not reset the
2648         * scanner to the start so check it now.
2649         */
2650        if (vma)
2651                mm->numa_scan_offset = start;
2652        else
2653                reset_ptenuma_scan(p);
2654        up_read(&mm->mmap_sem);
2655
2656        /*
2657         * Make sure tasks use at least 32x as much time to run other code
2658         * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2659         * Usually update_task_scan_period slows down scanning enough; on an
2660         * overloaded system we need to limit overhead on a per task basis.
2661         */
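        /*
         * One part scanning to 32 parts other work bounds the scanning
         * overhead at 1/33, i.e. roughly 3%.
         */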
2662        if (unlikely(p->se.sum_exec_runtime != runtime)) {
2663                u64 diff = p->se.sum_exec_runtime - runtime;
2664                p->node_stamp += 32 * diff;
2665        }
2666}
2667
2668/*
2669 * Drive the periodic memory faults..
2670 */
2671static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2672{
2673        struct callback_head *work = &curr->numa_work;
2674        u64 period, now;
2675
2676        /*
2677         * We don't care about NUMA placement if we don't have memory.
2678         */
2679        if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2680                return;
2681
2682        /*
2683         * Using runtime rather than walltime has the dual advantage that
2684         * we (mostly) drive the selection from busy threads and that the
2685         * task needs to have done some actual work before we bother with
2686         * NUMA placement.
2687         */
2688        now = curr->se.sum_exec_runtime;
2689        period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2690
2691        if (now > curr->node_stamp + period) {
2692                if (!curr->node_stamp)
2693                        curr->numa_scan_period = task_scan_start(curr);
2694                curr->node_stamp += period;
2695
2696                if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2697                        init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2698                        task_work_add(curr, work, true);
2699                }
2700        }
2701}
2702
2703static void update_scan_period(struct task_struct *p, int new_cpu)
2704{
2705        int src_nid = cpu_to_node(task_cpu(p));
2706        int dst_nid = cpu_to_node(new_cpu);
2707
2708        if (!static_branch_likely(&sched_numa_balancing))
2709                return;
2710
2711        if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
2712                return;
2713
2714        if (src_nid == dst_nid)
2715                return;
2716
2717        /*
2718         * Allow resets if faults have been trapped before one scan
2719         * has completed. This is most likely due to a new task that
2720         * is pulled cross-node due to wakeups or load balancing.
2721         */
2722        if (p->numa_scan_seq) {
2723                /*
2724                 * Avoid scan adjustments if moving to the preferred
2725                 * node or if the task was not previously running on
2726                 * the preferred node.
2727                 */
2728                if (dst_nid == p->numa_preferred_nid ||
2729                    (p->numa_preferred_nid != NUMA_NO_NODE &&
2730                        src_nid != p->numa_preferred_nid))
2731                        return;
2732        }
2733
2734        p->numa_scan_period = task_scan_start(p);
2735}
2736
2737#else
2738static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2739{
2740}
2741
2742static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2743{
2744}
2745
2746static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2747{
2748}
2749
2750static inline void update_scan_period(struct task_struct *p, int new_cpu)
2751{
2752}
2753
2754#endif /* CONFIG_NUMA_BALANCING */
2755
2756static void
2757account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2758{
2759        update_load_add(&cfs_rq->load, se->load.weight);
2760#ifdef CONFIG_SMP
2761        if (entity_is_task(se)) {
2762                struct rq *rq = rq_of(cfs_rq);
2763
2764                account_numa_enqueue(rq, task_of(se));
2765                list_add(&se->group_node, &rq->cfs_tasks);
2766        }
2767#endif
2768        cfs_rq->nr_running++;
2769}
2770
2771static void
2772account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2773{
2774        update_load_sub(&cfs_rq->load, se->load.weight);
2775#ifdef CONFIG_SMP
2776        if (entity_is_task(se)) {
2777                account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2778                list_del_init(&se->group_node);
2779        }
2780#endif
2781        cfs_rq->nr_running--;
2782}
2783
2784/*
2785 * Signed add and clamp on underflow.
2786 *
2787 * Explicitly do a load-store to ensure the intermediate value never hits
2788 * memory. This allows lockless observations without ever seeing the negative
2789 * values.
2790 */
2791#define add_positive(_ptr, _val) do {                           \
2792        typeof(_ptr) ptr = (_ptr);                              \
2793        typeof(_val) val = (_val);                              \
2794        typeof(*ptr) res, var = READ_ONCE(*ptr);                \
2795                                                                \
2796        res = var + val;                                        \
2797                                                                \
2798        if (val < 0 && res > var)                               \
2799                res = 0;                                        \
2800                                                                \
2801        WRITE_ONCE(*ptr, res);                                  \
2802} while (0)
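/*
 * Illustrative example: with an unsigned long *ptr of 10 and a signed
 * delta of -30, a plain addition would wrap to a huge value; the
 * "val < 0 && res > var" check catches the underflow and stores 0, so
 * lockless readers never observe the wrapped value.
 */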
2803
2804/*
2805 * Unsigned subtract and clamp on underflow.
2806 *
2807 * Explicitly do a load-store to ensure the intermediate value never hits
2808 * memory. This allows lockless observations without ever seeing the negative
2809 * values.
2810 */
2811#define sub_positive(_ptr, _val) do {                           \
2812        typeof(_ptr) ptr = (_ptr);                              \
2813        typeof(*ptr) val = (_val);                              \
2814        typeof(*ptr) res, var = READ_ONCE(*ptr);                \
2815        res = var - val;                                        \
2816        if (res > var)                                          \
2817                res = 0;                                        \
2818        WRITE_ONCE(*ptr, res);                                  \
2819} while (0)
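/*
 * Illustrative example: with an unsigned long load of 100,
 * sub_positive(&load, 30) leaves 70; subtracting another 200 would
 * wrap, so the "res > var" check detects this and clamps the result
 * to 0 instead.
 */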
2820
2821/*
2822 * Remove and clamp on negative, from a local variable.
2823 *
2824 * A variant of sub_positive(), which does not use explicit load-store
2825 * and is thus optimized for local variable updates.
2826 */
2827#define lsub_positive(_ptr, _val) do {                          \
2828        typeof(_ptr) ptr = (_ptr);                              \
2829        *ptr -= min_t(typeof(*ptr), *ptr, _val);                \
2830} while (0)
2831
2832#ifdef CONFIG_SMP
2833static inline void
2834enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2835{
2836        cfs_rq->runnable_weight += se->runnable_weight;
2837
2838        cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2839        cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
2840}
2841
2842static inline void
2843dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2844{
2845        cfs_rq->runnable_weight -= se->runnable_weight;
2846
2847        sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2848        sub_positive(&cfs_rq->avg.runnable_load_sum,
2849                     se_runnable(se) * se->avg.runnable_load_sum);
2850}
2851
2852static inline void
2853enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2854{
2855        cfs_rq->avg.load_avg += se->avg.load_avg;
2856        cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
2857}
2858
2859static inline void
2860dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2861{
2862        sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2863        sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
2864}
2865#else
2866static inline void
2867enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2868static inline void
2869dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2870static inline void
2871enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2872static inline void
2873dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2874#endif
2875
2876static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2877                            unsigned long weight, unsigned long runnable)
2878{
2879        if (se->on_rq) {
2880                /* commit outstanding execution time */
2881                if (cfs_rq->curr == se)
2882                        update_curr(cfs_rq);
2883                account_entity_dequeue(cfs_rq, se);
2884                dequeue_runnable_load_avg(cfs_rq, se);
2885        }
2886        dequeue_load_avg(cfs_rq, se);
2887
2888        se->runnable_weight = runnable;
2889        update_load_set(&se->load, weight);
2890
2891#ifdef CONFIG_SMP
2892        do {
2893                u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
2894
2895                se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2896                se->avg.runnable_load_avg =
2897                        div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
2898        } while (0);
2899#endif
2900
2901        enqueue_load_avg(cfs_rq, se);
2902        if (se->on_rq) {
2903                account_entity_enqueue(cfs_rq, se);
2904                enqueue_runnable_load_avg(cfs_rq, se);
2905        }
2906}
2907
2908void reweight_task(struct task_struct *p, int prio)
2909{
2910        struct sched_entity *se = &p->se;
2911        struct cfs_rq *cfs_rq = cfs_rq_of(se);
2912        struct load_weight *load = &se->load;
2913        unsigned long weight = scale_load(sched_prio_to_weight[prio]);
2914
2915        reweight_entity(cfs_rq, se, weight, weight);
2916        load->inv_weight = sched_prio_to_wmult[prio];
2917}
2918
2919#ifdef CONFIG_FAIR_GROUP_SCHED
2920#ifdef CONFIG_SMP
2921/*
2922 * All this does is approximate the hierarchical proportion which includes that
2923 * global sum we all love to hate.
2924 *
2925 * That is, the weight of a group entity, is the proportional share of the
2926 * group weight based on the group runqueue weights. That is:
2927 *
2928 *                     tg->weight * grq->load.weight
2929 *   ge->load.weight = -----------------------------               (1)
2930 *                        \Sum grq->load.weight
2931 *
2932 * Now, because that sum is prohibitively expensive to compute (been
2933 * there, done that) we approximate it with this average stuff. The average
2934 * moves slower and therefore the approximation is cheaper and more stable.
2935 *
2936 * So instead of the above, we substitute:
2937 *
2938 *   grq->load.weight -> grq->avg.load_avg                         (2)
2939 *
2940 * which yields the following:
2941 *
2942 *                     tg->weight * grq->avg.load_avg
2943 *   ge->load.weight = ------------------------------              (3)
2944 *                              tg->load_avg
2945 *
2946 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
2947 *
2948 * That is shares_avg, and it is right (given the approximation (2)).
2949 *
2950 * The problem with it is that because the average is slow -- it was designed
2951 * to be exactly that of course -- this leads to transients in boundary
2952 * conditions. In specific, the case where the group was idle and we start the
2953 * one task. It takes time for our CPU's grq->avg.load_avg to build up,
2954 * yielding bad latency etc..
2955 *
2956 * Now, in that special case (1) reduces to:
2957 *
2958 *                     tg->weight * grq->load.weight
2959 *   ge->load.weight = ----------------------------- = tg->weight   (4)
2960 *                          grq->load.weight
2961 *
2962 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
2963 *
2964 * So what we do is modify our approximation (3) to approach (4) in the (near)
2965 * UP case, like:
2966 *
2967 *   ge->load.weight =
2968 *
2969 *              tg->weight * grq->load.weight
2970 *     ---------------------------------------------------         (5)
2971 *     tg->load_avg - grq->avg.load_avg + grq->load.weight
2972 *
2973 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
2974 * we need to use grq->avg.load_avg as its lower bound, which then gives:
2975 *
2976 *
2977 *                     tg->weight * grq->load.weight
2978 *   ge->load.weight = -----------------------------               (6)
2979 *                              tg_load_avg'
2980 *
2981 * Where:
2982 *
2983 *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
2984 *                  max(grq->load.weight, grq->avg.load_avg)
2985 *
2986 * And that is shares_weight and is icky. In the (near) UP case it approaches
2987 * (4) while in the normal case it approaches (3). It consistently
2988 * overestimates the ge->load.weight and therefore:
2989 *
2990 *   \Sum ge->load.weight >= tg->weight
2991 *
2992 * hence icky!
2993 */
2994static long calc_group_shares(struct cfs_rq *cfs_rq)
2995{
2996        long tg_weight, tg_shares, load, shares;
2997        struct task_group *tg = cfs_rq->tg;
2998
2999        tg_shares = READ_ONCE(tg->shares);
3000
3001        load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3002
3003        tg_weight = atomic_long_read(&tg->load_avg);
3004
3005        /* Ensure tg_weight >= load */
3006        tg_weight -= cfs_rq->tg_load_avg_contrib;
3007        tg_weight += load;
3008
3009        shares = (tg_shares * load);
3010        if (tg_weight)
3011                shares /= tg_weight;
3012
3013        /*
3014         * MIN_SHARES has to be unscaled here to support per-CPU partitioning
3015         * of a group with small tg->shares value. It is a floor value which is
3016         * assigned as a minimum load.weight to the sched_entity representing
3017         * the group on a CPU.
3018         *
3019         * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
3020         * on an 8-core system with 8 tasks each runnable on one CPU shares has
3021         * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
3022         * case no task is runnable on a CPU MIN_SHARES=2 should be returned
3023         * instead of 0.
3024         */
3025        return clamp_t(long, shares, MIN_SHARES, tg_shares);
3026}
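/*
 * A standalone, hedged sketch of the shares_weight computation (6) above,
 * using plain C arithmetic and made-up numbers; it is illustrative only and
 * ignores the scale_load()/MIN_SHARES clamping done by the real code.
 */
static long example_group_shares(long tg_shares, long tg_load_avg,
				 long grq_contrib, long grq_weight,
				 long grq_load_avg)
{
	/* load = max(grq->load.weight, grq->avg.load_avg) */
	long load = grq_weight > grq_load_avg ? grq_weight : grq_load_avg;
	/* tg_load_avg' = tg->load_avg - this rq's contribution + load */
	long tg_weight = tg_load_avg - grq_contrib + load;
	long shares = tg_shares * load;

	if (tg_weight)
		shares /= tg_weight;

	/*
	 * E.g. tg_shares=1024, tg_load_avg=2048, grq_contrib=512,
	 * grq_weight=1024, grq_load_avg=512 -> tg_weight=2560 and
	 * shares=409, i.e. roughly 40% of the group weight.
	 */
	return shares;
}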
3027
3028/*
3029 * This calculates the effective runnable weight for a group entity based on
3030 * the group entity weight calculated above.
3031 *
3032 * Because of the above approximation (2), our group entity weight is
3033 * a load_avg based ratio (3). This means that it includes blocked load and
3034 * does not represent the runnable weight.
3035 *
3036 * Approximate the group entity's runnable weight per ratio from the group
3037 * runqueue:
3038 *
3039 *                                           grq->avg.runnable_load_avg
3040 *   ge->runnable_weight = ge->load.weight * -------------------------- (7)
3041 *                                               grq->avg.load_avg
3042 *
3043 * However, analogous to above, since the avg numbers are slow, this leads to
3044 * transients in the from-idle case. Instead we use:
3045 *
3046 *   ge->runnable_weight = ge->load.weight *
3047 *
3048 *              max(grq->avg.runnable_load_avg, grq->runnable_weight)
3049 *              -----------------------------------------------------   (8)
3050 *                    max(grq->avg.load_avg, grq->load.weight)
3051 *
3052 * Where the max() terms serve both to use the 'instant' values (fixing the
3053 * slow from-idle transient) and to avoid the /0 on to-idle, similar to (6).
3054 */
3055static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
3056{
3057        long runnable, load_avg;
3058
3059        load_avg = max(cfs_rq->avg.load_avg,
3060                       scale_load_down(cfs_rq->load.weight));
3061
3062        runnable = max(cfs_rq->avg.runnable_load_avg,
3063                       scale_load_down(cfs_rq->runnable_weight));
3064
3065        runnable *= shares;
3066        if (load_avg)
3067                runnable /= load_avg;
3068
3069        return clamp_t(long, runnable, MIN_SHARES, shares);
3070}
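/*
 * A matching standalone sketch for (8): scale the group shares by the ratio
 * of runnable load to total load, again using the 'instant' weights as lower
 * bounds. Plain C, illustrative numbers only.
 */
static long example_group_runnable(long shares,
				   long runnable_load_avg, long runnable_weight,
				   long load_avg, long load_weight)
{
	long runnable = runnable_load_avg > runnable_weight ?
			runnable_load_avg : runnable_weight;
	long load = load_avg > load_weight ? load_avg : load_weight;

	runnable *= shares;
	if (load)
		runnable /= load;

	/*
	 * E.g. shares=409, runnable=512, load=1024 -> 204: half the group's
	 * load is blocked, so only about half the weight counts as runnable.
	 */
	return runnable;
}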
3071#endif /* CONFIG_SMP */
3072
3073static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3074
3075/*
3076 * Recomputes the group entity based on the current state of its group
3077 * runqueue.
3078 */
3079static void update_cfs_group(struct sched_entity *se)
3080{
3081        struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3082        long shares, runnable;
3083
3084        if (!gcfs_rq)
3085                return;
3086
3087        if (throttled_hierarchy(gcfs_rq))
3088                return;
3089
3090#ifndef CONFIG_SMP
3091        runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
3092
3093        if (likely(se->load.weight == shares))
3094                return;
3095#else
3096        shares   = calc_group_shares(gcfs_rq);
3097        runnable = calc_group_runnable(gcfs_rq, shares);
3098#endif
3099
3100        reweight_entity(cfs_rq_of(se), se, shares, runnable);
3101}
3102
3103#else /* CONFIG_FAIR_GROUP_SCHED */
3104static inline void update_cfs_group(struct sched_entity *se)
3105{
3106}
3107#endif /* CONFIG_FAIR_GROUP_SCHED */
3108
3109static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3110{
3111        struct rq *rq = rq_of(cfs_rq);
3112
3113        if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
3114                /*
3115                 * get called often enough that this should (hopefully) not be
3116                 * get called often enough that that should (hopefully) not be
3117                 * a real problem.
3118                 *
3119                 * It will not get called when we go idle, because the idle
3120                 * thread is a different class (!fair), nor will the utilization
3121                 * number include things like RT tasks.
3122                 *
3123                 * As is, the util number is not freq-invariant (we'd have to
3124                 * implement arch_scale_freq_capacity() for that).
3125                 *
3126                 * See cpu_util().
3127                 */
3128                cpufreq_update_util(rq, flags);
3129        }
3130}
3131
3132#ifdef CONFIG_SMP
3133#ifdef CONFIG_FAIR_GROUP_SCHED
3134/**
3135 * update_tg_load_avg - update the tg's load avg
3136 * @cfs_rq: the cfs_rq whose avg changed
3137 * @force: update regardless of how small the difference
3138 *
3139 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3140 * However, because tg->load_avg is a global value there are performance
3141 * considerations.
3142 *
3143 * In order to avoid having to look at the other cfs_rq's, we use a
3144 * differential update where we store the last value we propagated. This in
3145 * turn allows skipping updates if the differential is 'small'.
3146 *
3147 * Updating tg's load_avg is necessary before update_cfs_group().
3148 */
3149static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3150{
3151        long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3152
3153        /*
3154         * No need to update load_avg for root_task_group as it is not used.
3155         */
3156        if (cfs_rq->tg == &root_task_group)
3157                return;
3158
3159        if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3160                atomic_long_add(delta, &cfs_rq->tg->load_avg);
3161                cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3162        }
3163}
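/*
 * A minimal standalone sketch of the differential-update filter used above:
 * the shared tg->load_avg is only touched when the local change exceeds
 * roughly 1/64th (~1.5%) of the last value we propagated.
 */
static int example_should_propagate(long load_avg, long last_contrib)
{
	long delta = load_avg - last_contrib;

	if (delta < 0)
		delta = -delta;

	/*
	 * E.g. with last_contrib=640 a change of 10 or less is skipped,
	 * while a change of 11 or more is written back to the global sum.
	 */
	return delta > last_contrib / 64;
}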
3164
3165/*
3166 * Called within set_task_rq() right before setting a task's CPU. The
3167 * caller only guarantees p->pi_lock is held; no other assumptions,
3168 * including the state of rq->lock, should be made.
3169 */
3170void set_task_rq_fair(struct sched_entity *se,
3171                      struct cfs_rq *prev, struct cfs_rq *next)
3172{
3173        u64 p_last_update_time;
3174        u64 n_last_update_time;
3175
3176        if (!sched_feat(ATTACH_AGE_LOAD))
3177                return;
3178
3179        /*
3180          * We are supposed to update the task to "current" time, so that it is
3181          * up to date and ready to go to the new CPU/cfs_rq. But we have
3182          * difficulty getting what the current time is, so simply throw away the
3183          * out-of-date time. This results in the wakee task being less decayed,
3184          * but giving the wakee more load is not a bad thing.
3185         */
3186        if (!(se->avg.last_update_time && prev))
3187                return;
3188
3189#ifndef CONFIG_64BIT
3190        {
3191                u64 p_last_update_time_copy;
3192                u64 n_last_update_time_copy;
3193
3194                do {
3195                        p_last_update_time_copy = prev->load_last_update_time_copy;
3196                        n_last_update_time_copy = next->load_last_update_time_copy;
3197
3198                        smp_rmb();
3199
3200                        p_last_update_time = prev->avg.last_update_time;
3201                        n_last_update_time = next->avg.last_update_time;
3202
3203                } while (p_last_update_time != p_last_update_time_copy ||
3204                         n_last_update_time != n_last_update_time_copy);
3205        }
3206#else
3207        p_last_update_time = prev->avg.last_update_time;
3208        n_last_update_time = next->avg.last_update_time;
3209#endif
3210        __update_load_avg_blocked_se(p_last_update_time, se);
3211        se->avg.last_update_time = n_last_update_time;
3212}
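/*
 * On 32-bit, the 64-bit last_update_time reads above cannot be done
 * atomically, so each value is paired with a *_copy field and the reader
 * retries until both reads agree (a hand-rolled seqcount). A hedged,
 * userspace-style sketch of the same pattern using C11 atomics in place of
 * smp_wmb()/smp_rmb(); the names and types here are illustrative only.
 */
#include <stdatomic.h>
#include <stdint.h>

struct example_clock {
	_Atomic uint64_t time;	/* written first by the writer */
	_Atomic uint64_t copy;	/* published after 'time'      */
};

static void example_write_time(struct example_clock *c, uint64_t now)
{
	atomic_store_explicit(&c->time, now, memory_order_relaxed);
	/* the release store plays the role of smp_wmb() before the copy */
	atomic_store_explicit(&c->copy, now, memory_order_release);
}

static uint64_t example_read_time(struct example_clock *c)
{
	uint64_t t, copy;

	do {
		copy = atomic_load_explicit(&c->copy, memory_order_acquire);
		t = atomic_load_explicit(&c->time, memory_order_relaxed);
	} while (t != copy);	/* a racing writer forces a retry */

	return t;
}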
3213
3214
3215/*
3216 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
3217 * propagate its contribution. The key to this propagation is the invariant
3218 * that for each group:
3219 *
3220 *   ge->avg == grq->avg                                                (1)
3221 *
3222 * _IFF_ we look at the pure running and runnable sums. Because they
3223 * represent the very same entity, just at different points in the hierarchy.
3224 *
3225 * Per the above update_tg_cfs_util() is trivial and simply copies the running
3226 * sum over (but still wrong, because the group entity and group rq do not have
3227 * their PELT windows aligned).
3228 *
3229 * However, update_tg_cfs_runnable() is more complex. So we have:
3230 *
3231 *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg          (2)
3232 *
3233 * And since, like util, the runnable part should be directly transferable,
3234 * the following would _appear_ to be the straightforward approach:
3235 *
3236 *   grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg       (3)
3237 *
3238 * And per (1) we have:
3239 *
3240 *   ge->avg.runnable_avg == grq->avg.runnable_avg
3241 *
3242 * Which gives:
3243 *
3244 *                      ge->load.weight * grq->avg.load_avg
3245 *   ge->avg.load_avg = -----------------------------------             (4)
3246 *                               grq->load.weight
3247 *
3248 * Except that is wrong!
3249 *
3250 * Because, while for entities historical weight is not important (we really
3251 * only care about their future and can therefore consider a pure runnable
3252 * sum), runqueues can NOT do this.
3253 *
3254 * We specifically want runqueues to have a load_avg that includes
3255 * historical weights. Those represent the blocked load, the load we expect
3256 * to (shortly) return to us. This only works by keeping the weights as
3257 * integral part of the sum. We therefore cannot decompose as per (3).
3258 *
3259 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
3260 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
3261 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
3262 * runnable section of these tasks overlap (or not). If they were to perfectly
3263 * align the rq as a whole would be runnable 2/3 of the time. If however we
3264 * always have at least 1 runnable task, the rq as a whole is always runnable.
3265 *
3266 * So we'll have to approximate.. :/
3267 *
3268 * Given the constraint:
3269 *
3270 *   ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
3271 *
3272 * We can construct a rule that adds runnable to a rq by assuming minimal
3273 * overlap.
3274 *
3275 * On removal, we'll assume each task is equally runnable; which yields:
3276 *
3277 *   grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
3278 *
3279 * XXX: only do this for the part of runnable > running ?
3280 *
3281 */
3282
3283static inline void
3284update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3285{
3286        long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3287
3288        /* Nothing to update */
3289        if (!delta)
3290                return;
3291
3292        /*
3293         * The relation between sum and avg is:
3294         *
3295         *   sum = avg * (LOAD_AVG_MAX - 1024 + sa->period_contrib)
3296         *
3297         * however, the PELT windows are not aligned between grq and gse.
3298         */
3299
3300        /* Set new sched_entity's utilization */
3301        se->avg.util_avg = gcfs_rq->avg.util_avg;
3302        se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3303
3304        /* Update parent cfs_rq utilization */
3305        add_positive(&cfs_rq->avg.util_avg, delta);
3306        cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3307}
3308
3309static inline void
3310update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3311{
3312        long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3313        unsigned long runnable_load_avg, load_avg;
3314        u64 runnable_load_sum, load_sum = 0;
3315        s64 delta_sum;
3316
3317        if (!runnable_sum)
3318                return;
3319
3320        gcfs_rq->prop_runnable_sum = 0;
3321
3322        if (runnable_sum >= 0) {
3323                /*
3324                 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
3325                 * the CPU is saturated running == runnable.
3326                 */
3327                runnable_sum += se->avg.load_sum;
3328                runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3329        } else {
3330                /*
3331                 * Estimate the new unweighted runnable_sum of the gcfs_rq by
3332                 * assuming all tasks are equally runnable.
3333                 */
3334                if (scale_load_down(gcfs_rq->load.weight)) {
3335                        load_sum = div_s64(gcfs_rq->avg.load_sum,
3336                                scale_load_down(gcfs_rq->load.weight));
3337                }
3338
3339                /* But make sure to not inflate se's runnable */
3340                runnable_sum = min(se->avg.load_sum, load_sum);
3341        }
3342
3343        /*
3344         * runnable_sum can't be lower than running_sum.
3345         * Rescale running_sum to be in the same range as runnable_sum:
3346         * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT],
3347         * runnable_sum is in [0 : LOAD_AVG_MAX].
3348         */
3349        running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
3350        runnable_sum = max(runnable_sum, running_sum);
3351
3352        load_sum = (s64)se_weight(se) * runnable_sum;
3353        load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3354
3355        delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
3356        delta_avg = load_avg - se->avg.load_avg;
3357
3358        se->avg.load_sum = runnable_sum;
3359        se->avg.load_avg = load_avg;
3360        add_positive(&cfs_rq->avg.load_avg, delta_avg);
3361        add_positive(&cfs_rq->avg.load_sum, delta_sum);
3362
3363        runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3364        runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3365        delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3366        delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3367
3368        se->avg.runnable_load_sum = runnable_sum;
3369        se->avg.runnable_load_avg = runnable_load_avg;
3370
3371        if (se->on_rq) {
3372                add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3373                add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3374        }
3375}
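/*
 * A standalone illustration of the 'equally runnable' estimate used on the
 * removal path above: the group rq's weighted load_sum is divided by its
 * weight to recover an unweighted runnable_sum, then clipped by the entity's
 * own load_sum so the estimate can never inflate it. Illustrative only.
 */
static long long example_estimate_runnable_sum(long long grq_load_sum,
					       long grq_weight,
					       long long se_load_sum)
{
	long long runnable_sum = 0;

	if (grq_weight)
		runnable_sum = grq_load_sum / grq_weight;

	/*
	 * E.g. grq_load_sum=47104, grq_weight=2048 -> 23; if the entity's
	 * own load_sum is only 20, we keep 20.
	 */
	return runnable_sum < se_load_sum ? runnable_sum : se_load_sum;
}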
3376
3377static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3378{
3379        cfs_rq->propagate = 1;
3380        cfs_rq->prop_runnable_sum += runnable_sum;
3381}
3382
3383/* Update task and its cfs_rq load average */
3384static inline int propagate_entity_load_avg(struct sched_entity *se)
3385{
3386        struct cfs_rq *cfs_rq, *gcfs_rq;
3387
3388        if (entity_is_task(se))
3389                return 0;
3390
3391        gcfs_rq = group_cfs_rq(se);
3392        if (!gcfs_rq->propagate)
3393                return 0;
3394
3395        gcfs_rq->propagate = 0;
3396
3397        cfs_rq = cfs_rq_of(se);
3398
3399        add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
3400
3401        update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3402        update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3403
3404        trace_pelt_cfs_tp(cfs_rq);
3405        trace_pelt_se_tp(se);
3406
3407        return 1;
3408}
3409
3410/*
3411 * Check if we need to update the load and the utilization of a blocked
3412 * group_entity:
3413 */
3414static inline bool skip_blocked_update(struct sched_entity *se)
3415{
3416        struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3417
3418        /*
3419         * If the sched_entity still has non-zero load or utilization, we have
3420         * to decay it:
3421         */
3422        if (se->avg.load_avg || se->avg.util_avg)
3423                return false;
3424
3425        /*
3426         * If there is a pending propagation, we have to update the load and
3427         * the utilization of the sched_entity:
3428         */
3429        if (gcfs_rq->propagate)
3430                return false;
3431
3432        /*
3433         * Otherwise, the load and the utilization of the sched_entity are
3434         * already zero and there is no pending propagation, so it will be a
3435         * waste of time to try to decay it:
3436         */
3437        return true;
3438}
3439
3440#else /* CONFIG_FAIR_GROUP_SCHED */
3441
3442static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3443
3444static inline int propagate_entity_load_avg(struct sched_entity *se)
3445{
3446        return 0;
3447}
3448
3449static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
3450
3451#endif /* CONFIG_FAIR_GROUP_SCHED */
3452
3453/**
3454 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
3455 * @now: current time, as per cfs_rq_clock_pelt()
3456 * @cfs_rq: cfs_rq to update
3457 *
3458 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3459 * avg. The immediate corollary is that all (fair) tasks must be attached, see
3460 * post_init_entity_util_avg().
3461 *
3462 * cfs_rq->avg is used for task_h_load() and update_cfs_group() for example.
3463 *
3464 * Returns true if the load decayed or we removed load.
3465 *
3466 * Since both these conditions indicate a changed cfs_rq->avg.load we should
3467 * call update_tg_load_avg() when this function returns true.
3468 */
3469static inline int
3470update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3471{
3472        unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
3473        struct sched_avg *sa = &cfs_rq->avg;
3474        int decayed = 0;
3475
3476        if (cfs_rq->removed.nr) {
3477                unsigned long r;
3478                u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3479
3480                raw_spin_lock(&cfs_rq->removed.lock);
3481                swap(cfs_rq->removed.util_avg, removed_util);
3482                swap(cfs_rq->removed.load_avg, removed_load);
3483                swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
3484                cfs_rq->removed.nr = 0;
3485                raw_spin_unlock(&cfs_rq->removed.lock);
3486
3487                r = removed_load;
3488                sub_positive(&sa->load_avg, r);
3489                sub_positive(&sa->load_sum, r * divider);
3490
3491                r = removed_util;
3492                sub_positive(&sa->util_avg, r);
3493                sub_positive(&sa->util_sum, r * divider);
3494
3495                add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
3496
3497                decayed = 1;
3498        }
3499
3500        decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
3501
3502#ifndef CONFIG_64BIT
3503        smp_wmb();
3504        cfs_rq->load_last_update_time_copy = sa->last_update_time;
3505#endif
3506
3507        if (decayed)
3508                cfs_rq_util_change(cfs_rq, 0);
3509
3510        return decayed;
3511}
3512
3513/**
3514 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
3515 * @cfs_rq: cfs_rq to attach to
3516 * @se: sched_entity to attach
3517 * @flags: migration hints
3518 *
3519 * Must call update_cfs_rq_load_avg() before this, since we rely on
3520 * cfs_rq->avg.last_update_time being current.
3521 */
3522static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3523{
3524        u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3525
3526        /*
3527         * When we attach the @se to the @cfs_rq, we must align the decay
3528         * window because without that, really weird and wonderful things can
3529         * happen.
3530         *
3531         * XXX illustrate
3532         */
3533        se->avg.last_update_time = cfs_rq->avg.last_update_time;
3534        se->avg.period_contrib = cfs_rq->avg.period_contrib;
3535
3536        /*
3537         * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
3538         * period_contrib. This isn't strictly correct, but since we're
3539         * entirely outside of the PELT hierarchy, nobody cares if we truncate
3540         * _sum a little.
3541         */
3542        se->avg.util_sum = se->avg.util_avg * divider;
3543
3544        se->avg.load_sum = divider;
3545        if (se_weight(se)) {
3546                se->avg.load_sum =
3547                        div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3548        }
3549
3550        se->avg.runnable_load_sum = se->avg.load_sum;
3551
3552        enqueue_load_avg(cfs_rq, se);
3553        cfs_rq->avg.util_avg += se->avg.util_avg;
3554        cfs_rq->avg.util_sum += se->avg.util_sum;
3555
3556        add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3557
3558        cfs_rq_util_change(cfs_rq, flags);
3559
3560        trace_pelt_cfs_tp(cfs_rq);
3561}
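/*
 * A small standalone sketch of the 'rebuild _sum from _avg' step above: once
 * the entity adopts the cfs_rq's period_contrib, its sums are recomputed as
 * avg * divider with divider = LOAD_AVG_MAX - 1024 + period_contrib. The
 * LOAD_AVG_MAX value of 47742 is assumed from the PELT code; the rest is
 * plain C for illustration.
 */
#define EXAMPLE_LOAD_AVG_MAX	47742

static unsigned long long example_rebuild_sum(unsigned long avg,
					      unsigned int period_contrib)
{
	unsigned int divider = EXAMPLE_LOAD_AVG_MAX - 1024 + period_contrib;

	/* e.g. avg=100, period_contrib=512 -> divider=47230, sum=4723000 */
	return (unsigned long long)avg * divider;
}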
3562
3563/**
3564 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3565 * @cfs_rq: cfs_rq to detach from
3566 * @se: sched_entity to detach
3567 *
3568 * Must call update_cfs_rq_load_avg() before this, since we rely on
3569 * cfs_rq->avg.last_update_time being current.
3570 */
3571static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3572{
3573        dequeue_load_avg(cfs_rq, se);
3574        sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3575        sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3576
3577        add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3578
3579        cfs_rq_util_change(cfs_rq, 0);
3580
3581        trace_pelt_cfs_tp(cfs_rq);
3582}
3583
3584/*
3585 * Optional action to be done while updating the load average
3586 */
3587#define UPDATE_TG       0x1
3588#define SKIP_AGE_LOAD   0x2
3589#define DO_ATTACH       0x4
3590
3591/* Update task and its cfs_rq load average */
3592static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3593{
3594        u64 now = cfs_rq_clock_pelt(cfs_rq);
3595        int decayed;
3596
3597        /*
3598         * Track the task load average for carrying it to its new CPU after it
3599         * migrates, and group sched_entity load average for task_h_load() in migration.
3600         */
3601        if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3602                __update_load_avg_se(now, cfs_rq, se);
3603
3604        decayed  = update_cfs_rq_load_avg(now, cfs_rq);
3605        decayed |= propagate_entity_load_avg(se);
3606
3607        if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3608
3609                /*
3610                 * DO_ATTACH means we're here from enqueue_entity().
3611                 * !last_update_time means we've passed through
3612                 * migrate_task_rq_fair() indicating we migrated.
3613                 *
3614                 * IOW we're enqueueing a task on a new CPU.
3615                 */
3616                attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
3617                update_tg_load_avg(cfs_rq, 0);
3618
3619        } else if (decayed && (flags & UPDATE_TG))
3620                update_tg_load_avg(cfs_rq, 0);
3621}
3622
3623#ifndef CONFIG_64BIT
3624static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3625{
3626        u64 last_update_time_copy;
3627        u64 last_update_time;
3628
3629        do {
3630                last_update_time_copy = cfs_rq->load_last_update_time_copy;
3631                smp_rmb();
3632                last_update_time = cfs_rq->avg.last_update_time;
3633        } while (last_update_time != last_update_time_copy);
3634
3635        return last_update_time;
3636}
3637#else
3638static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3639{
3640        return cfs_rq->avg.last_update_time;
3641}
3642#endif
3643
3644/*
3645 * Synchronize entity load avg of dequeued entity without locking
3646 * the previous rq.
3647 */
3648static void sync_entity_load_avg(struct sched_entity *se)
3649{
3650        struct cfs_rq *cfs_rq = cfs_rq_of(se);
3651        u64 last_update_time;
3652
3653        last_update_time = cfs_rq_last_update_time(cfs_rq);
3654        __update_load_avg_blocked_se(last_update_time, se);
3655}
3656
3657/*
3658 * Task first catches up with cfs_rq, and then subtracts
3659 * itself from the cfs_rq (the task must be off the queue now).
3660 */
3661static void remove_entity_load_avg(struct sched_entity *se)
3662{
3663        struct cfs_rq *cfs_rq = cfs_rq_of(se);
3664        unsigned long flags;
3665
3666        /*
3667         * Tasks cannot exit without having gone through wake_up_new_task() ->
3668         * post_init_entity_util_avg() which will have added things to the
3669         * cfs_rq, so we can remove unconditionally.
3670         */
3671
3672        sync_entity_load_avg(se);
3673
3674        raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3675        ++cfs_rq->removed.nr;
3676        cfs_rq->removed.util_avg        += se->avg.util_avg;
3677        cfs_rq->removed.load_avg        += se->avg.load_avg;
3678        cfs_rq->removed.runnable_sum    += se->avg.load_sum; /* == runnable_sum */
3679        raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
3680}
3681
3682static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3683{
3684        return cfs_rq->avg.runnable_load_avg;
3685}
3686
3687static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3688{
3689        return cfs_rq->avg.load_avg;
3690}
3691
3692static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
3693
3694static inline unsigned long task_util(struct task_struct *p)
3695{
3696        return READ_ONCE(p->se.avg.util_avg);
3697}
3698
3699static inline unsigned long _task_util_est(struct task_struct *p)
3700{
3701        struct util_est ue = READ_ONCE(p->se.avg.util_est);
3702
3703        return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
3704}
3705
3706static inline unsigned long task_util_est(struct task_struct *p)
3707{
3708        return max(task_util(p), _task_util_est(p));
3709}
3710
3711static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3712                                    struct task_struct *p)
3713{
3714        unsigned int enqueued;
3715
3716        if (!sched_feat(UTIL_EST))
3717                return;
3718
3719        /* Update root cfs_rq's estimated utilization */
3720        enqueued  = cfs_rq->avg.util_est.enqueued;
3721        enqueued += _task_util_est(p);
3722        WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3723}
3724
3725/*
3726 * Check if a (signed) value is within a specified (unsigned) margin,
3727 * based on the observation that:
3728 *
3729 *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
3730 *
3731 * NOTE: this only works when value + margin < INT_MAX.
3732 */
3733static inline bool within_margin(int value, int margin)
3734{
3735        return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
3736}
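/*
 * A quick standalone check of the unsigned-compare trick above: for a given
 * margin, (unsigned)(x + margin - 1) < (2 * margin - 1) holds exactly when
 * -margin < x < margin, provided x + margin stays below INT_MAX.
 */
#include <assert.h>

static void example_within_margin_check(void)
{
	int margin = 10;
	int x;

	for (x = -20; x <= 20; x++) {
		int trick = (unsigned int)(x + margin - 1) < (2 * margin - 1);
		int plain = (x > -margin && x < margin);

		assert(trick == plain);	/* both agree on -9..9 for margin=10 */
	}
}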
3737
3738static void
3739util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
3740{
3741        long last_ewma_diff;
3742        struct util_est ue;
3743        int cpu;
3744
3745        if (!sched_feat(UTIL_EST))
3746                return;
3747
3748        /* Update root cfs_rq's estimated utilization */
3749        ue.enqueued  = cfs_rq->avg.util_est.enqueued;
3750        ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
3751        WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3752
3753        /*
3754         * Skip update of task's estimated utilization when the task has not
3755         * yet completed an activation, e.g. being migrated.
3756         */
3757        if (!task_sleep)
3758                return;
3759
3760        /*
3761         * If the PELT values haven't changed since enqueue time,
3762         * skip the util_est update.
3763         */
3764        ue = p->se.avg.util_est;
3765        if (ue.enqueued & UTIL_AVG_UNCHANGED)
3766                return;
3767
3768        /*
3769         * Skip update of the task's estimated utilization when its EWMA is
3770         * already within ~1% of its last activation value.
3771         */
3772        ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
3773        last_ewma_diff = ue.enqueued - ue.ewma;
3774        if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
3775                return;
3776
3777        /*
3778         * To avoid overestimating the actual task utilization, skip the update
3779         * if we cannot guarantee there is idle time on this CPU.
3780         */
3781        cpu = cpu_of(rq_of(cfs_rq));
3782        if (task_util(p) > capacity_orig_of(cpu))
3783                return;
3784
3785        /*
3786         * Update Task's estimated utilization
3787         *
3788         * When *p completes an activation we can consolidate another sample
3789         * of the task size. This is done by storing the current PELT value
3790         * as ue.enqueued and by using this value to update the Exponential
3791         * Weighted Moving Average (EWMA):
3792         *
3793         *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
3794         *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
3795         *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
3796         *          = w * (      last_ewma_diff            ) +     ewma(t-1)
3797         *          = w * (last_ewma_diff  +  ewma(t-1) / w)
3798         *
3799         * Where 'w' is the weight of new samples, which is configured to be
3800         * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
3801         */
3802        ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
3803        ue.ewma  += last_ewma_diff;
3804        ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
3805        WRITE_ONCE(p->se.avg.util_est, ue);
3806}
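/*
 * A standalone sketch of the shift-based EWMA update above with w = 1/4:
 * scale the old EWMA up, add the difference to the new sample, scale back
 * down. UTIL_EST_WEIGHT_SHIFT is assumed to be 2 here; plain C, for
 * illustration only.
 */
#define EXAMPLE_UTIL_EST_WEIGHT_SHIFT	2

static unsigned int example_util_est_ewma(unsigned int ewma,
					  unsigned int sample)
{
	int last_ewma_diff = (int)sample - (int)ewma;

	ewma <<= EXAMPLE_UTIL_EST_WEIGHT_SHIFT;	/* ewma * 4          */
	ewma  += last_ewma_diff;		/* + (sample - ewma) */
	ewma >>= EXAMPLE_UTIL_EST_WEIGHT_SHIFT;	/* / 4               */

	/* e.g. ewma=100, sample=200 -> (400 + 100) / 4 = 125 */
	return ewma;
}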
3807
3808static inline int task_fits_capacity(struct task_struct *p, long capacity)
3809{
3810        return capacity * 1024 > task_util_est(p) * capacity_margin;
3811}
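/*
 * A standalone sketch of the fits-capacity test: a task fits a CPU when its
 * estimated utilization, inflated by the capacity margin, is below the CPU's
 * capacity. The margin value of 1280 (~20% headroom) is assumed here to
 * match the default capacity_margin; illustrative only.
 */
static int example_task_fits(unsigned long capacity, unsigned long util_est)
{
	const unsigned long margin = 1280;	/* assumed default */

	/*
	 * E.g. capacity=1024: util_est=800 fits (1048576 > 1024000), while
	 * util_est=850 does not (1048576 <= 1088000) and the task is
	 * flagged as a misfit.
	 */
	return capacity * 1024 > util_est * margin;
}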
3812
3813static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
3814{
3815        if (!static_branch_unlikely(&sched_asym_cpucapacity))
3816                return;
3817
3818        if (!p) {
3819                rq->misfit_task_load = 0;
3820                return;
3821        }
3822
3823        if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
3824                rq->misfit_task_load = 0;
3825                return;
3826        }
3827
3828        rq->misfit_task_load = task_h_load(p);
3829}
3830
3831#else /* CONFIG_SMP */
3832
3833#define UPDATE_TG       0x0
3834#define SKIP_AGE_LOAD   0x0
3835#define DO_ATTACH       0x0
3836
3837static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
3838{
3839        cfs_rq_util_change(cfs_rq, 0);
3840}
3841
3842static inline void remove_entity_load_avg(struct sched_entity *se) {}
3843
3844static inline void
3845attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
3846static inline void
3847detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3848
3849static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
3850{
3851        return 0;
3852}
3853
3854static inline void
3855util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
3856
3857static inline void
3858util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
3859                 bool task_sleep) {}
3860static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
3861
3862#endif /* CONFIG_SMP */
3863
3864static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3865{
3866#ifdef CONFIG_SCHED_DEBUG
3867        s64 d = se->vruntime - cfs_rq->min_vruntime;
3868
3869        if (d < 0)
3870                d = -d;
3871
3872        if (d > 3*sysctl_sched_latency)
3873                schedstat_inc(cfs_rq->nr_spread_over);
3874#endif
3875}
3876
3877static void
3878place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3879{
3880        u64 vruntime = cfs_rq->min_vruntime;
3881
3882        /*
3883         * The 'current' period is already promised to the current tasks,
3884         * however the extra weight of the new task will slow them down a
3885         * little; place the new task so that it fits into the slot that
3886         * stays open at the end.
3887         */
3888        if (initial && sched_feat(START_DEBIT))
3889                vruntime += sched_vslice(cfs_rq, se);
3890
3891        /* sleeps up to a single latency don't count. */
3892        if (!initial) {
3893                unsigned long thresh = sysctl_sched_latency;
3894
3895                /*
3896                 * Halve their sleep time's effect, to allow
3897                 * for a gentler effect of sleepers:
3898                 */
3899                if (sched_feat(GENTLE_FAIR_SLEEPERS))
3900                        thresh >>= 1;
3901
3902                vruntime -= thresh;
3903        }
3904
3905        /* ensure we never gain time by being placed backwards. */
3906        se->vruntime = max_vruntime(se->vruntime, vruntime);
3907}
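/*
 * A standalone sketch of the placement policy above: new tasks are debited
 * one vslice past min_vruntime (START_DEBIT), wakers are credited up to one
 * latency period (halved under GENTLE_FAIR_SLEEPERS), and the entity is
 * never moved backwards in virtual time. u64 wrap handling, which the real
 * max_vruntime() takes care of, is ignored in this illustration.
 */
static unsigned long long example_place(unsigned long long min_vruntime,
					unsigned long long se_vruntime,
					unsigned long long vslice,
					unsigned long long latency,
					int initial, int gentle)
{
	unsigned long long vruntime = min_vruntime;

	if (initial)
		vruntime += vslice;				/* START_DEBIT    */
	else
		vruntime -= gentle ? latency / 2 : latency;	/* sleeper credit */

	/* never gain time by being placed backwards */
	return se_vruntime > vruntime ? se_vruntime : vruntime;
}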
3908
3909static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3910
3911static inline void check_schedstat_required(void)
3912{
3913#ifdef CONFIG_SCHEDSTATS
3914        if (schedstat_enabled())
3915                return;
3916
3917        /* Force schedstat enabled if a dependent tracepoint is active */
3918        if (trace_sched_stat_wait_enabled()    ||
3919                        trace_sched_stat_sleep_enabled()   ||
3920                        trace_sched_stat_iowait_enabled()  ||
3921                        trace_sched_stat_blocked_enabled() ||
3922                        trace_sched_stat_runtime_enabled())  {
3923                printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3924                             "stat_blocked and stat_runtime require the "
3925                             "kernel parameter schedstats=enable or "
3926                             "kernel.sched_schedstats=1\n");
3927        }
3928#endif
3929}
3930
3931
3932/*
3933 * MIGRATION
3934 *
3935 *      dequeue
3936 *        update_curr()
3937 *          update_min_vruntime()
3938 *        vruntime -= min_vruntime
3939 *
3940 *      enqueue
3941 *        update_curr()
3942 *          update_min_vruntime()
3943 *        vruntime += min_vruntime
3944 *
3945 * this way the vruntime transition between RQs is done when both
3946 * min_vruntime are up-to-date.
3947 *
3948 * WAKEUP (remote)
3949 *
3950 *      ->migrate_task_rq_fair() (p->state == TASK_WAKING)
3951 *        vruntime -= min_vruntime
3952 *
3953 *      enqueue
3954 *        update_curr()
3955 *          update_min_vruntime()
3956 *        vruntime += min_vruntime
3957 *
3958 * this way we don't need the most up-to-date min_vruntime on the originating
3959 * CPU, only an up-to-date min_vruntime on the destination CPU.
3960 */
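/*
 * A standalone sketch of the renormalisation described above: vruntime is
 * made rq-relative on dequeue/migration and made absolute again on enqueue,
 * so a migrating task carries only its lag, not the absolute virtual clock
 * of its old CPU. Illustrative values only.
 */
static unsigned long long example_migrate_vruntime(unsigned long long vruntime,
						   unsigned long long src_min,
						   unsigned long long dst_min)
{
	unsigned long long lag = vruntime - src_min;	/* dequeue side */

	/*
	 * E.g. vruntime=1000500, src_min=1000000, dst_min=2000000 -> the
	 * task is enqueued at 2000500 on the new CPU.
	 */
	return dst_min + lag;				/* enqueue side */
}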
3961
3962static void
3963enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3964{
3965        bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
3966        bool curr = cfs_rq->curr == se;
3967
3968        /*
3969         * If we're the current task, we must renormalise before calling
3970         * update_curr().
3971         */
3972        if (renorm && curr)
3973                se->vruntime += cfs_rq->min_vruntime;
3974
3975        update_curr(cfs_rq);
3976
3977        /*
3978         * Otherwise, renormalise after, such that we're placed at the current
3979         * moment in time, instead of some random moment in the past. Being
3980         * placed in the past could significantly boost this task to the
3981         * fairness detriment of existing tasks.
3982         */
3983        if (renorm && !curr)
3984                se->vruntime += cfs_rq->min_vruntime;
3985
3986        /*
3987         * When enqueuing a sched_entity, we must:
3988         *   - Update loads to have both entity and cfs_rq synced with now.
3989         *   - Add its load to cfs_rq->runnable_avg
3990         *   - For group_entity, update its weight to reflect the new share of
3991         *     its group cfs_rq
3992         *   - Add its new weight to cfs_rq->load.weight
3993         */
3994        update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
3995        update_cfs_group(se);
3996        enqueue_runnable_load_avg(cfs_rq, se);
3997        account_entity_enqueue(cfs_rq, se);
3998
3999        if (flags & ENQUEUE_WAKEUP)
4000                place_entity(cfs_rq, se, 0);
4001
4002        check_schedstat_required();
4003        update_stats_enqueue(cfs_rq, se, flags);
4004        check_spread(cfs_rq, se);
4005        if (!curr)
4006                __enqueue_entity(cfs_rq, se);
4007        se->on_rq = 1;
4008
4009        if (cfs_rq->nr_running == 1) {
4010                list_add_leaf_cfs_rq(cfs_rq);
4011                check_enqueue_throttle(cfs_rq);
4012        }
4013}
4014
4015static void __clear_buddies_last(struct sched_entity *se)
4016{
4017        for_each_sched_entity(se) {
4018                struct cfs_rq *cfs_rq = cfs_rq_of(se);
4019                if (cfs_rq->last != se)
4020                        break;
4021
4022                cfs_rq->last = NULL;
4023        }
4024}
4025
4026static void __clear_buddies_next(struct sched_entity *se)
4027{
4028        for_each_sched_entity(se) {
4029                struct cfs_rq *cfs_rq = cfs_rq_of(se);
4030                if (cfs_rq->next != se)
4031                        break;
4032
4033                cfs_rq->next = NULL;
4034        }
4035}
4036
4037static void __clear_buddies_skip(struct sched_entity *se)
4038{
4039        for_each_sched_entity(se) {
4040                struct cfs_rq *cfs_rq = cfs_rq_of(se);
4041                if (cfs_rq->skip != se)
4042                        break;
4043
4044                cfs_rq->skip = NULL;
4045        }
4046}
4047
4048static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4049{
4050        if (cfs_rq->last == se)
4051                __clear_buddies_last(se);
4052
4053        if (cfs_rq->next == se)
4054                __clear_buddies_next(se);
4055
4056        if (cfs_rq->skip == se)
4057                __clear_buddies_skip(se);
4058}
4059
4060static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4061
4062static void
4063dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4064{
4065        /*
4066         * Update run-time statistics of the 'current'.
4067         */
4068        update_curr(cfs_rq);
4069
4070        /*
4071         * When dequeuing a sched_entity, we must:
4072         *   - Update loads to have both entity and cfs_rq synced with now.
4073         *   - Subtract its load from the cfs_rq->runnable_avg.
4074         *   - Subtract its previous weight from cfs_rq->load.weight.
4075         *   - For group entity, update its weight to reflect the new share
4076         *     of its group cfs_rq.
4077         */
4078        update_load_avg(cfs_rq, se, UPDATE_TG);
4079        dequeue_runnable_load_avg(cfs_rq, se);
4080
4081        update_stats_dequeue(cfs_rq, se, flags);
4082
4083        clear_buddies(cfs_rq, se);
4084
4085        if (se != cfs_rq->curr)
4086                __dequeue_entity(cfs_rq, se);
4087        se->on_rq = 0;
4088        account_entity_dequeue(cfs_rq, se);
4089
4090        /*
4091         * Normalize after update_curr(); which will also have moved
4092         * min_vruntime if @se is the one holding it back. But before doing
4093         * update_min_vruntime() again, which will discount @se's position and
4094         * can move min_vruntime forward still more.
4095         */
4096        if (!(flags & DEQUEUE_SLEEP))
4097                se->vruntime -= cfs_rq->min_vruntime;
4098
4099        /* return excess runtime on last dequeue */
4100        return_cfs_rq_runtime(cfs_rq);
4101
4102        update_cfs_group(se);
4103
4104        /*
4105         * Now advance min_vruntime if @se was the entity holding it back,
4106         * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
4107         * put back on, and if we advance min_vruntime, we'll be placed back
4108         * further than we started -- ie. we'll be penalized.
4109         */
4110        if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
4111                update_min_vruntime(cfs_rq);
4112}
4113
4114/*
4115 * Preempt the current task with a newly woken task if needed:
4116 */
4117static void
4118check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4119{
4120        unsigned long ideal_runtime, delta_exec;
4121        struct sched_entity *se;
4122        s64 delta;
4123
4124        ideal_runtime = sched_slice(cfs_rq, curr);
4125        delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4126        if (delta_exec > ideal_runtime) {
4127                resched_curr(rq_of(cfs_rq));
4128                /*
4129                 * The current task ran long enough, ensure it doesn't get
4130                 * re-elected due to buddy favours.
4131                 */
4132                clear_buddies(cfs_rq, curr);
4133                return;
4134        }
4135
4136        /*
4137         * Ensure that a task that missed wakeup preemption by a
4138         * narrow margin doesn't have to wait for a full slice.
4139         * This also mitigates buddy induced latencies under load.
4140         */
4141        if (delta_exec < sysctl_sched_min_granularity)
4142                return;
4143
4144        se = __pick_first_entity(cfs_rq);
4145        delta = curr->vruntime - se->vruntime;
4146
4147        if (delta < 0)
4148                return;
4149
4150        if (delta > ideal_runtime)
4151                resched_curr(rq_of(cfs_rq));
4152}
4153
4154static void
4155set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4156{
4157        /* 'current' is not kept within the tree. */
4158        if (se->on_rq) {
4159                /*
4160                 * Any task has to be enqueued before it gets to execute on
4161                 * a CPU. So account for the time it spent waiting on the
4162                 * runqueue.
4163                 */
4164                update_stats_wait_end(cfs_rq, se);
4165                __dequeue_entity(cfs_rq, se);
4166                update_load_avg(cfs_rq, se, UPDATE_TG);
4167        }
4168
4169        update_stats_curr_start(cfs_rq, se);
4170        cfs_rq->curr = se;
4171
4172        /*
4173         * Track our maximum slice length, if the CPU's load is at
4174         * least twice that of our own weight (i.e. don't track it
4175         * when there are only lesser-weight tasks around):
4176         */
4177        if (schedstat_enabled() &&
4178            rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
4179                schedstat_set(se->statistics.slice_max,
4180                        max((u64)schedstat_val(se->statistics.slice_max),
4181                            se->sum_exec_runtime - se->prev_sum_exec_runtime));
4182        }
4183
4184        se->prev_sum_exec_runtime = se->sum_exec_runtime;
4185}
4186
4187static int
4188wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4189
4190/*
4191 * Pick the next process, keeping these things in mind, in this order:
4192 * 1) keep things fair between processes/task groups
4193 * 2) pick the "next" process, since someone really wants that to run
4194 * 3) pick the "last" process, for cache locality
4195 * 4) do not run the "skip" process, if something else is available
4196 */
4197static struct sched_entity *
4198pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4199{
4200        struct sched_entity *left = __pick_first_entity(cfs_rq);
4201        struct sched_entity *se;
4202
4203        /*
4204         * If curr is set we have to see if its left of the leftmost entity
4205         * still in the tree, provided there was anything in the tree at all.
4206         */
4207        if (!left || (curr && entity_before(curr, left)))
4208                left = curr;
4209
4210        se = left; /* ideally we run the leftmost entity */
4211
4212        /*
4213         * Avoid running the skip buddy, if running something else can
4214         * be done without getting too unfair.
4215         */
4216        if (cfs_rq->skip == se) {
4217                struct sched_entity *second;
4218
4219                if (se == curr) {
4220                        second = __pick_first_entity(cfs_rq);
4221                } else {
4222                        second = __pick_next_entity(se);
4223                        if (!second || (curr && entity_before(curr, second)))
4224                                second = curr;
4225                }
4226
4227                if (second && wakeup_preempt_entity(second, left) < 1)
4228                        se = second;
4229        }
4230
4231        /*
4232         * Prefer last buddy, try to return the CPU to a preempted task.
4233         */
4234        if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4235                se = cfs_rq->last;
4236
4237        /*
4238         * Someone really wants this to run. If it's not unfair, run it.
4239         */
4240        if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4241                se = cfs_rq->next;
4242
4243        clear_buddies(cfs_rq, se);
4244
4245        return se;
4246}
4247
4248static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4249
4250static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4251{
4252        /*
4253         * If still on the runqueue then deactivate_task()
4254         * was not called and update_curr() has to be done:
4255         */
4256        if (prev->on_rq)
4257                update_curr(cfs_rq);
4258
4259        /* throttle cfs_rqs exceeding runtime */
4260        check_cfs_rq_runtime(cfs_rq);
4261
4262        check_spread(cfs_rq, prev);
4263
4264        if (prev->on_rq) {
4265                update_stats_wait_start(cfs_rq, prev);
4266                /* Put 'current' back into the tree. */
4267                __enqueue_entity(cfs_rq, prev);
4268                /* in !on_rq case, update occurred at dequeue */
4269                update_load_avg(cfs_rq, prev, 0);
4270        }
4271        cfs_rq->curr = NULL;
4272}
4273
4274static void
4275entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4276{
4277        /*
4278         * Update run-time statistics of the 'current'.
4279         */
4280        update_curr(cfs_rq);
4281
4282        /*
4283         * Ensure that runnable average is periodically updated.
4284         */
4285        update_load_avg(cfs_rq, curr, UPDATE_TG);
4286        update_cfs_group(curr);
4287
4288#ifdef CONFIG_SCHED_HRTICK
4289        /*
4290         * queued ticks are scheduled to match the slice, so don't bother
4291         * validating it and just reschedule.
4292         */
4293        if (queued) {
4294                resched_curr(rq_of(cfs_rq));
4295                return;
4296        }
4297        /*
4298         * don't let the period tick interfere with the hrtick preemption
4299         */
4300        if (!sched_feat(DOUBLE_TICK) &&
4301                        hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4302                return;
4303#endif
4304
4305        if (cfs_rq->nr_running > 1)
4306                check_preempt_tick(cfs_rq, curr);
4307}
4308
4309
4310/**************************************************
4311 * CFS bandwidth control machinery
4312 */
4313
4314#ifdef CONFIG_CFS_BANDWIDTH
4315
4316#ifdef CONFIG_JUMP_LABEL
4317static struct static_key __cfs_bandwidth_used;
4318
4319static inline bool cfs_bandwidth_used(void)
4320{
4321        return static_key_false(&__cfs_bandwidth_used);
4322}
4323
4324void cfs_bandwidth_usage_inc(void)
4325{
4326        static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4327}
4328
4329void cfs_bandwidth_usage_dec(void)
4330{
4331        static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4332}
4333#else /* CONFIG_JUMP_LABEL */
4334static bool cfs_bandwidth_used(void)
4335{
4336        return true;
4337}
4338
4339void cfs_bandwidth_usage_inc(void) {}
4340void cfs_bandwidth_usage_dec(void) {}
4341#endif /* CONFIG_JUMP_LABEL */
4342
4343/*
4344 * default period for cfs group bandwidth.
4345 * default: 0.1s, units: nanoseconds
4346 */
4347static inline u64 default_cfs_period(void)
4348{
4349        return 100000000ULL;
4350}
4351
4352static inline u64 sched_cfs_bandwidth_slice(void)
4353{
4354        return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4355}
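/*
 * A standalone sketch of how quota, period and slice relate: each cfs_rq
 * pulls runtime from the group's pool in slice-sized chunks (the sysctl is
 * in microseconds; 5000us is assumed as the default here), and the pool is
 * refilled to 'quota' once per 'period'. Illustrative only.
 */
static unsigned int example_slices_per_period(unsigned long long quota_ns,
					      unsigned int slice_us)
{
	unsigned long long slice_ns = (unsigned long long)slice_us * 1000ULL;

	/* e.g. quota=20ms per 100ms period, slice=5ms -> at most 4 grabs */
	return slice_ns ? (unsigned int)(quota_ns / slice_ns) : 0;
}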
4356
4357/*
4358 * Replenish runtime according to assigned quota and update expiration time.
4359 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
4360 * additional synchronization around rq->lock.
4361 *
4362 * requires cfs_b->lock
4363 */
4364void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4365{
4366        u64 now;
4367
4368        if (cfs_b->quota == RUNTIME_INF)
4369                return;
4370
4371        now = sched_clock_cpu(smp_processor_id());
4372        cfs_b->runtime = cfs_b->quota;
4373        cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
4374        cfs_b->expires_seq++;
4375}
4376
4377static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4378{
4379        return &tg->cfs_bandwidth;
4380}
4381
4382/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
4383static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4384{
4385        if (unlikely(cfs_rq->throttle_count))
4386                return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
4387
4388        return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
4389}
4390
4391/* returns 0 on failure to allocate runtime */
4392static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4393{
4394        struct task_group *tg = cfs_rq->tg;
4395        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4396        u64 amount = 0, min_amount, expires;
4397        int expires_seq;
4398
4399        /* note: this is a positive sum as runtime_remaining <= 0 */
4400        min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4401
4402        raw_spin_lock(&cfs_b->lock);
4403        if (cfs_b->quota == RUNTIME_INF)
4404                amount = min_amount;
4405        else {
4406                start_cfs_bandwidth(cfs_b);
4407
4408                if (cfs_b->runtime > 0) {
4409                        amount = min(cfs_b->runtime, min_amount);
4410                        cfs_b->runtime -= amount;
4411                        cfs_b->idle = 0;
4412                }
4413        }
4414        expires_seq = cfs_b->expires_seq;
4415        expires = cfs_b->runtime_expires;
4416        raw_spin_unlock(&cfs_b->lock);
4417
4418        cfs_rq->runtime_remaining += amount;
4419        /*
4420         * we may have advanced our local expiration to account for allowed
4421         * spread between our sched_clock and the one on which runtime was
4422         * issued.
4423         */
4424        if (cfs_rq->expires_seq != expires_seq) {
4425                cfs_rq->expires_seq = expires_seq;
4426                cfs_rq->runtime_expires = expires;
4427        }
4428
4429        return cfs_rq->runtime_remaining > 0;
4430}
4431
4432/*
4433 * Note: This depends on the synchronization provided by sched_clock and the
4434 * fact that rq->clock snapshots this value.
4435 */
4436static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4437{
4438        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4439
4440        /* if the deadline is ahead of our clock, nothing to do */
4441        if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
4442                return;
4443
4444        if (cfs_rq->runtime_remaining < 0)
4445                return;
4446
4447        /*
4448         * If the local deadline has passed we have to consider the
4449         * possibility that our sched_clock is 'fast' and the global deadline
4450         * has not truly expired.
4451         *
4452          * Fortunately we can determine whether this is the case by checking
4453          * whether the global deadline (cfs_b->expires_seq) has advanced.
4454         */
4455        if (cfs_rq->expires_seq == cfs_b->expires_seq) {
4456                /* extend local deadline, drift is bounded above by 2 ticks */
4457                cfs_rq->runtime_expires += TICK_NSEC;
4458        } else {
4459                /* global deadline is ahead, expiration has passed */
4460                cfs_rq->runtime_remaining = 0;
4461        }
4462}
4463
4464static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4465{
4466        /* dock delta_exec before expiring quota (as it could span periods) */
4467        cfs_rq->runtime_remaining -= delta_exec;
4468        expire_cfs_rq_runtime(cfs_rq);
4469
4470        if (likely(cfs_rq->runtime_remaining > 0))
4471                return;
4472
4473        if (cfs_rq->throttled)
4474                return;
4475        /*
4476         * if we're unable to extend our runtime we resched so that the active
4477         * hierarchy can be throttled
4478         */
4479        if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4480                resched_curr(rq_of(cfs_rq));
4481}
4482
4483static __always_inline
4484void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4485{
4486        if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
4487                return;
4488
4489        __account_cfs_rq_runtime(cfs_rq, delta_exec);
4490}
4491
4492static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4493{
4494        return cfs_bandwidth_used() && cfs_rq->throttled;
4495}
4496
4497/* check whether cfs_rq, or any parent, is throttled */
4498static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4499{
4500        return cfs_bandwidth_used() && cfs_rq->throttle_count;
4501}
4502
4503/*
4504 * Ensure that neither of the group entities corresponding to src_cpu or
4505 * dest_cpu are members of a throttled hierarchy when performing group
4506 * load-balance operations.
4507 */
4508static inline int throttled_lb_pair(struct task_group *tg,
4509                                    int src_cpu, int dest_cpu)
4510{
4511        struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4512
4513        src_cfs_rq = tg->cfs_rq[src_cpu];
4514        dest_cfs_rq = tg->cfs_rq[dest_cpu];
4515
4516        return throttled_hierarchy(src_cfs_rq) ||
4517               throttled_hierarchy(dest_cfs_rq);
4518}
4519
4520static int tg_unthrottle_up(struct task_group *tg, void *data)
4521{
4522        struct rq *rq = data;
4523        struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4524
4525        cfs_rq->throttle_count--;
4526        if (!cfs_rq->throttle_count) {
4527                /* adjust cfs_rq_clock_task() */
4528                cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4529                                             cfs_rq->throttled_clock_task;
4530
4531                /* Add cfs_rq with already running entity in the list */
4532                if (cfs_rq->nr_running >= 1)
4533                        list_add_leaf_cfs_rq(cfs_rq);
4534        }
4535
4536        return 0;
4537}
4538
4539static int tg_throttle_down(struct task_group *tg, void *data)
4540{
4541        struct rq *rq = data;
4542        struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4543
4544        /* group is entering throttled state, stop time */
4545        if (!cfs_rq->throttle_count) {
4546                cfs_rq->throttled_clock_task = rq_clock_task(rq);
4547                list_del_leaf_cfs_rq(cfs_rq);
4548        }
4549        cfs_rq->throttle_count++;
4550
4551        return 0;
4552}
4553
4554static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4555{
4556        struct rq *rq = rq_of(cfs_rq);
4557        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4558        struct sched_entity *se;
4559        long task_delta, dequeue = 1;
4560        bool empty;
4561
4562        se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4563
4564        /* freeze hierarchy runnable averages while throttled */
4565        rcu_read_lock();
4566        walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4567        rcu_read_unlock();
4568
4569        task_delta = cfs_rq->h_nr_running;
4570        for_each_sched_entity(se) {
4571                struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4572                /* throttled entity or throttle-on-deactivate */
4573                if (!se->on_rq)
4574                        break;
4575
4576                if (dequeue)
4577                        dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4578                qcfs_rq->h_nr_running -= task_delta;
4579
4580                if (qcfs_rq->load.weight)
4581                        dequeue = 0;
4582        }
4583
4584        if (!se)
4585                sub_nr_running(rq, task_delta);
4586
4587        cfs_rq->throttled = 1;
4588        cfs_rq->throttled_clock = rq_clock(rq);
4589        raw_spin_lock(&cfs_b->lock);
4590        empty = list_empty(&cfs_b->throttled_cfs_rq);
4591
4592        /*
4593         * Add to the _head_ of the list, so that an already-started
4594         * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
4595         * not running, add to the tail so that later runqueues don't get starved.
4596         */
4597        if (cfs_b->distribute_running)
4598                list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4599        else
4600                list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4601
4602        /*
4603         * If we're the first throttled task, make sure the bandwidth
4604         * timer is running.
4605         */
4606        if (empty)
4607                start_cfs_bandwidth(cfs_b);
4608
4609        raw_spin_unlock(&cfs_b->lock);
4610}
4611
4612void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4613{
4614        struct rq *rq = rq_of(cfs_rq);
4615        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4616        struct sched_entity *se;
4617        int enqueue = 1;
4618        long task_delta;
4619
4620        se = cfs_rq->tg->se[cpu_of(rq)];
4621
4622        cfs_rq->throttled = 0;
4623
4624        update_rq_clock(rq);
4625
4626        raw_spin_lock(&cfs_b->lock);
4627        cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
4628        list_del_rcu(&cfs_rq->throttled_list);
4629        raw_spin_unlock(&cfs_b->lock);
4630
4631        /* update hierarchical throttle state */
4632        walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4633
4634        if (!cfs_rq->load.weight)
4635                return;
4636
4637        task_delta = cfs_rq->h_nr_running;
4638        for_each_sched_entity(se) {
4639                if (se->on_rq)
4640                        enqueue = 0;
4641
4642                cfs_rq = cfs_rq_of(se);
4643                if (enqueue)
4644                        enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4645                cfs_rq->h_nr_running += task_delta;
4646
4647                if (cfs_rq_throttled(cfs_rq))
4648                        break;
4649        }
4650
4651        assert_list_leaf_cfs_rq(rq);
4652
4653        if (!se)
4654                add_nr_running(rq, task_delta);
4655
4656        /* Determine whether we need to wake up a potentially idle CPU: */
4657        if (rq->curr == rq->idle && rq->cfs.nr_running)
4658                resched_curr(rq);
4659}
4660
4661static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4662                u64 remaining, u64 expires)
4663{
4664        struct cfs_rq *cfs_rq;
4665        u64 runtime;
4666        u64 starting_runtime = remaining;
4667
4668        rcu_read_lock();
4669        list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4670                                throttled_list) {
4671                struct rq *rq = rq_of(cfs_rq);
4672                struct rq_flags rf;
4673
4674                rq_lock_irqsave(rq, &rf);
4675                if (!cfs_rq_throttled(cfs_rq))
4676                        goto next;
4677
4678                /* By the above check, this should never be true */
4679                SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
4680
4681                runtime = -cfs_rq->runtime_remaining + 1;
4682                if (runtime > remaining)
4683                        runtime = remaining;
4684                remaining -= runtime;
4685
4686                cfs_rq->runtime_remaining += runtime;
4687                cfs_rq->runtime_expires = expires;
4688
4689                /* we check whether we're throttled above */
4690                if (cfs_rq->runtime_remaining > 0)
4691                        unthrottle_cfs_rq(cfs_rq);
4692
4693next:
4694                rq_unlock_irqrestore(rq, &rf);
4695
4696                if (!remaining)
4697                        break;
4698        }
4699        rcu_read_unlock();
4700
4701        return starting_runtime - remaining;
4702}
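
/*
 * Illustrative userspace sketch (not kernel code) of the arithmetic used by
 * distribute_cfs_runtime() above: each throttled deficit is topped up to
 * exactly +1ns so the cfs_rq can be unthrottled, and distribution stops once
 * the pool runs dry.  The array and function names below are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t sketch_distribute(int64_t *deficits, int n, uint64_t pool)
{
	uint64_t start = pool;

	for (int i = 0; i < n && pool; i++) {
		/* amount needed to reach runtime_remaining == 1 */
		uint64_t want = (uint64_t)(-deficits[i]) + 1;
		uint64_t give = want > pool ? pool : want;

		deficits[i] += give;	/* cfs_rq->runtime_remaining += runtime */
		pool -= give;
	}
	return start - pool;		/* how much was actually handed out */
}

int main(void)
{
	int64_t deficits[] = { -300000, -50000, -900000 };		/* ns */
	uint64_t used = sketch_distribute(deficits, 3, 500000);	/* 0.5ms pool */

	/* first two deficits become +1 (unthrottleable), third stays negative */
	printf("used %llu ns\n", (unsigned long long)used);
	return 0;
}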
4703
4704/*
4705 * Responsible for refilling a task_group's bandwidth and unthrottling its
4706 * cfs_rqs as appropriate. If there has been no activity within the last
4707 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
4708 * used to track this state.
4709 */
4710static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
4711{
4712        u64 runtime, runtime_expires;
4713        int throttled;
4714
4715        /* no need to continue the timer with no bandwidth constraint */
4716        if (cfs_b->quota == RUNTIME_INF)
4717                goto out_deactivate;
4718
4719        throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4720        cfs_b->nr_periods += overrun;
4721
4722        /*
4723         * idle depends on !throttled (for the case of a large deficit), and if
4724         * we're going inactive then everything else can be deferred
4725         */
4726        if (cfs_b->idle && !throttled)
4727                goto out_deactivate;
4728
4729        __refill_cfs_bandwidth_runtime(cfs_b);
4730
4731        if (!throttled) {
4732                /* mark as potentially idle for the upcoming period */
4733                cfs_b->idle = 1;
4734                return 0;
4735        }
4736
4737        /* account preceding periods in which throttling occurred */
4738        cfs_b->nr_throttled += overrun;
4739
4740        runtime_expires = cfs_b->runtime_expires;
4741
4742        /*
4743         * This check is repeated as we are holding onto the new bandwidth while
4744         * we unthrottle. This can potentially race with an unthrottled group
4745         * trying to acquire new bandwidth from the global pool. This can result
4746         * in us over-using our runtime if it is all used during this loop, but
4747         * only by limited amounts in that extreme case.
4748         */
4749        while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4750                runtime = cfs_b->runtime;
4751                cfs_b->distribute_running = 1;
4752                raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4753                /* we can't nest cfs_b->lock while distributing bandwidth */
4754                runtime = distribute_cfs_runtime(cfs_b, runtime,
4755                                                 runtime_expires);
4756                raw_spin_lock_irqsave(&cfs_b->lock, flags);
4757
4758                cfs_b->distribute_running = 0;
4759                throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4760
4761                lsub_positive(&cfs_b->runtime, runtime);
4762        }
4763
4764        /*
4765         * While we are ensured activity in the period following an
4766         * unthrottle, this also covers the case in which the new bandwidth is
4767         * insufficient to cover the existing bandwidth deficit.  (Forcing the
4768         * timer to remain active while there are any throttled entities.)
4769         */
4770        cfs_b->idle = 0;
4771
4772        return 0;
4773
4774out_deactivate:
4775        return 1;
4776}
4777
4778/* a cfs_rq won't donate quota below this amount */
4779static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4780/* minimum remaining period time to redistribute slack quota */
4781static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4782/* how long we wait to gather additional slack before distributing */
4783static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4784
4785/*
4786 * Are we near the end of the current quota period?
4787 *
4788 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
4789 * hrtimer base being cleared by hrtimer_start. In the case of
4790 * migrate_hrtimers, base is never cleared, so we are fine.
4791 */
4792static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4793{
4794        struct hrtimer *refresh_timer = &cfs_b->period_timer;
4795        u64 remaining;
4796
4797        /* if the callback is running, a quota refresh is already occurring */
4798        if (hrtimer_callback_running(refresh_timer))
4799                return 1;
4800
4801        /* is a quota refresh about to occur? */
4802        remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4803        if (remaining < min_expire)
4804                return 1;
4805
4806        return 0;
4807}
4808
4809static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4810{
4811        u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4812
4813        /* if there's a quota refresh soon don't bother with slack */
4814        if (runtime_refresh_within(cfs_b, min_left))
4815                return;
4816
4817        /* don't push forwards an existing deferred unthrottle */
4818        if (cfs_b->slack_started)
4819                return;
4820        cfs_b->slack_started = true;
4821
4822        hrtimer_start(&cfs_b->slack_timer,
4823                        ns_to_ktime(cfs_bandwidth_slack_period),
4824                        HRTIMER_MODE_REL);
4825}
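
/*
 * Illustrative userspace sketch (not kernel code) of the slack-timer gating
 * above: with cfs_bandwidth_slack_period = 5ms and min_bandwidth_expiration
 * = 2ms, a deferred unthrottle is only armed when the next period refresh is
 * more than 7ms away (the callback-already-running case is ignored here).
 * Names below are hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_NSEC_PER_MSEC	1000000ULL

static const uint64_t sketch_slack_period = 5 * SKETCH_NSEC_PER_MSEC;
static const uint64_t sketch_min_expiration = 2 * SKETCH_NSEC_PER_MSEC;

/* @remaining: time until the period timer fires, in ns */
static bool sketch_should_start_slack(uint64_t remaining)
{
	/* mirrors runtime_refresh_within(cfs_b, slack_period + min_expiration) */
	return remaining >= sketch_slack_period + sketch_min_expiration;
}
/*
 * e.g. sketch_should_start_slack(10 * SKETCH_NSEC_PER_MSEC) -> true,
 *      sketch_should_start_slack(6 * SKETCH_NSEC_PER_MSEC)  -> false.
 */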
4826
4827/* we know any runtime found here is valid as update_curr() precedes return */
4828static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4829{
4830        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4831        s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4832
4833        if (slack_runtime <= 0)
4834                return;
4835
4836        raw_spin_lock(&cfs_b->lock);
4837        if (cfs_b->quota != RUNTIME_INF &&
4838            cfs_rq->runtime_expires == cfs_b->runtime_expires) {
4839                cfs_b->runtime += slack_runtime;
4840
4841                /* we are under rq->lock, defer unthrottling using a timer */
4842                if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4843                    !list_empty(&cfs_b->throttled_cfs_rq))
4844                        start_cfs_slack_bandwidth(cfs_b);
4845        }
4846        raw_spin_unlock(&cfs_b->lock);
4847
4848        /* even if it's not valid for return we don't want to try again */
4849        cfs_rq->runtime_remaining -= slack_runtime;
4850}
4851
4852static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4853{
4854        if (!cfs_bandwidth_used())
4855                return;
4856
4857        if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
4858                return;
4859
4860        __return_cfs_rq_runtime(cfs_rq);
4861}
4862
4863/*
4864 * This is done with a timer (instead of inline with bandwidth return) since
4865 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
4866 */
4867static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4868{
4869        u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4870        unsigned long flags;
4871        u64 expires;
4872
4873        /* confirm we're still not at a refresh boundary */
4874        raw_spin_lock_irqsave(&cfs_b->lock, flags);
4875        cfs_b->slack_started = false;
4876        if (cfs_b->distribute_running) {
4877                raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4878                return;
4879        }
4880
4881        if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4882                raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4883                return;
4884        }
4885
4886        if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
4887                runtime = cfs_b->runtime;
4888
4889        expires = cfs_b->runtime_expires;
4890        if (runtime)
4891                cfs_b->distribute_running = 1;
4892
4893        raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4894
4895        if (!runtime)
4896                return;
4897
4898        runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
4899
4900        raw_spin_lock_irqsave(&cfs_b->lock, flags);
4901        if (expires == cfs_b->runtime_expires)
4902                lsub_positive(&cfs_b->runtime, runtime);
4903        cfs_b->distribute_running = 0;
4904        raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4905}
4906
4907/*
4908 * When a group wakes up we want to make sure that its quota is not already
4909 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4910 * runtime as update_curr() throttling cannot trigger until it's on-rq.
4911 */
4912static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4913{
4914        if (!cfs_bandwidth_used())
4915                return;
4916
4917        /* an active group must be handled by the update_curr()->put() path */
4918        if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4919                return;
4920
4921        /* ensure the group is not already throttled */
4922        if (cfs_rq_throttled(cfs_rq))
4923                return;
4924
4925        /* update runtime allocation */
4926        account_cfs_rq_runtime(cfs_rq, 0);
4927        if (cfs_rq->runtime_remaining <= 0)
4928                throttle_cfs_rq(cfs_rq);
4929}
4930
4931static void sync_throttle(struct task_group *tg, int cpu)
4932{
4933        struct cfs_rq *pcfs_rq, *cfs_rq;
4934
4935        if (!cfs_bandwidth_used())
4936                return;
4937
4938        if (!tg->parent)
4939                return;
4940
4941        cfs_rq = tg->cfs_rq[cpu];
4942        pcfs_rq = tg->parent->cfs_rq[cpu];
4943
4944        cfs_rq->throttle_count = pcfs_rq->throttle_count;
4945        cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
4946}
4947
4948/* conditionally throttle active cfs_rq's from put_prev_entity() */
4949static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4950{
4951        if (!cfs_bandwidth_used())
4952                return false;
4953
4954        if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4955                return false;
4956
4957        /*
4958         * it's possible for a throttled entity to be forced into a running
4959         * state (e.g. set_curr_task); in this case we're finished.
4960         */
4961        if (cfs_rq_throttled(cfs_rq))
4962                return true;
4963
4964        throttle_cfs_rq(cfs_rq);
4965        return true;
4966}
4967
4968static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4969{
4970        struct cfs_bandwidth *cfs_b =
4971                container_of(timer, struct cfs_bandwidth, slack_timer);
4972
4973        do_sched_cfs_slack_timer(cfs_b);
4974
4975        return HRTIMER_NORESTART;
4976}
4977
4978extern const u64 max_cfs_quota_period;
4979
4980static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4981{
4982        struct cfs_bandwidth *cfs_b =
4983                container_of(timer, struct cfs_bandwidth, period_timer);
4984        unsigned long flags;
4985        int overrun;
4986        int idle = 0;
4987        int count = 0;
4988
4989        raw_spin_lock_irqsave(&cfs_b->lock, flags);
4990        for (;;) {
4991                overrun = hrtimer_forward_now(timer, cfs_b->period);
4992                if (!overrun)
4993                        break;
4994
4995                if (++count > 3) {
4996                        u64 new, old = ktime_to_ns(cfs_b->period);
4997
4998                        new = (old * 147) / 128; /* ~115% */
4999                        new = min(new, max_cfs_quota_period);
5000
5001                        cfs_b->period = ns_to_ktime(new);
5002
5003                        /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
5004                        cfs_b->quota *= new;
5005                        cfs_b->quota = div64_u64(cfs_b->quota, old);
5006
5007                        pr_warn_ratelimited(
5008        "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
5009                                smp_processor_id(),
5010                                div_u64(new, NSEC_PER_USEC),
5011                                div_u64(cfs_b->quota, NSEC_PER_USEC));
5012
5013                        /* reset count so we don't come right back in here */
5014                        count = 0;
5015                }
5016
5017                idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
5018        }
5019        if (idle)
5020                cfs_b->period_active = 0;
5021        raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5022
5023        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
5024}
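
/*
 * Illustrative userspace sketch (not kernel code) of the self-scaling done by
 * sched_cfs_period_timer() above when the period keeps overrunning: the
 * period is grown by 147/128 (~115%) and the quota is rescaled by the same
 * ratio, so the allowed bandwidth fraction stays constant.  Names are
 * hypothetical and the 1s clamp on the new period is omitted.
 */
#include <stdint.h>
#include <stdio.h>

static void sketch_scale_period(uint64_t *period_ns, uint64_t *quota_ns)
{
	uint64_t old = *period_ns;
	uint64_t new = (old * 147) / 128;	/* ~115% */

	*quota_ns = (*quota_ns * new) / old;	/* keep quota/period ratio */
	*period_ns = new;
}

int main(void)
{
	uint64_t period = 100000;	/* 100us, the kind of value that overruns */
	uint64_t quota  = 50000;	/* 50% of one CPU */

	sketch_scale_period(&period, &quota);
	/* prints roughly 114843 and 57421: still a ~50% share */
	printf("period=%llu quota=%llu\n",
	       (unsigned long long)period, (unsigned long long)quota);
	return 0;
}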
5025
5026void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5027{
5028        raw_spin_lock_init(&cfs_b->lock);
5029        cfs_b->runtime = 0;
5030        cfs_b->quota = RUNTIME_INF;
5031        cfs_b->period = ns_to_ktime(default_cfs_period());
5032
5033        INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
5034        hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
5035        cfs_b->period_timer.function = sched_cfs_period_timer;
5036        hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5037        cfs_b->slack_timer.function = sched_cfs_slack_timer;
5038        cfs_b->distribute_running = 0;
5039        cfs_b->slack_started = false;
5040}
5041
5042static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5043{
5044        cfs_rq->runtime_enabled = 0;
5045        INIT_LIST_HEAD(&cfs_rq->throttled_list);
5046}
5047
5048void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5049{
5050        u64 overrun;
5051
5052        lockdep_assert_held(&cfs_b->lock);
5053
5054        if (cfs_b->period_active)
5055                return;
5056
5057        cfs_b->period_active = 1;
5058        overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5059        cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
5060        cfs_b->expires_seq++;
5061        hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5062}
5063
5064static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5065{
5066        /* init_cfs_bandwidth() was not called */
5067        if (!cfs_b->throttled_cfs_rq.next)
5068                return;
5069
5070        hrtimer_cancel(&cfs_b->period_timer);
5071        hrtimer_cancel(&cfs_b->slack_timer);
5072}
5073
5074/*
5075 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
5076 *
5077 * The race is harmless, since modifying bandwidth settings of unhooked group
5078 * bits doesn't do much.
5079 */
5080
5081/* cpu online callback */
5082static void __maybe_unused update_runtime_enabled(struct rq *rq)
5083{
5084        struct task_group *tg;
5085
5086        lockdep_assert_held(&rq->lock);
5087
5088        rcu_read_lock();
5089        list_for_each_entry_rcu(tg, &task_groups, list) {
5090                struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
5091                struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5092
5093                raw_spin_lock(&cfs_b->lock);
5094                cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5095                raw_spin_unlock(&cfs_b->lock);
5096        }
5097        rcu_read_unlock();
5098}
5099
5100/* cpu offline callback */
5101static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5102{
5103        struct task_group *tg;
5104
5105        lockdep_assert_held(&rq->lock);
5106
5107        rcu_read_lock();
5108        list_for_each_entry_rcu(tg, &task_groups, list) {
5109                struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5110
5111                if (!cfs_rq->runtime_enabled)
5112                        continue;
5113
5114                /*
5115                 * clock_task is not advancing so we just need to make sure
5116                 * there's some valid quota amount
5117                 */
5118                cfs_rq->runtime_remaining = 1;
5119                /*
5120                 * Offline rq is schedulable till CPU is completely disabled
5121                 * in take_cpu_down(), so we prevent new cfs throttling here.
5122                 */
5123                cfs_rq->runtime_enabled = 0;
5124
5125                if (cfs_rq_throttled(cfs_rq))
5126                        unthrottle_cfs_rq(cfs_rq);
5127        }
5128        rcu_read_unlock();
5129}
5130
5131#else /* CONFIG_CFS_BANDWIDTH */
5132
5133static inline bool cfs_bandwidth_used(void)
5134{
5135        return false;
5136}
5137
5138static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5139{
5140        return rq_clock_task(rq_of(cfs_rq));
5141}
5142
5143static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
5144static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
5145static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
5146static inline void sync_throttle(struct task_group *tg, int cpu) {}
5147static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5148
5149static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5150{
5151        return 0;
5152}
5153
5154static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5155{
5156        return 0;
5157}
5158
5159static inline int throttled_lb_pair(struct task_group *tg,
5160                                    int src_cpu, int dest_cpu)
5161{
5162        return 0;
5163}
5164
5165void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5166
5167#ifdef CONFIG_FAIR_GROUP_SCHED
5168static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5169#endif
5170
5171static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5172{
5173        return NULL;
5174}
5175static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5176static inline void update_runtime_enabled(struct rq *rq) {}
5177static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5178
5179#endif /* CONFIG_CFS_BANDWIDTH */
5180
5181/**************************************************
5182 * CFS operations on tasks:
5183 */
5184
5185#ifdef CONFIG_SCHED_HRTICK
5186static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5187{
5188        struct sched_entity *se = &p->se;
5189        struct cfs_rq *cfs_rq = cfs_rq_of(se);
5190
5191        SCHED_WARN_ON(task_rq(p) != rq);
5192
5193        if (rq->cfs.h_nr_running > 1) {
5194                u64 slice = sched_slice(cfs_rq, se);
5195                u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5196                s64 delta = slice - ran;
5197
5198                if (delta < 0) {
5199                        if (rq->curr == p)
5200                                resched_curr(rq);
5201                        return;
5202                }
5203                hrtick_start(rq, delta);
5204        }
5205}
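
/*
 * Illustrative userspace sketch (not kernel code) of the arithmetic in
 * hrtick_start_fair() above: arm a one-shot tick for the unconsumed part of
 * the slice, or ask for an immediate reschedule if the slice has already
 * been overrun.  Names below are hypothetical.
 */
#include <stdint.h>

/* returns ns until the tick should fire, or 0 for "resched now" */
static int64_t sketch_hrtick_delta(uint64_t slice, uint64_t sum_exec,
				   uint64_t prev_sum_exec)
{
	uint64_t ran = sum_exec - prev_sum_exec;
	int64_t delta = (int64_t)(slice - ran);

	return delta < 0 ? 0 : delta;
}
/* e.g. slice=4ms, ran=1.5ms -> fire the hrtick in 2.5ms. */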
5206
5207/*
5208 * called from enqueue/dequeue and updates the hrtick when the
5209 * current task is from our class and nr_running is low enough
5210 * to matter.
5211 */
5212static void hrtick_update(struct rq *rq)
5213{
5214        struct task_struct *curr = rq->curr;
5215
5216        if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
5217                return;
5218
5219        if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5220                hrtick_start_fair(rq, curr);
5221}
5222#else /* !CONFIG_SCHED_HRTICK */
5223static inline void
5224hrtick_start_fair(struct rq *rq, struct task_struct *p)
5225{
5226}
5227
5228static inline void hrtick_update(struct rq *rq)
5229{
5230}
5231#endif
5232
5233#ifdef CONFIG_SMP
5234static inline unsigned long cpu_util(int cpu);
5235
5236static inline bool cpu_overutilized(int cpu)
5237{
5238        return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
5239}
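
/*
 * Illustrative userspace sketch (not kernel code) of the cpu_overutilized()
 * test above.  With the default capacity_margin of 1280 the comparison
 * capacity * 1024 < util * capacity_margin reduces to "utilization above
 * ~80% of capacity".  Names below are hypothetical.
 */
#include <stdbool.h>

#define SKETCH_CAPACITY_MARGIN	1280	/* ~20% headroom, fixed point /1024 */

static bool sketch_cpu_overutilized(unsigned long capacity, unsigned long util)
{
	return capacity * 1024 < util * SKETCH_CAPACITY_MARGIN;
}
/*
 * e.g. capacity=1024: util=800 -> false (800*1280 = 1024000 < 1048576),
 *                     util=820 -> true  (820*1280 = 1049600 > 1048576).
 */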
5240
5241static inline void update_overutilized_status(struct rq *rq)
5242{
5243        if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
5244                WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5245                trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
5246        }
5247}
5248#else
5249static inline void update_overutilized_status(struct rq *rq) { }
5250#endif
5251
5252/*
5253 * The enqueue_task method is called before nr_running is
5254 * increased. Here we update the fair scheduling stats and
5255 * then put the task into the rbtree:
5256 */
5257static void
5258enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5259{
5260        struct cfs_rq *cfs_rq;
5261        struct sched_entity *se = &p->se;
5262
5263        /*
5264         * The code below (indirectly) updates schedutil which looks at
5265         * the cfs_rq utilization to select a frequency.
5266         * Let's add the task's estimated utilization to the cfs_rq's
5267         * estimated utilization, before we update schedutil.
5268         */
5269        util_est_enqueue(&rq->cfs, p);
5270
5271        /*
5272         * If in_iowait is set, the code below may not trigger any cpufreq
5273         * utilization updates, so do it here explicitly with the IOWAIT flag
5274         * passed.
5275         */
5276        if (p->in_iowait)
5277                cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
5278
5279        for_each_sched_entity(se) {
5280                if (se->on_rq)
5281                        break;
5282                cfs_rq = cfs_rq_of(se);
5283                enqueue_entity(cfs_rq, se, flags);
5284
5285                /*
5286                 * end evaluation on encountering a throttled cfs_rq
5287                 *
5288                 * note: in the case of encountering a throttled cfs_rq we will
5289                 * post the final h_nr_running increment below.
5290                 */
5291                if (cfs_rq_throttled(cfs_rq))
5292                        break;
5293                cfs_rq->h_nr_running++;
5294
5295                flags = ENQUEUE_WAKEUP;
5296        }
5297
5298        for_each_sched_entity(se) {
5299                cfs_rq = cfs_rq_of(se);
5300                cfs_rq->h_nr_running++;
5301
5302                if (cfs_rq_throttled(cfs_rq))
5303                        break;
5304
5305                update_load_avg(cfs_rq, se, UPDATE_TG);
5306                update_cfs_group(se);
5307        }
5308
5309        if (!se) {
5310                add_nr_running(rq, 1);
5311                /*
5312                 * Since new tasks are assigned an initial util_avg equal to
5313                 * half of the spare capacity of their CPU, tiny tasks have the
5314                 * ability to cross the overutilized threshold, which will
5315                 * result in the load balancer ruining all the task placement
5316                 * done by EAS. As a way to mitigate that effect, do not account
5317                 * for the first enqueue operation of new tasks during the
5318                 * overutilized flag detection.
5319                 *
5320                 * A better way of solving this problem would be to wait for
5321                 * the PELT signals of tasks to converge before taking them
5322                 * into account, but that is not straightforward to implement,
5323                 * and the following generally works well enough in practice.
5324                 */
5325                if (flags & ENQUEUE_WAKEUP)
5326                        update_overutilized_status(rq);
5327
5328        }
5329
5330        if (cfs_bandwidth_used()) {
5331                /*
5332                 * When bandwidth control is enabled, the cfs_rq_throttled()
5333                 * breaks in the above iteration can result in incomplete
5334                 * leaf list maintenance, which would trigger the assertion
5335                 * below.
5336                 */
5337                for_each_sched_entity(se) {
5338                        cfs_rq = cfs_rq_of(se);
5339
5340                        if (list_add_leaf_cfs_rq(cfs_rq))
5341                                break;
5342                }
5343        }
5344
5345        assert_list_leaf_cfs_rq(rq);
5346
5347        hrtick_update(rq);
5348}
5349
5350static void set_next_buddy(struct sched_entity *se);
5351
5352/*
5353 * The dequeue_task method is called before nr_running is
5354 * decreased. We remove the task from the rbtree and
5355 * update the fair scheduling stats:
5356 */
5357static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5358{
5359        struct cfs_rq *cfs_rq;
5360        struct sched_entity *se = &p->se;
5361        int task_sleep = flags & DEQUEUE_SLEEP;
5362
5363        for_each_sched_entity(se) {
5364                cfs_rq = cfs_rq_of(se);
5365                dequeue_entity(cfs_rq, se, flags);
5366
5367                /*
5368                 * end evaluation on encountering a throttled cfs_rq
5369                 *
5370                 * note: in the case of encountering a throttled cfs_rq we will
5371                 * post the final h_nr_running decrement below.
5372                */
5373                if (cfs_rq_throttled(cfs_rq))
5374                        break;
5375                cfs_rq->h_nr_running--;
5376
5377                /* Don't dequeue parent if it has other entities besides us */
5378                if (cfs_rq->load.weight) {
5379                        /* Avoid re-evaluating load for this entity: */
5380                        se = parent_entity(se);
5381                        /*
5382                         * Bias pick_next to pick a task from this cfs_rq, as
5383                         * p is sleeping when it is within its sched_slice.
5384                         */
5385                        if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5386                                set_next_buddy(se);
5387                        break;
5388                }
5389                flags |= DEQUEUE_SLEEP;
5390        }
5391
5392        for_each_sched_entity(se) {
5393                cfs_rq = cfs_rq_of(se);
5394                cfs_rq->h_nr_running--;
5395
5396                if (cfs_rq_throttled(cfs_rq))
5397                        break;
5398
5399                update_load_avg(cfs_rq, se, UPDATE_TG);
5400                update_cfs_group(se);
5401        }
5402
5403        if (!se)
5404                sub_nr_running(rq, 1);
5405
5406        util_est_dequeue(&rq->cfs, p, task_sleep);
5407        hrtick_update(rq);
5408}
5409
5410#ifdef CONFIG_SMP
5411
5412/* Working cpumask for: load_balance, load_balance_newidle. */
5413DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5414DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5415
5416#ifdef CONFIG_NO_HZ_COMMON
5417
5418static struct {
5419        cpumask_var_t idle_cpus_mask;
5420        atomic_t nr_cpus;
5421        int has_blocked;                /* Idle CPUS has blocked load */
5422        unsigned long next_balance;     /* in jiffy units */
5423        unsigned long next_blocked;     /* Next update of blocked load in jiffies */
5424} nohz ____cacheline_aligned;
5425
5426#endif /* CONFIG_NO_HZ_COMMON */
5427
5428static unsigned long cpu_runnable_load(struct rq *rq)
5429{
5430        return cfs_rq_runnable_load_avg(&rq->cfs);
5431}
5432
5433static unsigned long capacity_of(int cpu)
5434{
5435        return cpu_rq(cpu)->cpu_capacity;
5436}
5437
5438static unsigned long cpu_avg_load_per_task(int cpu)
5439{
5440        struct rq *rq = cpu_rq(cpu);
5441        unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5442        unsigned long load_avg = cpu_runnable_load(rq);
5443
5444        if (nr_running)
5445                return load_avg / nr_running;
5446
5447        return 0;
5448}
5449
5450static void record_wakee(struct task_struct *p)
5451{
5452        /*
5453         * Only decay a single time; tasks that have less than 1 wakeup per
5454         * jiffy will not have built up many flips.
5455         */
5456        if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5457                current->wakee_flips >>= 1;
5458                current->wakee_flip_decay_ts = jiffies;
5459        }
5460
5461        if (current->last_wakee != p) {
5462                current->last_wakee = p;
5463                current->wakee_flips++;
5464        }
5465}
5466
5467/*
5468 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
5469 *
5470 * A waker of many should wake a different task than the one last awakened
5471 * at a frequency roughly N times higher than one of its wakees.
5472 *
5473 * In order to determine whether we should let the load spread vs consolidating
5474 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
5475 * partner, and a factor of llc_size higher frequency in the other.
5476 *
5477 * With both conditions met, we can be relatively sure that the relationship is
5478 * non-monogamous, with partner count exceeding socket size.
5479 *
5480 * Waker/wakee being client/server, worker/dispatcher, interrupt source or
5481 * whatever is irrelevant; the spread criterion is that the apparent partner
5482 * count exceeds the socket size.
5483 */
5484static int wake_wide(struct task_struct *p)
5485{
5486        unsigned int master = current->wakee_flips;
5487        unsigned int slave = p->wakee_flips;
5488        int factor = this_cpu_read(sd_llc_size);
5489
5490        if (master < slave)
5491                swap(master, slave);
5492        if (slave < factor || master < slave * factor)
5493                return 0;
5494        return 1;
5495}
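
/*
 * Illustrative userspace sketch (not kernel code) of the wake_wide() check
 * above.  With factor = llc_size, we only spread (return 1) when the less
 * flippy partner has at least 'factor' flips and the flippier one flips at
 * least 'factor' times as often.  Names below are hypothetical.
 */
static int sketch_wake_wide(unsigned int waker_flips, unsigned int wakee_flips,
			    unsigned int llc_size)
{
	unsigned int master = waker_flips, slave = wakee_flips;

	if (master < slave) {		/* swap so master is the larger */
		unsigned int tmp = master;
		master = slave;
		slave = tmp;
	}
	if (slave < llc_size || master < slave * llc_size)
		return 0;		/* looks 1:1-ish, prefer the affine path */
	return 1;			/* looks M:N, spread the load */
}
/*
 * e.g. llc_size=8: waker=200 flips, wakee=20 flips -> 1 (spread);
 *                  waker=200 flips, wakee=5  flips -> 0 (wakee below factor).
 */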
5496
5497/*
5498 * The purpose of wake_affine() is to quickly determine on which CPU we can run
5499 * soonest. For the purpose of speed we only consider the waking and previous
5500 * CPU.
5501 *
5502 * wake_affine_idle() - only considers 'now', it checks if the waking CPU is
5503 *                      cache-affine and is (or will be) idle.
5504 *
5505 * wake_affine_weight() - considers the weight to reflect the average
5506 *                        scheduling latency of the CPUs. This seems to work
5507 *                        for the overloaded case.
5508 */
5509static int
5510wake_affine_idle(int this_cpu, int prev_cpu, int sync)
5511{
5512        /*
5513         * If this_cpu is idle, it implies the wakeup is from interrupt
5514         * context. Only allow the move if cache is shared. Otherwise an
5515         * interrupt intensive workload could force all tasks onto one
5516         * node depending on the IO topology or IRQ affinity settings.
5517         *
5518         * If the prev_cpu is idle and cache affine then avoid a migration.
5519         * There is no guarantee that the cache hot data from an interrupt
5520         * is more important than cache hot data on the prev_cpu and from
5521         * a cpufreq perspective, it's better to have higher utilization
5522         * on one CPU.
5523         */
5524        if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
5525                return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
5526
5527        if (sync && cpu_rq(this_cpu)->nr_running == 1)
5528                return this_cpu;
5529
5530        return nr_cpumask_bits;
5531}
5532
5533static int
5534wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5535                   int this_cpu, int prev_cpu, int sync)
5536{
5537        s64 this_eff_load, prev_eff_load;
5538        unsigned long task_load;
5539
5540        this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
5541
5542        if (sync) {
5543                unsigned long current_load = task_h_load(current);
5544
5545                if (current_load > this_eff_load)
5546                        return this_cpu;
5547
5548                this_eff_load -= current_load;
5549        }
5550
5551        task_load = task_h_load(p);
5552
5553        this_eff_load += task_load;
5554        if (sched_feat(WA_BIAS))
5555                this_eff_load *= 100;
5556        this_eff_load *= capacity_of(prev_cpu);
5557
5558        prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
5559        prev_eff_load -= task_load;
5560        if (sched_feat(WA_BIAS))
5561                prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5562        prev_eff_load *= capacity_of(this_cpu);
5563
5564        /*
5565         * If sync, adjust the weight of prev_eff_load such that if
5566         * prev_eff == this_eff that select_idle_sibling() will consider
5567         * stacking the wakee on top of the waker if no other CPU is
5568         * idle.
5569         */
5570        if (sync)
5571                prev_eff_load += 1;
5572
5573        return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
5574}
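
/*
 * Illustrative userspace sketch (not kernel code) of wake_affine_weight()
 * above: loads are cross-scaled by the other CPU's capacity, and WA_BIAS
 * inflates the previous CPU's side by half of the domain's imbalance_pct so
 * migrations need a clear win.  Names below are hypothetical; the sync
 * handling is omitted.
 */
#include <stdbool.h>
#include <stdint.h>

static bool sketch_prefer_this_cpu(int64_t this_load, int64_t prev_load,
				   int64_t task_load,
				   int64_t this_cap, int64_t prev_cap,
				   unsigned int imbalance_pct)
{
	int64_t this_eff = (this_load + task_load) * 100 * prev_cap;
	int64_t prev_eff = (prev_load - task_load) *
			   (100 + (imbalance_pct - 100) / 2) * this_cap;

	return this_eff < prev_eff;
}
/*
 * e.g. equal capacities (1024), imbalance_pct=117, task_load=100:
 *      this_load=300, prev_load=500 -> prev side ~ 400*108 vs this 400*100,
 *      so the waking CPU wins and the task is pulled.
 */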
5575
5576static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5577                       int this_cpu, int prev_cpu, int sync)
5578{
5579        int target = nr_cpumask_bits;
5580
5581        if (sched_feat(WA_IDLE))
5582                target = wake_affine_idle(this_cpu, prev_cpu, sync);
5583
5584        if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
5585                target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
5586
5587        schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5588        if (target == nr_cpumask_bits)
5589                return prev_cpu;
5590
5591        schedstat_inc(sd->ttwu_move_affine);
5592        schedstat_inc(p->se.statistics.nr_wakeups_affine);
5593        return target;
5594}
5595
5596static unsigned long cpu_util_without(int cpu, struct task_struct *p);
5597
5598static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
5599{
5600        return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
5601}
5602
5603/*
5604 * find_idlest_group finds and returns the least busy CPU group within the
5605 * domain.
5606 *
5607 * Assumes p is allowed on at least one CPU in sd.
5608 */
5609static struct sched_group *
5610find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5611                  int this_cpu, int sd_flag)
5612{
5613        struct sched_group *idlest = NULL, *group = sd->groups;
5614        struct sched_group *most_spare_sg = NULL;
5615        unsigned long min_runnable_load = ULONG_MAX;
5616        unsigned long this_runnable_load = ULONG_MAX;
5617        unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
5618        unsigned long most_spare = 0, this_spare = 0;
5619        int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
5620        unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
5621                                (sd->imbalance_pct-100) / 100;
5622
5623        do {
5624                unsigned long load, avg_load, runnable_load;
5625                unsigned long spare_cap, max_spare_cap;
5626                int local_group;
5627                int i;
5628
5629                /* Skip over this group if it has no CPUs allowed */
5630                if (!cpumask_intersects(sched_group_span(group),
5631                                        p->cpus_ptr))
5632                        continue;
5633
5634                local_group = cpumask_test_cpu(this_cpu,
5635                                               sched_group_span(group));
5636
5637                /*
5638                 * Tally up the load of all CPUs in the group and find
5639                 * the group containing the CPU with most spare capacity.
5640                 */
5641                avg_load = 0;
5642                runnable_load = 0;
5643                max_spare_cap = 0;
5644
5645                for_each_cpu(i, sched_group_span(group)) {
5646                        load = cpu_runnable_load(cpu_rq(i));
5647                        runnable_load += load;
5648
5649                        avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
5650
5651                        spare_cap = capacity_spare_without(i, p);
5652
5653                        if (spare_cap > max_spare_cap)
5654                                max_spare_cap = spare_cap;
5655                }
5656
5657                /* Adjust by relative CPU capacity of the group */
5658                avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
5659                                        group->sgc->capacity;
5660                runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
5661                                        group->sgc->capacity;
5662
5663                if (local_group) {
5664                        this_runnable_load = runnable_load;
5665                        this_avg_load = avg_load;
5666                        this_spare = max_spare_cap;
5667                } else {
5668                        if (min_runnable_load > (runnable_load + imbalance)) {
5669                                /*
5670                                 * The runnable load is significantly smaller
5671                                 * so we can pick this new CPU:
5672                                 */
5673                                min_runnable_load = runnable_load;
5674                                min_avg_load = avg_load;
5675                                idlest = group;
5676                        } else if ((runnable_load < (min_runnable_load + imbalance)) &&
5677                                   (100*min_avg_load > imbalance_scale*avg_load)) {
5678                                /*
5679                                 * The runnable loads are close so take the
5680                                 * blocked load into account through avg_load:
5681                                 */
5682                                min_avg_load = avg_load;
5683                                idlest = group;
5684                        }
5685
5686                        if (most_spare < max_spare_cap) {
5687                                most_spare = max_spare_cap;
5688                                most_spare_sg = group;
5689                        }
5690                }
5691        } while (group = group->next, group != sd->groups);
5692
5693        /*
5694         * The cross-over point between using spare capacity or least load
5695         * is too conservative for high utilization tasks on partially
5696         * utilized systems if we require spare_capacity > task_util(p),
5697         * so we allow for some task stuffing by using
5698         * spare_capacity > task_util(p)/2.
5699         *
5700         * Spare capacity can't be used for fork because the utilization has
5701         * not been set yet, we must first select a rq to compute the initial
5702         * utilization.
5703         */
5704        if (sd_flag & SD_BALANCE_FORK)
5705                goto skip_spare;
5706
5707        if (this_spare > task_util(p) / 2 &&
5708            imbalance_scale*this_spare > 100*most_spare)
5709                return NULL;
5710
5711        if (most_spare > task_util(p) / 2)
5712                return most_spare_sg;
5713
5714skip_spare:
5715        if (!idlest)
5716                return NULL;
5717
5718        /*
5719         * When comparing groups across NUMA domains, it's possible for the
5720         * local domain to be very lightly loaded relative to the remote
5721         * domains but "imbalance" skews the comparison making remote CPUs
5722         * look much more favourable. When considering cross-domain, add
5723         * imbalance to the runnable load on the remote node and consider
5724         * staying local.
5725         */
5726        if ((sd->flags & SD_NUMA) &&
5727            min_runnable_load + imbalance >= this_runnable_load)
5728                return NULL;
5729
5730        if (min_runnable_load > (this_runnable_load + imbalance))
5731                return NULL;
5732
5733        if ((this_runnable_load < (min_runnable_load + imbalance)) &&
5734             (100*this_avg_load < imbalance_scale*min_avg_load))
5735                return NULL;
5736
5737        return idlest;
5738}
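
/*
 * Illustrative userspace sketch (not kernel code) of the two-level comparison
 * in find_idlest_group() above: a candidate group must beat the current
 * idlest by an absolute runnable-load margin ('imbalance'), or tie on
 * runnable load and win on average (blocked-inclusive) load by the relative
 * 'imbalance_scale' margin.  Names below are hypothetical.
 */
#include <stdbool.h>

static bool sketch_group_is_idler(unsigned long runnable, unsigned long avg,
				  unsigned long min_runnable, unsigned long min_avg,
				  unsigned long imbalance, unsigned long imbalance_scale)
{
	if (min_runnable > runnable + imbalance)
		return true;	/* clearly less runnable load */

	/* close on runnable load: let blocked load break the tie */
	return runnable < min_runnable + imbalance &&
	       100 * min_avg > imbalance_scale * avg;
}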
5739
5740/*
5741 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
5742 */
5743static int
5744find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5745{
5746        unsigned long load, min_load = ULONG_MAX;
5747        unsigned int min_exit_latency = UINT_MAX;
5748        u64 latest_idle_timestamp = 0;
5749        int least_loaded_cpu = this_cpu;
5750        int shallowest_idle_cpu = -1;
5751        int i;
5752
5753        /* Check if we have any choice: */
5754        if (group->group_weight == 1)
5755                return cpumask_first(sched_group_span(group));
5756
5757        /* Traverse only the allowed CPUs */
5758        for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
5759                if (available_idle_cpu(i)) {
5760                        struct rq *rq = cpu_rq(i);
5761                        struct cpuidle_state *idle = idle_get_state(rq);
5762                        if (idle && idle->exit_latency < min_exit_latency) {
5763                                /*
5764                                 * We give priority to a CPU whose idle state
5765                                 * has the smallest exit latency irrespective
5766                                 * of any idle timestamp.
5767                                 */
5768                                min_exit_latency = idle->exit_latency;
5769                                latest_idle_timestamp = rq->idle_stamp;
5770                                shallowest_idle_cpu = i;
5771                        } else if ((!idle || idle->exit_latency == min_exit_latency) &&
5772                                   rq->idle_stamp > latest_idle_timestamp) {
5773                                /*
5774                                 * If equal or no active idle state, then
5775                                 * the most recently idled CPU might have
5776                                 * a warmer cache.
5777                                 */
5778                                latest_idle_timestamp = rq->idle_stamp;
5779                                shallowest_idle_cpu = i;
5780                        }
5781                } else if (shallowest_idle_cpu == -1) {
5782                        load = cpu_runnable_load(cpu_rq(i));
5783                        if (load < min_load) {
5784                                min_load = load;
5785                                least_loaded_cpu = i;
5786                        }
5787                }
5788        }
5789
5790        return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
5791}
5792
5793static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
5794                                  int cpu, int prev_cpu, int sd_flag)
5795{
5796        int new_cpu = cpu;
5797
5798        if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
5799                return prev_cpu;
5800
5801        /*
5802         * We need the task's util for capacity_spare_without; sync it up to
5803         * prev_cpu's last_update_time.
5804         */
5805        if (!(sd_flag & SD_BALANCE_FORK))
5806                sync_entity_load_avg(&p->se);
5807
5808        while (sd) {
5809                struct sched_group *group;
5810                struct sched_domain *tmp;
5811                int weight;
5812
5813                if (!(sd->flags & sd_flag)) {
5814                        sd = sd->child;
5815                        continue;
5816                }
5817
5818                group = find_idlest_group(sd, p, cpu, sd_flag);
5819                if (!group) {
5820                        sd = sd->child;
5821                        continue;
5822                }
5823
5824                new_cpu = find_idlest_group_cpu(group, p, cpu);
5825                if (new_cpu == cpu) {
5826                        /* Now try balancing at a lower domain level of 'cpu': */
5827                        sd = sd->child;
5828                        continue;
5829                }
5830
5831                /* Now try balancing at a lower domain level of 'new_cpu': */
5832                cpu = new_cpu;
5833                weight = sd->span_weight;
5834                sd = NULL;
5835                for_each_domain(cpu, tmp) {
5836                        if (weight <= tmp->span_weight)
5837                                break;
5838                        if (tmp->flags & sd_flag)
5839                                sd = tmp;
5840                }
5841        }
5842
5843        return new_cpu;
5844}
5845
5846#ifdef CONFIG_SCHED_SMT
5847DEFINE_STATIC_KEY_FALSE(sched_smt_present);
5848EXPORT_SYMBOL_GPL(sched_smt_present);
5849
5850static inline void set_idle_cores(int cpu, int val)
5851{
5852        struct sched_domain_shared *sds;
5853
5854        sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5855        if (sds)
5856                WRITE_ONCE(sds->has_idle_cores, val);
5857}
5858
5859static inline bool test_idle_cores(int cpu, bool def)
5860{
5861        struct sched_domain_shared *sds;
5862
5863        sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5864        if (sds)
5865                return READ_ONCE(sds->has_idle_cores);
5866
5867        return def;
5868}
5869
5870/*
5871 * Scans the local SMT mask to see if the entire core is idle, and records this
5872 * information in sd_llc_shared->has_idle_cores.
5873 *
5874 * Since SMT siblings share all cache levels, inspecting this limited remote
5875 * state should be fairly cheap.
5876 */
5877void __update_idle_core(struct rq *rq)
5878{
5879        int core = cpu_of(rq);
5880        int cpu;
5881
5882        rcu_read_lock();
5883        if (test_idle_cores(core, true))
5884                goto unlock;
5885
5886        for_each_cpu(cpu, cpu_smt_mask(core)) {
5887                if (cpu == core)
5888                        continue;
5889
5890                if (!available_idle_cpu(cpu))
5891                        goto unlock;
5892        }
5893
5894        set_idle_cores(core, 1);
5895unlock:
5896        rcu_read_unlock();
5897}
5898
5899/*
5900 * Scan the entire LLC domain for idle cores; this dynamically switches off if
5901 * there are no idle cores left in the system; tracked through
5902 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
5903 */
5904static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5905{
5906        struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
5907        int core, cpu;
5908
5909        if (!static_branch_likely(&sched_smt_present))
5910                return -1;
5911
5912        if (!test_idle_cores(target, false))
5913                return -1;
5914
5915        cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
5916
5917        for_each_cpu_wrap(core, cpus, target) {
5918                bool idle = true;
5919
5920                for_each_cpu(cpu, cpu_smt_mask(core)) {
5921                        __cpumask_clear_cpu(cpu, cpus);
5922                        if (!available_idle_cpu(cpu))
5923                                idle = false;
5924                }
5925
5926                if (idle)
5927                        return core;
5928        }
5929
5930        /*
5931         * Failed to find an idle core; stop looking for one.
5932         */
5933        set_idle_cores(target, 0);
5934
5935        return -1;
5936}
5937
5938/*
5939 * Scan the local SMT mask for idle CPUs.
5940 */
5941static int select_idle_smt(struct task_struct *p, int target)
5942{
5943        int cpu;
5944
5945        if (!static_branch_likely(&sched_smt_present))
5946                return -1;
5947
5948        for_each_cpu(cpu, cpu_smt_mask(target)) {
5949                if (!cpumask_test_cpu(cpu, p->cpus_ptr))
5950                        continue;
5951                if (available_idle_cpu(cpu))
5952                        return cpu;
5953        }
5954
5955        return -1;
5956}
5957
5958#else /* CONFIG_SCHED_SMT */
5959
5960static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5961{
5962        return -1;
5963}
5964
5965static inline int select_idle_smt(struct task_struct *p, int target)
5966{
5967        return -1;
5968}
5969
5970#endif /* CONFIG_SCHED_SMT */
5971
5972/*
5973 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
5974 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
5975 * average idle time for this rq (as found in rq->avg_idle).
5976 */
5977static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
5978{
5979        struct sched_domain *this_sd;
5980        u64 avg_cost, avg_idle;
5981        u64 time, cost;
5982        s64 delta;
5983        int cpu, nr = INT_MAX;
5984        int this = smp_processor_id();
5985
5986        this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
5987        if (!this_sd)
5988                return -1;
5989
5990        /*
5991         * Due to large variance we need a large fuzz factor; hackbench in
5992         * particular is sensitive here.
5993         */
5994        avg_idle = this_rq()->avg_idle / 512;
5995        avg_cost = this_sd->avg_scan_cost + 1;
5996
5997        if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
5998                return -1;
5999
6000        if (sched_feat(SIS_PROP)) {
6001                u64 span_avg = sd->span_weight * avg_idle;
6002                if (span_avg > 4*avg_cost)
6003                        nr = div_u64(span_avg, avg_cost);
6004                else
6005                        nr = 4;
6006        }
6007
6008        time = cpu_clock(this);
6009
6010        for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
6011                if (!--nr)
6012                        return -1;
6013                if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6014                        continue;
6015                if (available_idle_cpu(cpu))
6016                        break;
6017        }
6018
6019        time = cpu_clock(this) - time;
6020        cost = this_sd->avg_scan_cost;
6021        delta = (s64)(time - cost) / 8;
6022        this_sd->avg_scan_cost += delta;
6023
6024        return cpu;
6025}
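
/*
 * Illustrative userspace sketch (not kernel code) of the SIS_PROP scan budget
 * computed in select_idle_cpu() above: the number of CPUs probed scales with
 * how idle this CPU has been relative to how expensive past scans were, with
 * a floor of 4.  Names below are hypothetical.
 */
#include <stdint.h>

static int sketch_sis_prop_nr(uint64_t rq_avg_idle_ns, uint64_t avg_scan_cost_ns,
			      unsigned int span_weight)
{
	uint64_t avg_idle = rq_avg_idle_ns / 512;	/* large fuzz factor */
	uint64_t avg_cost = avg_scan_cost_ns + 1;
	uint64_t span_avg = (uint64_t)span_weight * avg_idle;

	if (span_avg > 4 * avg_cost)
		return (int)(span_avg / avg_cost);
	return 4;
}
/*
 * e.g. avg_idle=512us, scan cost=4us, 64-CPU LLC:
 *      span_avg = 64 * 1us = 64us -> budget of ~15 CPUs per scan.
 */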
6026
6027/*
6028 * Try and locate an idle core/thread in the LLC cache domain.
6029 */
6030static int select_idle_sibling(struct task_struct *p, int prev, int target)
6031{
6032        struct sched_domain *sd;
6033        int i, recent_used_cpu;
6034
6035        if (available_idle_cpu(target))
6036                return target;
6037
6038        /*
6039         * If the previous CPU is cache affine and idle, don't be stupid:
6040         */
6041        if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
6042                return prev;
6043
6044        /* Check a recently used CPU as a potential idle candidate: */
6045        recent_used_cpu = p->recent_used_cpu;
6046        if (recent_used_cpu != prev &&
6047            recent_used_cpu != target &&
6048            cpus_share_cache(recent_used_cpu, target) &&
6049            available_idle_cpu(recent_used_cpu) &&
6050            cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
6051                /*
6052                 * Replace recent_used_cpu with prev as it is a potential
6053                 * candidate for the next wake:
6054                 */
6055                p->recent_used_cpu = prev;
6056                return recent_used_cpu;
6057        }
6058
6059        sd = rcu_dereference(per_cpu(sd_llc, target));
6060        if (!sd)
6061                return target;
6062
6063        i = select_idle_core(p, sd, target);
6064        if ((unsigned)i < nr_cpumask_bits)
6065                return i;
6066
6067        i = select_idle_cpu(p, sd, target);
6068        if ((unsigned)i < nr_cpumask_bits)
6069                return i;
6070
6071        i = select_idle_smt(p, target);
6072        if ((unsigned)i < nr_cpumask_bits)
6073                return i;
6074
6075        return target;
6076}
6077
6078/**
6079 * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
6080 * @cpu: the CPU to get the utilization of
6081 *
6082 * The unit of the return value must be the same as the unit of CPU capacity,
6083 * so we can compare the utilization with the capacity of the CPU that is
6084 * available for CFS tasks (i.e. cpu_capacity).
6085 *
6086 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
6087 * recent utilization of currently non-runnable tasks on a CPU. It represents
6088 * the amount of utilization of a CPU in the range [0..capacity_orig] where
6089 * capacity_orig is the cpu_capacity available at the highest frequency
6090 * (arch_scale_freq_capacity()).
6091 * The utilization of a CPU converges towards a sum equal to or less than the
6092 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
6093 * the running time on this CPU scaled by capacity_curr.
6094 *
6095 * The estimated utilization of a CPU is defined to be the maximum between its
6096 * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
6097 * currently RUNNABLE on that CPU.
6098 * This allows us to properly represent the expected utilization of a CPU which
6099 * has just picked up a big task after a long sleep period. At the same time,
6100 * however, it preserves the benefits of the "blocked utilization" in
6101 * describing the potential for other tasks waking up on the same CPU.
6102 *
6103 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
6104 * higher than capacity_orig because of unfortunate rounding in
6105 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
6106 * the average stabilizes with the new running time. We need to check that the
6107 * utilization stays within the range of [0..capacity_orig] and cap it if
6108 * necessary. Without utilization capping, a group could be seen as overloaded
6109 * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
6110 * available capacity. We allow utilization to overshoot capacity_curr (but not
6111 * capacity_orig) as it is useful for predicting the capacity required after task
6112 * migrations (scheduler-driven DVFS).
6113 *
6114 * Return: the (estimated) utilization for the specified CPU
6115 */
6116static inline unsigned long cpu_util(int cpu)
6117{
6118        struct cfs_rq *cfs_rq;
6119        unsigned int util;
6120
6121        cfs_rq = &cpu_rq(cpu)->cfs;
6122        util = READ_ONCE(cfs_rq->avg.util_avg);
6123
6124        if (sched_feat(UTIL_EST))
6125                util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6126
6127        return min_t(unsigned long, util, capacity_orig_of(cpu));
6128}
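
/*
 * Worked example for the estimation above (hypothetical numbers): with
 * UTIL_EST enabled, cfs_rq->avg.util_avg = 300, util_est.enqueued = 480 and
 * a little CPU whose capacity_orig_of() is 446, the max() picks 480 and the
 * final clamp reduces it to 446 -- the CPU is reported as fully utilized.
 */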
6129
6130/*
6131 * cpu_util_without: compute cpu utilization without any contributions from *p
6132 * @cpu: the CPU whose utilization is requested
6133 * @p: the task whose utilization should be discounted
6134 *
6135 * The utilization of a CPU is defined by the utilization of tasks currently
6136 * enqueued on that CPU as well as tasks which are currently sleeping after an
6137 * execution on that CPU.
6138 *
6139 * This method returns the utilization of the specified CPU by discounting the
6140 * utilization of the specified task, whenever the task is currently
6141 * contributing to the CPU utilization.
6142 */
6143static unsigned long cpu_util_without(int cpu, struct task_struct *p)
6144{
6145        struct cfs_rq *cfs_rq;
6146        unsigned int util;
6147
6148        /* Task has no contribution or is new */
6149        if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6150                return cpu_util(cpu);
6151
6152        cfs_rq = &cpu_rq(cpu)->cfs;
6153        util = READ_ONCE(cfs_rq->avg.util_avg);
6154
6155        /* Discount task's util from CPU's util */
6156        lsub_positive(&util, task_util(p));
6157
6158        /*
6159         * Covered cases:
6160         *
6161         * a) if *p is the only task sleeping on this CPU, then:
6162         *      cpu_util (== task_util) > util_est (== 0)
6163         *    and thus we return:
6164         *      cpu_util_without = (cpu_util - task_util) = 0
6165         *
6166         * b) if other tasks are SLEEPING on this CPU, which is now exiting
6167         *    IDLE, then:
6168         *      cpu_util >= task_util
6169         *      cpu_util > util_est (== 0)
6170         *    and thus we discount *p's blocked utilization to return:
6171         *      cpu_util_without = (cpu_util - task_util) >= 0
6172         *
6173         * c) if other tasks are RUNNABLE on that CPU and
6174         *      util_est > cpu_util
6175         *    then we use util_est since it returns a more restrictive
6176         *    estimation of the spare capacity on that CPU, by just
6177         *    considering the expected utilization of tasks already
6178         *    runnable on that CPU.
6179         *
6180         * Cases a) and b) are covered by the above code, while case c) is
6181         * covered by the following code when estimated utilization is
6182         * enabled.
6183         */
6184        if (sched_feat(UTIL_EST)) {
6185                unsigned int estimated =
6186                        READ_ONCE(cfs_rq->avg.util_est.enqueued);
6187
6188                /*
6189                 * Despite the following checks we still have a small window
6190                 * for a possible race, when an execl's select_task_rq_fair()
6191                 * races with LB's detach_task():
6192                 *
6193                 *   detach_task()
6194                 *     p->on_rq = TASK_ON_RQ_MIGRATING;
6195                 *     ---------------------------------- A
6196                 *     deactivate_task()                   \
6197                 *       dequeue_task()                     + RaceTime
6198                 *         util_est_dequeue()              /
6199                 *     ---------------------------------- B
6200                 *
6201                 * The additional check on "current == p" is required to
6202                 * properly fix the execl regression and helps in further
6203                 * reducing the chances of the above race.
6204                 */
6205                if (unlikely(task_on_rq_queued(p) || current == p))
6206                        lsub_positive(&estimated, _task_util_est(p));
6207
6208                util = max(util, estimated);
6209        }
6210
6211        /*
6212         * Utilization (estimated) can exceed the CPU capacity, thus let's
6213         * clamp to the maximum CPU capacity to ensure consistency with
6214         * the cpu_util call.
6215         */
6216        return min_t(unsigned long, util, capacity_orig_of(cpu));
6217}
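
/*
 * The lsub_positive()/sub_positive() helpers used above are assumed to
 * subtract while clamping at zero rather than letting the unsigned value
 * wrap around; a minimal local sketch of that behaviour:
 */
static inline void lsub_positive_sketch(unsigned int *val, unsigned int sub)
{
        /* Remove at most what is actually accounted */
        *val -= (sub < *val) ? sub : *val;
}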
6218
6219/*
6220 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
6221 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
6222 *
6223 * In that case WAKE_AFFINE doesn't make sense and we'll let
6224 * BALANCE_WAKE sort things out.
6225 */
6226static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6227{
6228        long min_cap, max_cap;
6229
6230        if (!static_branch_unlikely(&sched_asym_cpucapacity))
6231                return 0;
6232
6233        min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6234        max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
6235
6236        /* Minimum capacity is close to max, no need to abort wake_affine */
6237        if (max_cap - min_cap < max_cap >> 3)
6238                return 0;
6239
6240        /* Bring task utilization in sync with prev_cpu */
6241        sync_entity_load_avg(&p->se);
6242
6243        return !task_fits_capacity(p, min_cap);
6244}
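
/*
 * Worked example for the capacity check above (hypothetical big.LITTLE
 * numbers): with capacity_orig_of() returning 1024 for the big waking CPU
 * and 446 for the little prev_cpu, max_cap - min_cap = 578 exceeds
 * max_cap >> 3 = 128, so the asymmetry matters and the result depends on
 * whether the task still fits in 446 of capacity. On a symmetric system
 * (1024 vs 1024) the difference is 0 < 128 and wake_affine is never
 * aborted here.
 */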
6245
6246/*
6247 * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
6248 * to @dst_cpu.
6249 */
6250static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
6251{
6252        struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
6253        unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
6254
6255        /*
6256         * If @p migrates from @cpu to another, remove its contribution. Or,
6257         * if @p migrates from another CPU to @cpu, add its contribution. In
6258         * the other cases, @cpu is not impacted by the migration, so the
6259         * util_avg should already be correct.
6260         */
6261        if (task_cpu(p) == cpu && dst_cpu != cpu)
6262                sub_positive(&util, task_util(p));
6263        else if (task_cpu(p) != cpu && dst_cpu == cpu)
6264                util += task_util(p);
6265
6266        if (sched_feat(UTIL_EST)) {
6267                util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
6268
6269                /*
6270                 * During wake-up, the task isn't enqueued yet and doesn't
6271                 * appear in the cfs_rq->avg.util_est.enqueued of any rq,
6272                 * so just add it (if needed) to "simulate" what will be
6273                 * cpu_util() after the task has been enqueued.
6274                 */
6275                if (dst_cpu == cpu)
6276                        util_est += _task_util_est(p);
6277
6278                util = max(util, util_est);
6279        }
6280
6281        return min(util, capacity_orig_of(cpu));
6282}
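
/*
 * Worked example for the cases above (hypothetical numbers): assume
 * task_util(p) = 200 and p currently lives on CPU1. When evaluating a
 * migration to CPU2, cpu_util_next(1, p, 2) removes 200 from CPU1's
 * util_avg (p is leaving), cpu_util_next(2, p, 2) adds 200 to CPU2's
 * util_avg (p is arriving), and cpu_util_next(3, p, 2) leaves CPU3's
 * util_avg untouched.
 */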
6283
6284/*
6285 * compute_energy(): Estimates the energy that would be consumed if @p was
6286 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
6287 * landscape of the CPUs after the task migration, and uses the Energy Model
6288 * to compute what would be the energy if we decided to actually migrate that
6289 * task.
6290 */
6291static long
6292compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6293{
6294        unsigned int max_util, util_cfs, cpu_util, cpu_cap;
6295        unsigned long sum_util, energy = 0;
6296        struct task_struct *tsk;
6297        int cpu;
6298
6299        for (; pd; pd = pd->next) {
6300                struct cpumask *pd_mask = perf_domain_span(pd);
6301
6302                /*
6303                 * The energy model mandates all the CPUs of a performance
6304                 * domain have the same capacity.
6305                 */
6306                cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6307                max_util = sum_util = 0;
6308
6309                /*
6310                 * The capacity state of CPUs of the current rd can be driven by
6311                 * CPUs of another rd if they belong to the same performance
6312                 * domain. So, account for the utilization of these CPUs too
6313                 * by masking pd with cpu_online_mask instead of the rd span.
6314                 *
6315                 * If an entire performance domain is outside of the current rd,
6316                 * it will not appear in its pd list and will not be accounted
6317                 * by compute_energy().
6318                 */
6319                for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6320                        util_cfs = cpu_util_next(cpu, p, dst_cpu);
6321
6322                        /*
6323                         * Busy time computation: utilization clamping is not
6324                         * required since the ratio (sum_util / cpu_capacity)
6325                         * is already enough to scale the EM reported power
6326                         * consumption at the (possibly clamped) cpu_capacity.
6327                         */
6328                        sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6329                                                       ENERGY_UTIL, NULL);
6330
6331                        /*
6332                         * Performance domain frequency: utilization clamping
6333                         * must be considered since it affects the selection
6334                         * of the performance domain frequency.
6335                         * NOTE: in case RT tasks are running, by default the
6336                         * FREQUENCY_UTIL's utilization can be max OPP.
6337                         */
6338                        tsk = cpu == dst_cpu ? p : NULL;
6339                        cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6340                                                      FREQUENCY_UTIL, tsk);
6341                        max_util = max(max_util, cpu_util);
6342                }
6343
6344                energy += em_pd_energy(pd->em_pd, max_util, sum_util);
6345        }
6346
6347        return energy;
6348}
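
/*
 * A rough sketch of what the Energy Model is asked to do with max_util and
 * sum_util above (hypothetical helper and cost table; em_pd_energy() is the
 * authoritative implementation): max_util selects the lowest OPP whose
 * capacity covers it, and the domain's busy time (sum_util / cpu_cap)
 * scales that OPP's cost.
 */
struct opp_sketch {
        unsigned long cap;      /* capacity delivered at this OPP */
        unsigned long cost;     /* pre-computed energy cost coefficient */
};

static unsigned long pd_energy_sketch(const struct opp_sketch *table,
                                      int nr_opps, unsigned long cpu_cap,
                                      unsigned long max_util,
                                      unsigned long sum_util)
{
        int i;

        /* Pick the lowest OPP able to serve the highest per-CPU utilization */
        for (i = 0; i < nr_opps - 1; i++) {
                if (table[i].cap >= max_util)
                        break;
        }

        /* Energy ~ cost of the chosen OPP scaled by the domain's busy time */
        return table[i].cost * sum_util / cpu_cap;
}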
6349
6350/*
6351 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
6352 * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
6353 * spare capacity in each performance domain and uses it as a potential
6354 * candidate to execute the task. Then, it uses the Energy Model to figure
6355 * out which of the CPU candidates is the most energy-efficient.
6356 *
6357 * The rationale for this heuristic is as follows. In a performance domain,
6358 * all the most energy efficient CPU candidates (according to the Energy
6359 * Model) are those for which we'll request a low frequency. When there are
6360 * several CPUs for which the frequency request will be the same, we don't
6361 * have enough data to break the tie between them, because the Energy Model
6362 * only includes active power costs. With this model, if we assume that
6363 * frequency requests follow utilization (e.g. using schedutil), the CPU with
6364 * the maximum spare capacity in a performance domain is guaranteed to be among
6365 * the best candidates of the performance domain.
6366 *
6367 * In practice, it could be preferable from an energy standpoint to pack
6368 * small tasks on a CPU in order to let other CPUs go into deeper idle states,
6369 * but that could also hurt our chances of going cluster idle, and we have no
6370 * way to tell with the current Energy Model whether this is actually a good
6371 * idea or not. So, find_energy_efficient_cpu() basically favors
6372 * cluster-packing, and spreading inside a cluster. That should at least be
6373 * a good thing for latency, and this is consistent with the idea that most
6374 * of the energy savings of EAS come from the asymmetry of the system, and
6375 * not so much from breaking the tie between identical CPUs. That's also the
6376 * reason why EAS is enabled in the topology code only for systems where
6377 * SD_ASYM_CPUCAPACITY is set.
6378 *
6379 * NOTE: Forkees are not accepted in the energy-aware wake-up path because
6380 * they don't have any useful utilization data yet and it's not possible to
6381 * forecast their impact on energy consumption. Consequently, they will be
6382 * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
6383 * to be energy-inefficient in some use-cases. The alternative would be to
6384 * bias new tasks towards specific types of CPUs first, or to try to infer
6385 * their util_avg from the parent task, but those heuristics could hurt
6386 * other use-cases too. So, until someone finds a better way to solve this,
6387 * let's keep things simple by re-using the existing slow path.
6388 */
6389
6390static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6391{
6392        unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
6393        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
6394        int cpu, best_energy_cpu = prev_cpu;
6395        struct perf_domain *head, *pd;
6396        unsigned long cpu_cap, util;
6397        struct sched_domain *sd;
6398
6399        rcu_read_lock();
6400        pd = rcu_dereference(rd->pd);
6401        if (!pd || READ_ONCE(rd->overutilized))
6402                goto fail;
6403        head = pd;
6404
6405        /*
6406         * Energy-aware wake-up happens on the lowest sched_domain starting
6407         * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
6408         */
6409        sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
6410        while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
6411                sd = sd->parent;
6412        if (!sd)
6413                goto fail;
6414
6415        sync_entity_load_avg(&p->se);
6416        if (!task_util_est(p))
6417                goto unlock;
6418
6419        for (; pd; pd = pd->next) {
6420                unsigned long cur_energy, spare_cap, max_spare_cap = 0;
6421                int max_spare_cap_cpu = -1;
6422
6423                for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6424                        if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6425                                continue;
6426
6427                        /* Skip CPUs that will be overutilized. */
6428                        util = cpu_util_next(cpu, p, cpu);
6429                        cpu_cap = capacity_of(cpu);
6430                        if (cpu_cap * 1024 < util * capacity_margin)
6431                                continue;
6432
6433                        /* Always use prev_cpu as a candidate. */
6434                        if (cpu == prev_cpu) {
6435                                prev_energy = compute_energy(p, prev_cpu, head);
6436                                best_energy = min(best_energy, prev_energy);
6437                                continue;
6438                        }
6439
6440                        /*
6441                         * Find the CPU with the maximum spare capacity in
6442                         * the performance domain
6443                         */
6444                        spare_cap = cpu_cap - util;
6445                        if (spare_cap > max_spare_cap) {
6446                                max_spare_cap = spare_cap;
6447                                max_spare_cap_cpu = cpu;
6448                        }
6449                }
6450
6451                /* Evaluate the energy impact of using this CPU. */
6452                if (max_spare_cap_cpu >= 0) {
6453                        cur_energy = compute_energy(p, max_spare_cap_cpu, head);
6454                        if (cur_energy < best_energy) {
6455                                best_energy = cur_energy;
6456                                best_energy_cpu = max_spare_cap_cpu;
6457                        }
6458                }
6459        }
6460unlock:
6461        rcu_read_unlock();
6462
6463        /*
6464         * Pick the best CPU if prev_cpu cannot be used, or if it saves at
6465         * least 1/16 (about 6%) of the energy used by prev_cpu.
6466         */
6467        if (prev_energy == ULONG_MAX)
6468                return best_energy_cpu;
6469
6470        if ((prev_energy - best_energy) > (prev_energy >> 4))
6471                return best_energy_cpu;
6472
6473        return prev_cpu;
6474
6475fail:
6476        rcu_read_unlock();
6477
6478        return -1;
6479}
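
/*
 * Worked example for the threshold above (hypothetical numbers): with
 * prev_energy = 1600 and best_energy = 1480 the saving is 120, which is
 * larger than prev_energy >> 4 = 100, so the best candidate is returned.
 * Had best_energy been 1520, the 80 units saved would not justify leaving
 * prev_cpu and we would stay put.
 */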
6480
6481/*
6482 * select_task_rq_fair: Select target runqueue for the waking task in domains
6483 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
6484 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
6485 *
6486 * Balances load by selecting the idlest CPU in the idlest group, or under
6487 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
6488 *
6489 * Returns the target CPU number.
6490 *
6491 * preempt must be disabled.
6492 */
6493static int
6494select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
6495{
6496        struct sched_domain *tmp, *sd = NULL;
6497        int cpu = smp_processor_id();
6498        int new_cpu = prev_cpu;
6499        int want_affine = 0;
6500        int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6501
6502        if (sd_flag & SD_BALANCE_WAKE) {
6503                record_wakee(p);
6504
6505                if (sched_energy_enabled()) {
6506                        new_cpu = find_energy_efficient_cpu(p, prev_cpu);
6507                        if (new_cpu >= 0)
6508                                return new_cpu;
6509                        new_cpu = prev_cpu;
6510                }
6511
6512                want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
6513                              cpumask_test_cpu(cpu, p->cpus_ptr);
6514        }
6515
6516        rcu_read_lock();
6517        for_each_domain(cpu, tmp) {
6518                if (!(tmp->flags & SD_LOAD_BALANCE))
6519                        break;
6520
6521                /*
6522                 * If both 'cpu' and 'prev_cpu' are part of this domain,
6523                 * cpu is a valid SD_WAKE_AFFINE target.
6524                 */
6525                if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6526                    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
6527                        if (cpu != prev_cpu)
6528                                new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
6529
6530                        sd = NULL; /* Prefer wake_affine over balance flags */
6531                        break;
6532                }
6533
6534                if (tmp->flags & sd_flag)
6535                        sd = tmp;
6536                else if (!want_affine)
6537                        break;
6538        }
6539
6540        if (unlikely(sd)) {
6541                /* Slow path */
6542                new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
6543        } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
6544                /* Fast path */
6545
6546                new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6547
6548                if (want_affine)
6549                        current->recent_used_cpu = cpu;
6550        }
6551        rcu_read_unlock();
6552
6553        return new_cpu;
6554}
6555
6556static void detach_entity_cfs_rq(struct sched_entity *se);
6557
6558/*
6559 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
6560 * cfs_rq_of(p) references at time of call are still valid and identify the
6561 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
6562 */
6563static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
6564{
6565        /*
6566         * As blocked tasks retain absolute vruntime the migration needs to
6567         * deal with this by subtracting the old and adding the new
6568         * min_vruntime -- the latter is done by enqueue_entity() when placing
6569         * the task on the new runqueue.
6570         */
6571        if (p->state == TASK_WAKING) {
6572                struct sched_entity *se = &p->se;
6573                struct cfs_rq *cfs_rq = cfs_rq_of(se);
6574                u64 min_vruntime;
6575
6576#ifndef CONFIG_64BIT
6577                u64 min_vruntime_copy;
6578
6579                do {
6580                        min_vruntime_copy = cfs_rq->min_vruntime_copy;
6581                        smp_rmb();
6582                        min_vruntime = cfs_rq->min_vruntime;
6583                } while (min_vruntime != min_vruntime_copy);
6584#else
6585                min_vruntime = cfs_rq->min_vruntime;
6586#endif
6587
6588                se->vruntime -= min_vruntime;
6589        }
6590
6591        if (p->on_rq == TASK_ON_RQ_MIGRATING) {
6592                /*
6593                 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
6594                 * rq->lock and can modify state directly.
6595                 */
6596                lockdep_assert_held(&task_rq(p)->lock);
6597                detach_entity_cfs_rq(&p->se);
6598
6599        } else {
6600                /*
6601                 * We are supposed to update the task to "current" time, so
6602                 * that it is up to date and ready to go to the new CPU/cfs_rq.
6603                 * But we have difficulty getting hold of the current time here,
6604                 * so simply throw away the out-of-date time. This results in
6605                 * the wakee task being less decayed, but giving the wakee more
6606                 * load does not sound bad.
6607                 */
6608                remove_entity_load_avg(&p->se);
6609        }
6610
6611        /* Tell new CPU we are migrated */
6612        p->se.avg.last_update_time = 0;
6613
6614        /* We have migrated, no longer consider this task hot */
6615        p->se.exec_start = 0;
6616
6617        update_scan_period(p, new_cpu);
6618}
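
/*
 * The 32-bit reader loop in migrate_task_rq_fair() pairs with a writer on
 * the min_vruntime update side; a minimal sketch of the expected counterpart
 * (illustrative only -- the real update lives with min_vruntime maintenance):
 */
static inline void write_min_vruntime_sketch(u64 *min_vruntime,
                                             u64 *min_vruntime_copy, u64 val)
{
        /* Publish the new value first ... */
        WRITE_ONCE(*min_vruntime, val);
        /* ... order it before the copy the reader compares against ... */
        smp_wmb();
        /* ... so a matching copy implies a coherent min_vruntime. */
        WRITE_ONCE(*min_vruntime_copy, val);
}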
6619
6620static void task_dead_fair(struct task_struct *p)
6621{
6622        remove_entity_load_avg(&p->se);
6623}
6624#endif /* CONFIG_SMP */
6625
6626static unsigned long wakeup_gran(struct sched_entity *se)
6627{
6628        unsigned long gran = sysctl_sched_wakeup_granularity;
6629
6630        /*
6631         * Since it is curr that is running now, convert the gran from
6632         * real-time to virtual-time in its units.
6633         *
6634         * By using 'se' instead of 'curr' we penalize light tasks, so
6635         * they get preempted easier. That is, if 'se' < 'curr' then
6636         * the resulting gran will be larger, therefore penalizing the
6637         * lighter, if otoh 'se' > 'curr' then the resulting gran will
6638         * be smaller, again penalizing the lighter task.
6639         *
6640         * This is especially important for buddies when the leftmost
6641         * task is higher priority than the buddy.
6642         */
6643        return calc_delta_fair(gran, se);
6644}
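
/*
 * Worked example for the conversion above (taking gran = 1ms for
 * illustration): calc_delta_fair() roughly scales gran by
 * NICE_0_LOAD / se->load.weight, so a nice +5 waker (weight 335 in
 * sched_prio_to_weight[]) sees a virtual gran of about
 * 1ms * 1024 / 335 ~= 3ms and needs a larger vruntime lead before it may
 * preempt, while a nice -5 waker (weight 3121) sees only ~0.33ms.
 */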
6645
6646/*
6647 * Should 'se' preempt 'curr'?
6648 *
6649 *             |s1
6650 *        |s2
6651 *   |s3
6652 *         g
6653 *      |<--->|c
6654 *
6655 *  w(c, s1) = -1
6656 *  w(c, s2) =  0
6657 *  w(c, s3) =  1
6658 *
6659 */
6660static int
6661wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
6662{
6663        s64 gran, vdiff = curr->vruntime - se->vruntime;
6664
6665        if (vdiff <= 0)
6666                return -1;
6667
6668        gran = wakeup_gran(se);
6669        if (vdiff > gran)
6670                return 1;
6671
6672        return 0;
6673}
6674
6675static void set_last_buddy(struct sched_entity *se)
6676{
6677        if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
6678                return;
6679
6680        for_each_sched_entity(se) {
6681                if (SCHED_WARN_ON(!se->on_rq))
6682                        return;
6683                cfs_rq_of(se)->last = se;
6684        }
6685}
6686
6687static void set_next_buddy(struct sched_entity *se)
6688{
6689        if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
6690                return;
6691
6692        for_each_sched_entity(se) {
6693                if (SCHED_WARN_ON(!se->on_rq))
6694                        return;
6695                cfs_rq_of(se)->next = se;
6696        }
6697}
6698
6699static void set_skip_buddy(struct sched_entity *se)
6700{
6701        for_each_sched_entity(se)
6702                cfs_rq_of(se)->skip = se;
6703}
6704
6705/*
6706 * Preempt the current task with a newly woken task if needed:
6707 */
6708static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
6709{
6710        struct task_struct *curr = rq->curr;
6711        struct sched_entity *se = &curr->se, *pse = &p->se;
6712        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6713        int scale = cfs_rq->nr_running >= sched_nr_latency;
6714        int next_buddy_marked = 0;
6715
6716        if (unlikely(se == pse))
6717                return;
6718
6719        /*
6720         * This is possible from callers such as attach_tasks(), in which we
6721         * unconditionally check_preempt_curr() after an enqueue (which may have
6722         * led to a throttle).  This both saves work and prevents false
6723         * next-buddy nomination below.
6724         */
6725        if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
6726                return;
6727
6728        if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
6729                set_next_buddy(pse);
6730                next_buddy_marked = 1;
6731        }
6732
6733        /*
6734         * We can come here with TIF_NEED_RESCHED already set from new task
6735         * wake up path.
6736         *
6737         * Note: this also catches the edge-case of curr being in a throttled
6738         * group (e.g. via set_curr_task), since update_curr() (in the
6739         * enqueue of curr) will have resulted in resched being set.  This
6740         * prevents us from potentially nominating it as a false LAST_BUDDY
6741         * below.
6742         */
6743        if (test_tsk_need_resched(curr))
6744                return;
6745
6746        /* Idle tasks are by definition preempted by non-idle tasks. */
6747        if (unlikely(task_has_idle_policy(curr)) &&
6748            likely(!task_has_idle_policy(p)))
6749                goto preempt;
6750
6751        /*
6752         * Batch and idle tasks do not preempt non-idle tasks (their preemption
6753         * is driven by the tick):
6754         */
6755        if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
6756                return;
6757
6758        find_matching_se(&se, &pse);
6759        update_curr(cfs_rq_of(se));
6760        BUG_ON(!pse);
6761        if (wakeup_preempt_entity(se, pse) == 1) {
6762                /*
6763                 * Bias pick_next to pick the sched entity that is
6764                 * triggering this preemption.
6765                 */
6766                if (!next_buddy_marked)
6767                        set_next_buddy(pse);
6768                goto preempt;
6769        }
6770
6771        return;
6772
6773preempt:
6774        resched_curr(rq);
6775        /*
6776         * Only set the backward buddy when the current task is still
6777         * on the rq. This can happen when a wakeup gets interleaved
6778         * with schedule on the ->pre_schedule() or idle_balance()
6779         * point, either of which can drop the rq lock.
6780         *
6781         * Also, during early boot the idle thread is in the fair class;
6782         * for obvious reasons it's a bad idea to schedule back to it.
6783         */
6784        if (unlikely(!se->on_rq || curr == rq->idle))
6785                return;
6786
6787        if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
6788                set_last_buddy(se);
6789}
6790
6791static struct task_struct *
6792pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6793{
6794        struct cfs_rq *cfs_rq = &rq->cfs;
6795        struct sched_entity *se;
6796        struct task_struct *p;
6797        int new_tasks;
6798
6799again:
6800        if (!cfs_rq->nr_running)
6801                goto idle;
6802
6803#ifdef CONFIG_FAIR_GROUP_SCHED
6804        if (prev->sched_class != &fair_sched_class)
6805                goto simple;
6806
6807        /*
6808         * Because of the set_next_buddy() in dequeue_task_fair() it is rather
6809         * likely that the next task is from the same cgroup as the current one.
6810         *
6811         * Therefore attempt to avoid putting and setting the entire cgroup
6812         * hierarchy, only change the part that actually changes.
6813         */
6814
6815        do {
6816                struct sched_entity *curr = cfs_rq->curr;
6817
6818                /*
6819                 * Since we got here without doing put_prev_entity() we also
6820                 * have to consider cfs_rq->curr. If it is still a runnable
6821                 * entity, update_curr() will update its vruntime, otherwise
6822                 * forget we've ever seen it.
6823                 */
6824                if (curr) {
6825                        if (curr->on_rq)
6826                                update_curr(cfs_rq);
6827                        else
6828                                curr = NULL;
6829
6830                        /*
6831                         * This call to check_cfs_rq_runtime() will do the
6832                         * throttle and dequeue its entity in the parent(s).
6833                         * Therefore the nr_running test will indeed
6834                         * be correct.
6835                         */
6836                        if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
6837                                cfs_rq = &rq->cfs;
6838
6839                                if (!cfs_rq->nr_running)
6840                                        goto idle;
6841
6842                                goto simple;
6843                        }
6844                }
6845
6846                se = pick_next_entity(cfs_rq, curr);
6847                cfs_rq = group_cfs_rq(se);
6848        } while (cfs_rq);
6849
6850        p = task_of(se);
6851
6852        /*
6853         * Since we haven't yet done put_prev_entity() and the selected task
6854         * may be a different task than the one we started out with, try to
6855         * touch the smallest number of cfs_rqs.
6856         */
6857        if (prev != p) {
6858                struct sched_entity *pse = &prev->se;
6859
6860                while (!(cfs_rq = is_same_group(se, pse))) {
6861                        int se_depth = se->depth;
6862                        int pse_depth = pse->depth;
6863
6864                        if (se_depth <= pse_depth) {
6865                                put_prev_entity(cfs_rq_of(pse), pse);
6866                                pse = parent_entity(pse);
6867                        }
6868                        if (se_depth >= pse_depth) {
6869                                set_next_entity(cfs_rq_of(se), se);
6870                                se = parent_entity(se);
6871                        }
6872                }
6873
6874                put_prev_entity(cfs_rq, pse);
6875                set_next_entity(cfs_rq, se);
6876        }
6877
6878        goto done;
6879simple:
6880#endif
6881
6882        put_prev_task(rq, prev);
6883
6884        do {
6885                se = pick_next_entity(cfs_rq, NULL);
6886                set_next_entity(cfs_rq, se);
6887                cfs_rq = group_cfs_rq(se);
6888        } while (cfs_rq);
6889
6890        p = task_of(se);
6891
6892done: __maybe_unused;
6893#ifdef CONFIG_SMP
6894        /*
6895         * Move the next running task to the front of
6896         * the list, so that our cfs_tasks list becomes an
6897         * MRU one.
6898         */
6899        list_move(&p->se.group_node, &rq->cfs_tasks);
6900#endif
6901
6902        if (hrtick_enabled(rq))
6903                hrtick_start_fair(rq, p);
6904
6905        update_misfit_status(p, rq);
6906
6907        return p;
6908
6909idle:
6910        update_misfit_status(NULL, rq);
6911        new_tasks = idle_balance(rq, rf);
6912
6913        /*
6914         * Because idle_balance() releases (and re-acquires) rq->lock, it is
6915         * possible for any higher priority task to appear. In that case we
6916         * must re-start the pick_next_entity() loop.
6917         */
6918        if (new_tasks < 0)
6919                return RETRY_TASK;
6920
6921        if (new_tasks > 0)
6922                goto again;
6923
6924        /*
6925         * rq is about to be idle, check if we need to update the
6926         * lost_idle_time of clock_pelt
6927         */
6928        update_idle_rq_clock_pelt(rq);
6929
6930        return NULL;
6931}
6932
6933/*
6934 * Account for a descheduled task:
6935 */
6936static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
6937{
6938        struct sched_entity *se = &prev->se;
6939        struct cfs_rq *cfs_rq;
6940
6941        for_each_sched_entity(se) {
6942                cfs_rq = cfs_rq_of(se);
6943                put_prev_entity(cfs_rq, se);
6944        }
6945}
6946
6947/*
6948 * sched_yield() is very simple
6949 *
6950 * The magic of dealing with the ->skip buddy is in pick_next_entity.
6951 */
6952static void yield_task_fair(struct rq *rq)
6953{
6954        struct task_struct *curr = rq->curr;
6955        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6956        struct sched_entity *se = &curr->se;
6957
6958        /*
6959         * Are we the only task in the tree?
6960         */
6961        if (unlikely(rq->nr_running == 1))
6962                return;
6963
6964        clear_buddies(cfs_rq, se);
6965
6966        if (curr->policy != SCHED_BATCH) {
6967                update_rq_clock(rq);
6968                /*
6969                 * Update run-time statistics of the 'current'.
6970                 */
6971                update_curr(cfs_rq);
6972                /*
6973                 * Tell update_rq_clock() that we've just updated,
6974                 * so we don't do microscopic update in schedule()
6975                 * and double the fastpath cost.
6976                 */
6977                rq_clock_skip_update(rq);
6978        }
6979
6980        set_skip_buddy(se);
6981}
6982
6983static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
6984{
6985        struct sched_entity *se = &p->se;
6986
6987        /* throttled hierarchies are not runnable */
6988        if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
6989                return false;
6990
6991        /* Tell the scheduler that we'd really like pse to run next. */
6992        set_next_buddy(se);
6993
6994        yield_task_fair(rq);
6995
6996        return true;
6997}
6998
6999#ifdef CONFIG_SMP
7000/**************************************************
7001 * Fair scheduling class load-balancing methods.
7002 *
7003 * BASICS
7004 *
7005 * The purpose of load-balancing is to achieve the same basic fairness the
7006 * per-CPU scheduler provides, namely provide a proportional amount of compute
7007 * time to each task. This is expressed in the following equation:
7008 *
7009 *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
7010 *
7011 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
7012 * W_i,0 is defined as:
7013 *
7014 *   W_i,0 = \Sum_j w_i,j                                             (2)
7015 *
7016 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
7017 * is derived from the nice value as per sched_prio_to_weight[].
7018 *
7019 * The weight average is an exponential decay average of the instantaneous
7020 * weight:
7021 *
7022 *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
7023 *
7024 * C_i is the compute capacity of CPU i, typically it is the
7025 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
7026 * can also include other factors [XXX].
7027 *
7028 * To achieve this balance we define a measure of imbalance which follows
7029 * directly from (1):
7030 *
7031 *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
7032 *
7033 * We then move tasks around to minimize the imbalance. In the continuous
7034 * function space it is obvious this converges, in the discrete case we get
7035 * a few fun cases generally called infeasible weight scenarios.
7036 *
7037 * [XXX expand on:
7038 *     - infeasible weights;
7039 *     - local vs global optima in the discrete case. ]
7040 *
7041 *
7042 * SCHED DOMAINS
7043 *
7044 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
7045 * for all i,j solution, we create a tree of CPUs that follows the hardware
7046 * topology where each level pairs two lower groups (or better). This results
7047 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
7048 * tree to only the first of the previous level and we decrease the frequency
7049 * of load-balance at each level in inverse proportion to the number of CPUs in
7050 * the groups.
7051 *
7052 * This yields:
7053 *
7054 *     log_2 n     1     n
7055 *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
7056 *     i = 0      2^i   2^i
7057 *                               `- size of each group
7058 *         |         |     `- number of CPUs doing load-balance
7059 *         |         `- freq
7060 *         `- sum over all levels
7061 *
7062 * Coupled with a limit on how many tasks we can migrate every balance pass,
7063 * this makes (5) the runtime complexity of the balancer.
7064 *
7065 * An important property here is that each CPU is still (indirectly) connected
7066 * to every other CPU in at most O(log n) steps:
7067 *
7068 * The adjacency matrix of the resulting graph is given by:
7069 *
7070 *             log_2 n
7071 *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
7072 *             k = 0
7073 *
7074 * And you'll find that:
7075 *
7076 *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
7077 *
7078 * Showing there's indeed a path between every CPU in at most O(log n) steps.
7079 * The task movement gives a factor of O(m), giving a convergence complexity
7080 * of:
7081 *
7082 *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
7083 *
7084 *
7085 * WORK CONSERVING
7086 *
7087 * In order to avoid CPUs going idle while there's still work to do, new idle
7088 * balancing is more aggressive and has the newly idle CPU iterate up the domain
7089 * tree itself instead of relying on other CPUs to bring it work.
7090 *
7091 * This adds some complexity to both (5) and (8) but it reduces the total idle
7092 * time.
7093 *
7094 * [XXX more?]
7095 *
7096 *
7097 * CGROUPS
7098 *
7099 * Cgroups make a horror show out of (2), instead of a simple sum we get:
7100 *
7101 *                                s_k,i
7102 *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
7103 *                                 S_k
7104 *
7105 * Where
7106 *
7107 *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
7108 *
7109 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
7110 *
7111 * The big problem is S_k; it is a global sum needed to compute a local (W_i)
7112 * property.
7113 *
7114 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
7115 *      rewrite all of this once again.]
7116 */
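
/*
 * Illustrative sketch of the decay average in (3) as a fixed-point update
 * (hypothetical helper, n = 3 chosen arbitrarily): the previous average
 * keeps 7/8 of its weight and the instantaneous weight W_i,0 contributes
 * the remaining 1/8, so a CPU whose runnable weight jumps from 1024 to
 * 2048 moves to (7 * 1024 + 2048) / 8 = 1152 after one step.
 */
static inline unsigned long weight_decay_avg_sketch(unsigned long w_avg,
                                                    unsigned long w_inst)
{
        return (7 * w_avg + w_inst) / 8;
}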
7117
7118static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7119
7120enum fbq_type { regular, remote, all };
7121
7122enum group_type {
7123        group_other = 0,
7124        group_misfit_task,
7125        group_imbalanced,
7126        group_overloaded,
7127};
7128
7129#define LBF_ALL_PINNED  0x01
7130#define LBF_NEED_BREAK  0x02
7131#define LBF_DST_PINNED  0x04
7132#define LBF_SOME_PINNED 0x08
7133#define LBF_NOHZ_STATS  0x10
7134#define LBF_NOHZ_AGAIN  0x20
7135
7136struct lb_env {
7137        struct sched_domain     *sd;
7138
7139        struct rq               *src_rq;
7140        int                     src_cpu;
7141
7142        int                     dst_cpu;
7143        struct rq               *dst_rq;
7144
7145        struct cpumask          *dst_grpmask;
7146        int                     new_dst_cpu;
7147        enum cpu_idle_type      idle;
7148        long                    imbalance;
7149        /* The set of CPUs under consideration for load-balancing */
7150        struct cpumask          *cpus;
7151
7152        unsigned int            flags;
7153
7154        unsigned int            loop;
7155        unsigned int            loop_break;
7156        unsigned int            loop_max;
7157
7158        enum fbq_type           fbq_type;
7159        enum group_type         src_grp_type;
7160        struct list_head        tasks;
7161};
7162
7163/*
7164 * Is this task likely cache-hot:
7165 */
7166static int task_hot(struct task_struct *p, struct lb_env *env)
7167{
7168        s64 delta;
7169
7170        lockdep_assert_held(&env->src_rq->lock);
7171
7172        if (p->sched_class != &fair_sched_class)
7173                return 0;
7174
7175        if (unlikely(task_has_idle_policy(p)))
7176                return 0;
7177
7178        /*
7179         * Buddy candidates are cache hot:
7180         */
7181        if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
7182                        (&p->se == cfs_rq_of(&p->se)->next ||
7183                         &p->se == cfs_rq_of(&p->se)->last))
7184                return 1;
7185
7186        if (sysctl_sched_migration_cost == -1)
7187                return 1;
7188        if (sysctl_sched_migration_cost == 0)
7189                return 0;
7190
7191        delta = rq_clock_task(env->src_rq) - p->se.exec_start;
7192
7193        return delta < (s64)sysctl_sched_migration_cost;
7194}
7195
7196#ifdef CONFIG_NUMA_BALANCING
7197/*
7198 * Returns 1, if task migration degrades locality
7199 * Returns 0, if task migration improves locality, i.e. migration is preferred.
7200 * Returns -1, if task migration is not affected by locality.
7201 */
7202static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
7203{
7204        struct numa_group *numa_group = rcu_dereference(p->numa_group);
7205        unsigned long src_weight, dst_weight;
7206        int src_nid, dst_nid, dist;
7207
7208        if (!static_branch_likely(&sched_numa_balancing))
7209                return -1;
7210
7211        if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
7212                return -1;
7213
7214        src_nid = cpu_to_node(env->src_cpu);
7215        dst_nid = cpu_to_node(env->dst_cpu);
7216
7217        if (src_nid == dst_nid)
7218                return -1;
7219
7220        /* Migrating away from the preferred node is always bad. */
7221        if (src_nid == p->numa_preferred_nid) {
7222                if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7223                        return 1;
7224                else
7225                        return -1;
7226        }
7227
7228        /* Encourage migration to the preferred node. */
7229        if (dst_nid == p->numa_preferred_nid)
7230                return 0;
7231
7232        /* Leaving a core idle is often worse than degrading locality. */
7233        if (env->idle == CPU_IDLE)
7234                return -1;
7235
7236        dist = node_distance(src_nid, dst_nid);
7237        if (numa_group) {
7238                src_weight = group_weight(p, src_nid, dist);
7239                dst_weight = group_weight(p, dst_nid, dist);
7240        } else {
7241                src_weight = task_weight(p, src_nid, dist);
7242                dst_weight = task_weight(p, dst_nid, dist);
7243        }
7244
7245        return dst_weight < src_weight;
7246}
7247
7248#else
7249static inline int migrate_degrades_locality(struct task_struct *p,
7250                                             struct lb_env *env)
7251{
7252        return -1;
7253}
7254#endif
7255
7256/*
7257 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
7258 */
7259static
7260int can_migrate_task(struct task_struct *p, struct lb_env *env)
7261{
7262        int tsk_cache_hot;
7263
7264        lockdep_assert_held(&env->src_rq->lock);
7265
7266        /*
7267         * We do not migrate tasks that:
7268         * 1) are throttled (see throttled_lb_pair()), or
7269         * 2) cannot be migrated to this CPU due to cpus_ptr, or
7270         * 3) are currently running (obviously), or
7271         * 4) are cache-hot on their current CPU.
7272         */
7273        if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7274                return 0;
7275
7276        if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
7277                int cpu;
7278
7279                schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
7280
7281                env->flags |= LBF_SOME_PINNED;
7282
7283                /*
7284                 * Remember if this task can be migrated to any other CPU in
7285                 * our sched_group. We may want to revisit it if we couldn't
7286                 * meet load balance goals by pulling other tasks on src_cpu.
7287                 *
7288                 * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
7289                 * already computed one in current iteration.
7290                 */
7291                if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
7292                        return 0;
7293
7294                /* Prevent re-selecting dst_cpu via env's CPUs: */
7295                for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7296                        if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
7297                                env->flags |= LBF_DST_PINNED;
7298                                env->new_dst_cpu = cpu;
7299                                break;
7300                        }
7301                }
7302
7303                return 0;
7304        }
7305
7306        /* Record that we found at least one task that could run on dst_cpu */
7307        env->flags &= ~LBF_ALL_PINNED;
7308
7309        if (task_running(env->src_rq, p)) {
7310                schedstat_inc(p->se.statistics.nr_failed_migrations_running);
7311                return 0;
7312        }
7313
7314        /*
7315         * Aggressive migration if:
7316         * 1) destination NUMA node is preferred, or
7317         * 2) task is cache cold, or
7318         * 3) too many balance attempts have failed.
7319         */
7320        tsk_cache_hot = migrate_degrades_locality(p, env);
7321        if (tsk_cache_hot == -1)
7322                tsk_cache_hot = task_hot(p, env);
7323
7324        if (tsk_cache_hot <= 0 ||
7325            env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
7326                if (tsk_cache_hot == 1) {
7327                        schedstat_inc(env->sd->lb_hot_gained[env->idle]);
7328                        schedstat_inc(p->se.statistics.nr_forced_migrations);
7329                }
7330                return 1;
7331        }
7332
7333        schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
7334        return 0;
7335}
7336
7337/*
7338 * detach_task() -- detach the task for the migration specified in env
7339 */
7340static void detach_task(struct task_struct *p, struct lb_env *env)
7341{
7342        lockdep_assert_held(&env->src_rq->lock);
7343
7344        deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
7345        set_task_cpu(p, env->dst_cpu);
7346}
7347
7348/*
7349 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
7350 * part of active balancing operations within "domain".
7351 *
7352 * Returns a task if successful and NULL otherwise.
7353 */
7354static struct task_struct *detach_one_task(struct lb_env *env)
7355{
7356        struct task_struct *p;
7357
7358        lockdep_assert_held(&env->src_rq->lock);
7359
7360        list_for_each_entry_reverse(p,
7361                        &env->src_rq->cfs_tasks, se.group_node) {
7362                if (!can_migrate_task(p, env))
7363                        continue;
7364
7365                detach_task(p, env);
7366
7367                /*
7368                 * Right now, this is only the second place where
7369                 * lb_gained[env->idle] is updated (other is detach_tasks)
7370                 * so we can safely collect stats here rather than
7371                 * inside detach_tasks().
7372                 */
7373                schedstat_inc(env->sd->lb_gained[env->idle]);
7374                return p;
7375        }
7376        return NULL;
7377}
7378
7379static const unsigned int sched_nr_migrate_break = 32;
7380
7381/*
7382 * detach_tasks() -- tries to detach up to imbalance runnable load from
7383 * busiest_rq, as part of a balancing operation within domain "sd".
7384 *
7385 * Returns number of detached tasks if successful and 0 otherwise.
7386 */
7387static int detach_tasks(struct lb_env *env)
7388{
7389        struct list_head *tasks = &env->src_rq->cfs_tasks;
7390        struct task_struct *p;
7391        unsigned long load;
7392        int detached = 0;
7393
7394        lockdep_assert_held(&env->src_rq->lock);
7395
7396        if (env->imbalance <= 0)
7397                return 0;
7398
7399        while (!list_empty(tasks)) {
7400                /*
7401                 * We don't want to steal all the tasks, otherwise we may be
7402                 * treated likewise ourselves, which could at worst lead to a livelock.
7403                 */
7404                if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7405                        break;
7406
7407                p = list_last_entry(tasks, struct task_struct, se.group_node);
7408
7409                env->loop++;
7410                /* We've more or less seen every task there is, call it quits */
7411                if (env->loop > env->loop_max)
7412                        break;
7413
7414                /* take a breather every nr_migrate tasks */
7415                if (env->loop > env->loop_break) {
7416                        env->loop_break += sched_nr_migrate_break;
7417                        env->flags |= LBF_NEED_BREAK;
7418                        break;
7419                }
7420
7421                if (!can_migrate_task(p, env))
7422                        goto next;
7423
7424                load = task_h_load(p);
7425
7426                if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
7427                        goto next;
7428
7429                if ((load / 2) > env->imbalance)
7430                        goto next;
7431
7432                detach_task(p, env);
7433                list_add(&p->se.group_node, &env->tasks);
7434
7435                detached++;
7436                env->imbalance -= load;
7437
7438#ifdef CONFIG_PREEMPT
7439                /*
7440                 * NEWIDLE balancing is a source of latency, so preemptible
7441                 * kernels will stop after the first task is detached to minimize
7442                 * the critical section.
7443                 */
7444                if (env->idle == CPU_NEWLY_IDLE)
7445                        break;
7446#endif
7447
7448                /*
7449                 * We only want to steal up to the prescribed amount of
7450                 * runnable load.
7451                 */
7452                if (env->imbalance <= 0)
7453                        break;
7454
7455                continue;
7456next:
7457                list_move(&p->se.group_node, tasks);
7458        }
7459
7460        /*
7461         * Right now, this is one of only two places we collect this stat
7462         * so we can safely collect detach_one_task() stats here rather
7463         * than inside detach_one_task().
7464         */
7465        schedstat_add(env->sd->lb_gained[env->idle], detached);
7466
7467        return detached;
7468}
7469
7470/*
7471 * attach_task() -- attach the task detached by detach_task() to its new rq.
7472 */
7473static void attach_task(struct rq *rq, struct task_struct *p)
7474{
7475        lockdep_assert_held(&rq->lock);
7476
7477        BUG_ON(task_rq(p) != rq);
7478        activate_task(rq, p, ENQUEUE_NOCLOCK);
7479        check_preempt_curr(rq, p, 0);
7480}
7481
7482/*
7483 * attach_one_task() -- attaches the task returned from detach_one_task() to
7484 * its new rq.
7485 */
7486static void attach_one_task(struct rq *rq, struct task_struct *p)
7487{
7488        struct rq_flags rf;
7489
7490        rq_lock(rq, &rf);
7491        update_rq_clock(rq);
7492        attach_task(rq, p);
7493        rq_unlock(rq, &rf);
7494}
7495
7496/*
7497 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
7498 * new rq.
7499 */
7500static void attach_tasks(struct lb_env *env)
7501{
7502        struct list_head *tasks = &env->tasks;
7503        struct task_struct *p;
7504        struct rq_flags rf;
7505
7506        rq_lock(env->dst_rq, &rf);
7507        update_rq_clock(env->dst_rq);
7508
7509        while (!list_empty(tasks)) {
7510                p = list_first_entry(tasks, struct task_struct, se.group_node);
7511                list_del_init(&p->se.group_node);
7512
7513                attach_task(env->dst_rq, p);
7514        }
7515
7516        rq_unlock(env->dst_rq, &rf);
7517}
7518
7519#ifdef CONFIG_NO_HZ_COMMON
7520static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7521{
7522        if (cfs_rq->avg.load_avg)
7523                return true;
7524
7525        if (cfs_rq->avg.util_avg)
7526                return true;
7527
7528        return false;
7529}
7530
7531static inline bool others_have_blocked(struct rq *rq)
7532{
7533        if (READ_ONCE(rq->avg_rt.util_avg))
7534                return true;
7535
7536        if (READ_ONCE(rq->avg_dl.util_avg))
7537                return true;
7538
7539#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
7540        if (READ_ONCE(rq->avg_irq.util_avg))
7541                return true;
7542#endif
7543
7544        return false;
7545}
7546
7547static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
7548{
7549        rq->last_blocked_load_update_tick = jiffies;
7550
7551        if (!has_blocked)
7552                rq->has_blocked_load = 0;
7553}
7554#else
7555static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
7556static inline bool others_have_blocked(struct rq *rq) { return false; }
7557static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
7558#endif
7559
7560#ifdef CONFIG_FAIR_GROUP_SCHED
7561
7562static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
7563{
7564        if (cfs_rq->load.weight)
7565                return false;
7566
7567        if (cfs_rq->avg.load_sum)
7568                return false;
7569
7570        if (cfs_rq->avg.util_sum)
7571                return false;
7572
7573        if (cfs_rq->avg.runnable_load_sum)
7574                return false;
7575
7576        return true;
7577}
7578
7579static void update_blocked_averages(int cpu)
7580{
7581        struct rq *rq = cpu_rq(cpu);
7582        struct cfs_rq *cfs_rq, *pos;
7583        const struct sched_class *curr_class;
7584        struct rq_flags rf;
7585        bool done = true;
7586
7587        rq_lock_irqsave(rq, &rf);
7588        update_rq_clock(rq);
7589
7590        /*
7591         * Iterates the task_group tree in a bottom up fashion, see
7592         * list_add_leaf_cfs_rq() for details.
7593         */
7594        for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
7595                struct sched_entity *se;
7596
7597                if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
7598                        update_tg_load_avg(cfs_rq, 0);
7599
7600                /* Propagate pending load changes to the parent, if any: */
7601                se = cfs_rq->tg->se[cpu];
7602                if (se && !skip_blocked_update(se))
7603                        update_load_avg(cfs_rq_of(se), se, 0);
7604
7605                /*
7606                 * There can be a lot of idle CPU cgroups.  Don't let fully
7607                 * decayed cfs_rqs linger on the list.
7608                 */
7609                if (cfs_rq_is_decayed(cfs_rq))
7610                        list_del_leaf_cfs_rq(cfs_rq);
7611
7612                /* Don't need periodic decay once load/util_avg are null */
7613                if (cfs_rq_has_blocked(cfs_rq))
7614                        done = false;
7615        }
7616
7617        curr_class = rq->curr->sched_class;
7618        update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
7619        update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
7620        update_irq_load_avg(rq, 0);
7621        /* Don't need periodic decay once load/util_avg are null */
7622        if (others_have_blocked(rq))
7623                done = false;
7624
7625        update_blocked_load_status(rq, !done);
7626        rq_unlock_irqrestore(rq, &rf);
7627}
7628
7629/*
7630 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
7631 * This needs to be done in a top-down fashion because the load of a child
7632 * group is a fraction of its parents load.
7633 */
7634static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
7635{
7636        struct rq *rq = rq_of(cfs_rq);
7637        struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
7638        unsigned long now = jiffies;
7639        unsigned long load;
7640
7641        if (cfs_rq->last_h_load_update == now)
7642                return;
7643
7644        WRITE_ONCE(cfs_rq->h_load_next, NULL);
7645        for_each_sched_entity(se) {
7646                cfs_rq = cfs_rq_of(se);
7647                WRITE_ONCE(cfs_rq->h_load_next, se);
7648                if (cfs_rq->last_h_load_update == now)
7649                        break;
7650        }
7651
7652        if (!se) {
7653                cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
7654                cfs_rq->last_h_load_update = now;
7655        }
7656
7657        while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
7658                load = cfs_rq->h_load;
7659                load = div64_ul(load * se->avg.load_avg,
7660                        cfs_rq_load_avg(cfs_rq) + 1);
7661                cfs_rq = group_cfs_rq(se);
7662                cfs_rq->h_load = load;
7663                cfs_rq->last_h_load_update = now;
7664        }
7665}
7666
7667static unsigned long task_h_load(struct task_struct *p)
7668{
7669        struct cfs_rq *cfs_rq = task_cfs_rq(p);
7670
7671        update_cfs_rq_h_load(cfs_rq);
7672        return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
7673                        cfs_rq_load_avg(cfs_rq) + 1);
7674}
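/*
 * A rough worked example of the two helpers above, with assumed numbers
 * (illustrative load values, not from any real trace): if the root cfs_rq
 * has load_avg 2048, its h_load is 2048. A group entity with
 * se->avg.load_avg 1024 then gives its child cfs_rq an h_load of roughly
 * 2048 * 1024 / (2048 + 1) ~= 1023. A task in that group with
 * se.avg.load_avg 512, where the group's cfs_rq_load_avg() is 1024, gets
 * task_h_load() ~= 512 * 1023 / (1024 + 1) ~= 511, i.e. about half of the
 * group's share of the root load, as expected.
 */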
7675#else
7676static inline void update_blocked_averages(int cpu)
7677{
7678        struct rq *rq = cpu_rq(cpu);
7679        struct cfs_rq *cfs_rq = &rq->cfs;
7680        const struct sched_class *curr_class;
7681        struct rq_flags rf;
7682
7683        rq_lock_irqsave(rq, &rf);
7684        update_rq_clock(rq);
7685        update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
7686
7687        curr_class = rq->curr->sched_class;
7688        update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
7689        update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
7690        update_irq_load_avg(rq, 0);
7691        update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
7692        rq_unlock_irqrestore(rq, &rf);
7693}
7694
7695static unsigned long task_h_load(struct task_struct *p)
7696{
7697        return p->se.avg.load_avg;
7698}
7699#endif
7700
7701/********** Helpers for find_busiest_group ************************/
7702
7703/*
7704 * sg_lb_stats - stats of a sched_group required for load_balancing
7705 */
7706struct sg_lb_stats {
7707        unsigned long avg_load; /* Avg load across the CPUs of the group */
7708        unsigned long group_load; /* Total load over the CPUs of the group */
7709        unsigned long load_per_task;
7710        unsigned long group_capacity;
7711        unsigned long group_util; /* Total utilization of the group */
7712        unsigned int sum_nr_running; /* Nr tasks running in the group */
7713        unsigned int idle_cpus;
7714        unsigned int group_weight;
7715        enum group_type group_type;
7716        int group_no_capacity;
7717        unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
7718#ifdef CONFIG_NUMA_BALANCING
7719        unsigned int nr_numa_running;
7720        unsigned int nr_preferred_running;
7721#endif
7722};
7723
7724/*
7725 * sd_lb_stats - Structure to store the statistics of a sched_domain
7726 *               during load balancing.
7727 */
7728struct sd_lb_stats {
7729        struct sched_group *busiest;    /* Busiest group in this sd */
7730        struct sched_group *local;      /* Local group in this sd */
7731        unsigned long total_running;
7732        unsigned long total_load;       /* Total load of all groups in sd */
7733        unsigned long total_capacity;   /* Total capacity of all groups in sd */
7734        unsigned long avg_load; /* Average load across all groups in sd */
7735
7736        struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
7737        struct sg_lb_stats local_stat;  /* Statistics of the local group */
7738};
7739
7740static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
7741{
7742        /*
7743         * Skimp on the clearing to avoid duplicate work. We can avoid clearing
7744         * local_stat because update_sg_lb_stats() does a full clear/assignment.
7745         * We must however clear busiest_stat::avg_load because
7746         * update_sd_pick_busiest() reads this before assignment.
7747         */
7748        *sds = (struct sd_lb_stats){
7749                .busiest = NULL,
7750                .local = NULL,
7751                .total_running = 0UL,
7752                .total_load = 0UL,
7753                .total_capacity = 0UL,
7754                .busiest_stat = {
7755                        .avg_load = 0UL,
7756                        .sum_nr_running = 0,
7757                        .group_type = group_other,
7758                },
7759        };
7760}
7761
7762static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
7763{
7764        struct rq *rq = cpu_rq(cpu);
7765        unsigned long max = arch_scale_cpu_capacity(cpu);
7766        unsigned long used, free;
7767        unsigned long irq;
7768
7769        irq = cpu_util_irq(rq);
7770
7771        if (unlikely(irq >= max))
7772                return 1;
7773
7774        used = READ_ONCE(rq->avg_rt.util_avg);
7775        used += READ_ONCE(rq->avg_dl.util_avg);
7776
7777        if (unlikely(used >= max))
7778                return 1;
7779
7780        free = max - used;
7781
7782        return scale_irq_capacity(free, irq, max);
7783}
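/*
 * Illustrative numbers for scale_rt_capacity() (assumed values): with
 * arch_scale_cpu_capacity() == 1024, rt util_avg == 128, dl util_avg == 64
 * and irq util == 100, free = 1024 - 192 = 832 and, with IRQ time
 * accounting enabled, scale_irq_capacity() scales that remainder by
 * roughly (max - irq) / max, i.e. 832 * 924 / 1024 ~= 750 is left over
 * for CFS tasks.
 */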
7784
7785static void update_cpu_capacity(struct sched_domain *sd, int cpu)
7786{
7787        unsigned long capacity = scale_rt_capacity(sd, cpu);
7788        struct sched_group *sdg = sd->groups;
7789
7790        cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
7791
7792        if (!capacity)
7793                capacity = 1;
7794
7795        cpu_rq(cpu)->cpu_capacity = capacity;
7796        sdg->sgc->capacity = capacity;
7797        sdg->sgc->min_capacity = capacity;
7798        sdg->sgc->max_capacity = capacity;
7799}
7800
7801void update_group_capacity(struct sched_domain *sd, int cpu)
7802{
7803        struct sched_domain *child = sd->child;
7804        struct sched_group *group, *sdg = sd->groups;
7805        unsigned long capacity, min_capacity, max_capacity;
7806        unsigned long interval;
7807
7808        interval = msecs_to_jiffies(sd->balance_interval);
7809        interval = clamp(interval, 1UL, max_load_balance_interval);
7810        sdg->sgc->next_update = jiffies + interval;
7811
7812        if (!child) {
7813                update_cpu_capacity(sd, cpu);
7814                return;
7815        }
7816
7817        capacity = 0;
7818        min_capacity = ULONG_MAX;
7819        max_capacity = 0;
7820
7821        if (child->flags & SD_OVERLAP) {
7822                /*
7823                 * SD_OVERLAP domains cannot assume that child groups
7824                 * span the current group.
7825                 */
7826
7827                for_each_cpu(cpu, sched_group_span(sdg)) {
7828                        struct sched_group_capacity *sgc;
7829                        struct rq *rq = cpu_rq(cpu);
7830
7831                        /*
7832                         * build_sched_domains() -> init_sched_groups_capacity()
7833                         * gets here before we've attached the domains to the
7834                         * runqueues.
7835                         *
7836                         * Use capacity_of(), which is set irrespective of domains
7837                         * in update_cpu_capacity().
7838                         *
7839                         * This avoids capacity from being 0 and
7840                         * causing divide-by-zero issues on boot.
7841                         */
7842                        if (unlikely(!rq->sd)) {
7843                                capacity += capacity_of(cpu);
7844                        } else {
7845                                sgc = rq->sd->groups->sgc;
7846                                capacity += sgc->capacity;
7847                        }
7848
7849                        min_capacity = min(capacity, min_capacity);
7850                        max_capacity = max(capacity, max_capacity);
7851                }
7852        } else  {
7853                /*
7854                 * !SD_OVERLAP domains can assume that child groups
7855                 * span the current group.
7856                 */
7857
7858                group = child->groups;
7859                do {
7860                        struct sched_group_capacity *sgc = group->sgc;
7861
7862                        capacity += sgc->capacity;
7863                        min_capacity = min(sgc->min_capacity, min_capacity);
7864                        max_capacity = max(sgc->max_capacity, max_capacity);
7865                        group = group->next;
7866                } while (group != child->groups);
7867        }
7868
7869        sdg->sgc->capacity = capacity;
7870        sdg->sgc->min_capacity = min_capacity;
7871        sdg->sgc->max_capacity = max_capacity;
7872}
7873
7874/*
7875 * Check whether the capacity of the rq has been noticeably reduced by side
7876 * activity. The imbalance_pct is used for the threshold.
7877 * Return true if the capacity is reduced.
7878 */
7879static inline int
7880check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7881{
7882        return ((rq->cpu_capacity * sd->imbalance_pct) <
7883                                (rq->cpu_capacity_orig * 100));
7884}
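/*
 * For example (assuming a typical imbalance_pct of 125 and a
 * cpu_capacity_orig of 1024): the check fires once cpu_capacity drops
 * below 1024 * 100 / 125 = 819, i.e. when more than ~20% of the CPU is
 * eaten by RT/DL/IRQ activity.
 */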
7885
7886/*
7887 * Check whether a rq has a misfit task and if it looks like we can actually
7888 * help that task: we can migrate the task to a CPU of higher capacity, or
7889 * the task's current CPU is heavily pressured.
7890 */
7891static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
7892{
7893        return rq->misfit_task_load &&
7894                (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
7895                 check_cpu_capacity(rq, sd));
7896}
7897
7898/*
7899 * Group imbalance indicates (and tries to solve) the problem where balancing
7900 * groups is inadequate due to ->cpus_ptr constraints.
7901 *
7902 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
7903 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
7904 * Something like:
7905 *
7906 *      { 0 1 2 3 } { 4 5 6 7 }
7907 *              *     * * *
7908 *
7909 * If we were to balance group-wise we'd place two tasks in the first group and
7910 * two tasks in the second group. Clearly this is undesired as it will overload
7911 * cpu 3 and leave one of the CPUs in the second group unused.
7912 *
7913 * The current solution to this issue is detecting the skew in the first group
7914 * by noticing the lower domain failed to reach balance and had difficulty
7915 * moving tasks due to affinity constraints.
7916 *
7917 * When this is detected, this group becomes a candidate for busiest; see
7918 * update_sd_pick_busiest(). calculate_imbalance() and find_busiest_group()
7919 * then avoid some of the usual balance conditions to allow it to create an
7920 * effective group imbalance.
7921 *
7922 * This is a somewhat tricky proposition since the next run might not find the
7923 * group imbalance and decide the groups need to be balanced again. A most
7924 * subtle and fragile situation.
7925 */
7926
7927static inline int sg_imbalanced(struct sched_group *group)
7928{
7929        return group->sgc->imbalance;
7930}
7931
7932/*
7933 * group_has_capacity returns true if the group has spare capacity that could
7934 * be used by some tasks.
7935 * We consider that a group has spare capacity if the number of tasks is
7936 * smaller than the number of CPUs or if the utilization is lower than the
7937 * available capacity for CFS tasks.
7938 * For the latter, we use a threshold to stabilize the state, to take into
7939 * account the variance of the tasks' load and to return true only if the
7940 * available capacity is meaningful for the load balancer.
7941 * As an example, an available capacity of 1% can show up but brings no
7942 * benefit to the load balancer.
7943 */
7944static inline bool
7945group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
7946{
7947        if (sgs->sum_nr_running < sgs->group_weight)
7948                return true;
7949
7950        if ((sgs->group_capacity * 100) >
7951                        (sgs->group_util * env->sd->imbalance_pct))
7952                return true;
7953
7954        return false;
7955}
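/*
 * Example with assumed numbers: a 4-CPU group (group_weight == 4,
 * group_capacity == 4096) running 4 tasks fails the first check, but with
 * imbalance_pct == 125 it is still considered to have spare capacity as
 * long as group_util stays below 4096 * 100 / 125 ~= 3276, i.e. below
 * roughly 80% of its capacity.
 */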
7956
7957/*
7958 *  group_is_overloaded returns true if the group has more tasks than it can
7959 *  handle.
7960 *  group_is_overloaded is not equal to !group_has_capacity because a group
7961 *  with exactly the right number of tasks has no spare capacity left but is not
7962 *  overloaded, so both group_has_capacity and group_is_overloaded return
7963 *  false.
7964 */
7965static inline bool
7966group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
7967{
7968        if (sgs->sum_nr_running <= sgs->group_weight)
7969                return false;
7970
7971        if ((sgs->group_capacity * 100) <
7972                        (sgs->group_util * env->sd->imbalance_pct))
7973                return true;
7974
7975        return false;
7976}
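/*
 * Continuing the assumed example above: with 5 tasks on the 4-CPU group and
 * group_util == 3500, 4096 * 100 < 3500 * 125 holds and the group is
 * overloaded. With exactly 4 tasks and utilization right at the ~80%
 * threshold, both group_has_capacity() and group_is_overloaded() return
 * false, matching the case described in the comment above.
 */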
7977
7978/*
7979 * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
7980 * per-CPU capacity than sched_group ref.
7981 */
7982static inline bool
7983group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
7984{
7985        return sg->sgc->min_capacity * capacity_margin <
7986                                                ref->sgc->min_capacity * 1024;
7987}
7988
7989/*
7990 * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
7991 * per-CPU capacity_orig than sched_group ref.
7992 */
7993static inline bool
7994group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
7995{
7996        return sg->sgc->max_capacity * capacity_margin <
7997                                                ref->sgc->max_capacity * 1024;
7998}
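/*
 * Example with assumed capacities: with capacity_margin at its default of
 * 1280, a group whose min_capacity is 462 (e.g. a LITTLE cluster) compares
 * as "smaller" than a group with min_capacity 1024, since
 * 462 * 1280 = 591360 < 1024 * 1024 = 1048576; the condition only triggers
 * when the candidate group is more than ~20% down on the reference.
 */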
7999
8000static inline enum
8001group_type group_classify(struct sched_group *group,
8002                          struct sg_lb_stats *sgs)
8003{
8004        if (sgs->group_no_capacity)
8005                return group_overloaded;
8006
8007        if (sg_imbalanced(group))
8008                return group_imbalanced;
8009
8010        if (sgs->group_misfit_task_load)
8011                return group_misfit_task;
8012
8013        return group_other;
8014}
8015
8016static bool update_nohz_stats(struct rq *rq, bool force)
8017{
8018#ifdef CONFIG_NO_HZ_COMMON
8019        unsigned int cpu = rq->cpu;
8020
8021        if (!rq->has_blocked_load)
8022                return false;
8023
8024        if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
8025                return false;
8026
8027        if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
8028                return true;
8029
8030        update_blocked_averages(cpu);
8031
8032        return rq->has_blocked_load;
8033#else
8034        return false;
8035#endif
8036}
8037
8038/**
8039 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
8040 * @env: The load balancing environment.
8041 * @group: sched_group whose statistics are to be updated.
8042 * @sgs: variable to hold the statistics for this group.
8043 * @sg_status: Holds flag indicating the status of the sched_group
8044 */
8045static inline void update_sg_lb_stats(struct lb_env *env,
8046                                      struct sched_group *group,
8047                                      struct sg_lb_stats *sgs,
8048                                      int *sg_status)
8049{
8050        int i, nr_running;
8051
8052        memset(sgs, 0, sizeof(*sgs));
8053
8054        for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8055                struct rq *rq = cpu_rq(i);
8056
8057                if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
8058                        env->flags |= LBF_NOHZ_AGAIN;
8059
8060                sgs->group_load += cpu_runnable_load(rq);
8061                sgs->group_util += cpu_util(i);
8062                sgs->sum_nr_running += rq->cfs.h_nr_running;
8063
8064                nr_running = rq->nr_running;
8065                if (nr_running > 1)
8066                        *sg_status |= SG_OVERLOAD;
8067
8068                if (cpu_overutilized(i))
8069                        *sg_status |= SG_OVERUTILIZED;
8070
8071#ifdef CONFIG_NUMA_BALANCING
8072                sgs->nr_numa_running += rq->nr_numa_running;
8073                sgs->nr_preferred_running += rq->nr_preferred_running;
8074#endif
8075                /*
8076                 * No need to call idle_cpu() if nr_running is not 0
8077                 */
8078                if (!nr_running && idle_cpu(i))
8079                        sgs->idle_cpus++;
8080
8081                if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
8082                    sgs->group_misfit_task_load < rq->misfit_task_load) {
8083                        sgs->group_misfit_task_load = rq->misfit_task_load;
8084                        *sg_status |= SG_OVERLOAD;
8085                }
8086        }
8087
8088        /* Adjust by relative CPU capacity of the group */
8089        sgs->group_capacity = group->sgc->capacity;
8090        sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
8091
8092        if (sgs->sum_nr_running)
8093                sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
8094
8095        sgs->group_weight = group->group_weight;
8096
8097        sgs->group_no_capacity = group_is_overloaded(env, sgs);
8098        sgs->group_type = group_classify(group, sgs);
8099}
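/*
 * Example of the final bookkeeping above (assumed numbers): a group with
 * group_load == 3000 and group_capacity == 2048 (two 1024-capacity CPUs)
 * ends up with avg_load == 3000 * 1024 / 2048 == 1500, and with
 * sum_nr_running == 3 its load_per_task is 1000.
 */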
8100
8101/**
8102 * update_sd_pick_busiest - return 1 on busiest group
8103 * @env: The load balancing environment.
8104 * @sds: sched_domain statistics
8105 * @sg: sched_group candidate to be checked for being the busiest
8106 * @sgs: sched_group statistics
8107 *
8108 * Determine if @sg is a busier group than the previously selected
8109 * busiest group.
8110 *
8111 * Return: %true if @sg is a busier group than the previously selected
8112 * busiest group. %false otherwise.
8113 */
8114static bool update_sd_pick_busiest(struct lb_env *env,
8115                                   struct sd_lb_stats *sds,
8116                                   struct sched_group *sg,
8117                                   struct sg_lb_stats *sgs)
8118{
8119        struct sg_lb_stats *busiest = &sds->busiest_stat;
8120
8121        /*
8122         * Don't try to pull misfit tasks we can't help.
8123         * We can use max_capacity here as reduction in capacity on some
8124         * CPUs in the group should either be possible to resolve
8125         * internally or be covered by avg_load imbalance (eventually).
8126         */
8127        if (sgs->group_type == group_misfit_task &&
8128            (!group_smaller_max_cpu_capacity(sg, sds->local) ||
8129             !group_has_capacity(env, &sds->local_stat)))
8130                return false;
8131
8132        if (sgs->group_type > busiest->group_type)
8133                return true;
8134
8135        if (sgs->group_type < busiest->group_type)
8136                return false;
8137
8138        if (sgs->avg_load <= busiest->avg_load)
8139                return false;
8140
8141        if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
8142                goto asym_packing;
8143
8144        /*
8145         * Candidate sg has no more than one task per CPU and
8146         * has higher per-CPU capacity. Migrating tasks to less
8147         * capable CPUs may harm throughput. Maximize throughput;
8148         * power/energy consequences are not considered.
8149         */
8150        if (sgs->sum_nr_running <= sgs->group_weight &&
8151            group_smaller_min_cpu_capacity(sds->local, sg))
8152                return false;
8153
8154        /*
8155         * If we have more than one misfit sg go with the biggest misfit.
8156         */
8157        if (sgs->group_type == group_misfit_task &&
8158            sgs->group_misfit_task_load < busiest->group_misfit_task_load)
8159                return false;
8160
8161asym_packing:
8162        /* This is the busiest node in its class. */
8163        if (!(env->sd->flags & SD_ASYM_PACKING))
8164                return true;
8165
8166        /* No ASYM_PACKING if target CPU is already busy */
8167        if (env->idle == CPU_NOT_IDLE)
8168                return true;
8169        /*
8170         * ASYM_PACKING needs to move all the work to the highest
8171         * priority CPUs in the group, therefore mark all groups
8172         * of lower priority than ourselves as busy.
8173         */
8174        if (sgs->sum_nr_running &&
8175            sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
8176                if (!sds->busiest)
8177                        return true;
8178
8179                /* Prefer to move work away from the lowest-priority CPU */
8180                if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
8181                                      sg->asym_prefer_cpu))
8182                        return true;
8183        }
8184
8185        return false;
8186}
8187
8188#ifdef CONFIG_NUMA_BALANCING
8189static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8190{
8191        if (sgs->sum_nr_running > sgs->nr_numa_running)
8192                return regular;
8193        if (sgs->sum_nr_running > sgs->nr_preferred_running)
8194                return remote;
8195        return all;
8196}
8197
8198static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8199{
8200        if (rq->nr_running > rq->nr_numa_running)
8201                return regular;
8202        if (rq->nr_running > rq->nr_preferred_running)
8203                return remote;
8204        return all;
8205}
8206#else
8207static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8208{
8209        return all;
8210}
8211
8212static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8213{
8214        return regular;
8215}
8216#endif /* CONFIG_NUMA_BALANCING */
8217
8218/**
8219 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
8220 * @env: The load balancing environment.
8221 * @sds: variable to hold the statistics for this sched_domain.
8222 */
8223static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
8224{
8225        struct sched_domain *child = env->sd->child;
8226        struct sched_group *sg = env->sd->groups;
8227        struct sg_lb_stats *local = &sds->local_stat;
8228        struct sg_lb_stats tmp_sgs;
8229        bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
8230        int sg_status = 0;
8231
8232#ifdef CONFIG_NO_HZ_COMMON
8233        if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
8234                env->flags |= LBF_NOHZ_STATS;
8235#endif
8236
8237        do {
8238                struct sg_lb_stats *sgs = &tmp_sgs;
8239                int local_group;
8240
8241                local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
8242                if (local_group) {
8243                        sds->local = sg;
8244                        sgs = local;
8245
8246                        if (env->idle != CPU_NEWLY_IDLE ||
8247                            time_after_eq(jiffies, sg->sgc->next_update))
8248                                update_group_capacity(env->sd, env->dst_cpu);
8249                }
8250
8251                update_sg_lb_stats(env, sg, sgs, &sg_status);
8252
8253                if (local_group)
8254                        goto next_group;
8255
8256                /*
8257                 * In case the child domain prefers tasks go to siblings
8258                 * first, lower the sg capacity so that we'll try
8259                 * and move all the excess tasks away. We lower the capacity
8260                 * of a group only if the local group has the capacity to fit
8261                 * these excess tasks. The extra check prevents the case where
8262                 * you always pull from the heaviest group when it is already
8263                 * under-utilized (possible when a single large-weight task outweighs
8264                 * the rest of the tasks on the system).
8265                 */
8266                if (prefer_sibling && sds->local &&
8267                    group_has_capacity(env, local) &&
8268                    (sgs->sum_nr_running > local->sum_nr_running + 1)) {
8269                        sgs->group_no_capacity = 1;
8270                        sgs->group_type = group_classify(sg, sgs);
8271                }
8272
8273                if (update_sd_pick_busiest(env, sds, sg, sgs)) {
8274                        sds->busiest = sg;
8275                        sds->busiest_stat = *sgs;
8276                }
8277
8278next_group:
8279                /* Now, start updating sd_lb_stats */
8280                sds->total_running += sgs->sum_nr_running;
8281                sds->total_load += sgs->group_load;
8282                sds->total_capacity += sgs->group_capacity;
8283
8284                sg = sg->next;
8285        } while (sg != env->sd->groups);
8286
8287#ifdef CONFIG_NO_HZ_COMMON
8288        if ((env->flags & LBF_NOHZ_AGAIN) &&
8289            cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
8290
8291                WRITE_ONCE(nohz.next_blocked,
8292                           jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
8293        }
8294#endif
8295
8296        if (env->sd->flags & SD_NUMA)
8297                env->fbq_type = fbq_classify_group(&sds->busiest_stat);
8298
8299        if (!env->sd->parent) {
8300                struct root_domain *rd = env->dst_rq->rd;
8301
8302                /* update overload indicator if we are at root domain */
8303                WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
8304
8305                /* Update over-utilization (tipping point, U >= 0) indicator */
8306                WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
8307                trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
8308        } else if (sg_status & SG_OVERUTILIZED) {
8309                struct root_domain *rd = env->dst_rq->rd;
8310
8311                WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
8312                trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
8313        }
8314}
8315
8316/**
8317 * check_asym_packing - Check to see if the group is packed into the
8318 *                      sched domain.
8319 *
8320 * This is primarily intended to be used at the sibling level.  Some
8321 * cores like POWER7 prefer to use lower numbered SMT threads.  In the
8322 * case of POWER7, it can move to lower SMT modes only when higher
8323 * threads are idle.  When in lower SMT modes, the threads will
8324 * perform better since they share less core resources.  Hence when we
8325 * have idle threads, we want them to be the higher ones.
8326 *
8327 * This packing function is run on idle threads.  It checks to see if
8328 * the busiest CPU in this domain (core in the P7 case) has a higher
8329 * CPU number than the packing function is being run on.  Here we are
8330 * assuming a lower CPU number is equivalent to a lower SMT thread
8331 * number.
8332 *
8333 * Return: 1 when packing is required and a task should be moved to
8334 * this CPU.  The amount of the imbalance is returned in env->imbalance.
8335 *
8336 * @env: The load balancing environment.
8337 * @sds: Statistics of the sched_domain which is to be packed
8338 */
8339static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
8340{
8341        int busiest_cpu;
8342
8343        if (!(env->sd->flags & SD_ASYM_PACKING))
8344                return 0;
8345
8346        if (env->idle == CPU_NOT_IDLE)
8347                return 0;
8348
8349        if (!sds->busiest)
8350                return 0;
8351
8352        busiest_cpu = sds->busiest->asym_prefer_cpu;
8353        if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
8354                return 0;
8355
8356        env->imbalance = sds->busiest_stat.group_load;
8357
8358        return 1;
8359}
8360
8361/**
8362 * fix_small_imbalance - Calculate the minor imbalance that exists
8363 *                      amongst the groups of a sched_domain, during
8364 *                      load balancing.
8365 * @env: The load balancing environment.
8366 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
8367 */
8368static inline
8369void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8370{
8371        unsigned long tmp, capa_now = 0, capa_move = 0;
8372        unsigned int imbn = 2;
8373        unsigned long scaled_busy_load_per_task;
8374        struct sg_lb_stats *local, *busiest;
8375
8376        local = &sds->local_stat;
8377        busiest = &sds->busiest_stat;
8378
8379        if (!local->sum_nr_running)
8380                local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
8381        else if (busiest->load_per_task > local->load_per_task)
8382                imbn = 1;
8383
8384        scaled_busy_load_per_task =
8385                (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
8386                busiest->group_capacity;
8387
8388        if (busiest->avg_load + scaled_busy_load_per_task >=
8389            local->avg_load + (scaled_busy_load_per_task * imbn)) {
8390                env->imbalance = busiest->load_per_task;
8391                return;
8392        }
8393
8394        /*
8395         * OK, we don't have enough imbalance to justify moving tasks;
8396         * however, we may be able to increase the total CPU capacity used by
8397         * moving them.
8398         */
8399
8400        capa_now += busiest->group_capacity *
8401                        min(busiest->load_per_task, busiest->avg_load);
8402        capa_now += local->group_capacity *
8403                        min(local->load_per_task, local->avg_load);
8404        capa_now /= SCHED_CAPACITY_SCALE;
8405
8406        /* Amount of load we'd subtract */
8407        if (busiest->avg_load > scaled_busy_load_per_task) {
8408                capa_move += busiest->group_capacity *
8409                            min(busiest->load_per_task,
8410                                busiest->avg_load - scaled_busy_load_per_task);
8411        }
8412
8413        /* Amount of load we'd add */
8414        if (busiest->avg_load * busiest->group_capacity <
8415            busiest->load_per_task * SCHED_CAPACITY_SCALE) {
8416                tmp = (busiest->avg_load * busiest->group_capacity) /
8417                      local->group_capacity;
8418        } else {
8419                tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
8420                      local->group_capacity;
8421        }
8422        capa_move += local->group_capacity *
8423                    min(local->load_per_task, local->avg_load + tmp);
8424        capa_move /= SCHED_CAPACITY_SCALE;
8425
8426        /* Move if we gain throughput */
8427        if (capa_move > capa_now)
8428                env->imbalance = busiest->load_per_task;
8429}
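/*
 * A minimal worked example for the first branch of fix_small_imbalance()
 * (assumed numbers): two single-CPU groups of capacity 1024; the busiest
 * runs two tasks of load 1024 each (avg_load 2048, load_per_task 1024) and
 * the local group runs one (avg_load 1024). scaled_busy_load_per_task is
 * 1024, and since 2048 + 1024 >= 1024 + 2 * 1024, moving a single task is
 * enough to level the groups, so env->imbalance is set to one task's worth
 * of load (1024).
 */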
8430
8431/**
8432 * calculate_imbalance - Calculate the amount of imbalance present within the
8433 *                       groups of a given sched_domain during load balance.
8434 * @env: load balance environment
8435 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
8436 */
8437static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8438{
8439        unsigned long max_pull, load_above_capacity = ~0UL;
8440        struct sg_lb_stats *local, *busiest;
8441
8442        local = &sds->local_stat;
8443        busiest = &sds->busiest_stat;
8444
8445        if (busiest->group_type == group_imbalanced) {
8446                /*
8447                 * In the group_imb case we cannot rely on group-wide averages
8448                 * to ensure CPU-load equilibrium, look at wider averages. XXX
8449                 */
8450                busiest->load_per_task =
8451                        min(busiest->load_per_task, sds->avg_load);
8452        }
8453
8454        /*
8455         * Avg load of busiest sg can be less and avg load of local sg can
8456         * be greater than avg load across all sgs of sd because avg load
8457         * factors in sg capacity and sgs with smaller group_type are
8458         * skipped when updating the busiest sg:
8459         */
8460        if (busiest->group_type != group_misfit_task &&
8461            (busiest->avg_load <= sds->avg_load ||
8462             local->avg_load >= sds->avg_load)) {
8463                env->imbalance = 0;
8464                return fix_small_imbalance(env, sds);
8465        }
8466
8467        /*
8468         * If there aren't any idle CPUs, avoid creating some.
8469         */
8470        if (busiest->group_type == group_overloaded &&
8471            local->group_type   == group_overloaded) {
8472                load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
8473                if (load_above_capacity > busiest->group_capacity) {
8474                        load_above_capacity -= busiest->group_capacity;
8475                        load_above_capacity *= scale_load_down(NICE_0_LOAD);
8476                        load_above_capacity /= busiest->group_capacity;
8477                } else
8478                        load_above_capacity = ~0UL;
8479        }
8480
8481        /*
8482         * We're trying to get all the CPUs to the average_load, so we don't
8483         * want to push ourselves above the average load, nor do we wish to
8484         * reduce the max loaded CPU below the average load. At the same time,
8485         * we also don't want to reduce the group load below the group
8486         * capacity. Thus we look for the minimum possible imbalance.
8487         */
8488        max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
8489
8490        /* How much load to actually move to equalise the imbalance */
8491        env->imbalance = min(
8492                max_pull * busiest->group_capacity,
8493                (sds->avg_load - local->avg_load) * local->group_capacity
8494        ) / SCHED_CAPACITY_SCALE;
8495
8496        /* Boost imbalance to allow misfit task to be balanced. */
8497        if (busiest->group_type == group_misfit_task) {
8498                env->imbalance = max_t(long, env->imbalance,
8499                                       busiest->group_misfit_task_load);
8500        }
8501
8502        /*
8503         * If *imbalance is less than the average load per runnable task,
8504         * there is no guarantee that any task will be moved, so have
8505         * another think and possibly bump its value to force at least one
8506         * task to be moved.
8507         */
8508        if (env->imbalance < busiest->load_per_task)
8509                return fix_small_imbalance(env, sds);
8510}
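/*
 * Worked example for the common path of calculate_imbalance() (assumed
 * numbers): two single-CPU groups of capacity 1024; busiest carries three
 * tasks of load 512 (avg_load 1536), local carries one (avg_load 512), so
 * sds->avg_load is 1024. With load_above_capacity left at ~0UL, max_pull is
 * 1536 - 1024 = 512 and env->imbalance becomes
 * min(512 * 1024, 512 * 1024) / 1024 = 512: pulling one task levels both
 * groups at an avg_load of 1024.
 */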
8511
8512/******* find_busiest_group() helpers end here *********************/
8513
8514/**
8515 * find_busiest_group - Returns the busiest group within the sched_domain
8516 * if there is an imbalance.
8517 *
8518 * Also calculates the amount of runnable load which should be moved
8519 * to restore balance.
8520 *
8521 * @env: The load balancing environment.
8522 *
8523 * Return: The busiest group if an imbalance exists, NULL otherwise.
8524 */
8525static struct sched_group *find_busiest_group(struct lb_env *env)
8526{
8527        struct sg_lb_stats *local, *busiest;
8528        struct sd_lb_stats sds;
8529
8530        init_sd_lb_stats(&sds);
8531
8532        /*
8533         * Compute the various statistics relevant for load balancing at
8534         * this level.
8535         */
8536        update_sd_lb_stats(env, &sds);
8537
8538        if (sched_energy_enabled()) {
8539                struct root_domain *rd = env->dst_rq->rd;
8540
8541                if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
8542                        goto out_balanced;
8543        }
8544
8545        local = &sds.local_stat;
8546        busiest = &sds.busiest_stat;
8547
8548        /* ASYM feature bypasses nice load balance check */
8549        if (check_asym_packing(env, &sds))
8550                return sds.busiest;
8551
8552        /* There is no busy sibling group to pull tasks from */
8553        if (!sds.busiest || busiest->sum_nr_running == 0)
8554                goto out_balanced;
8555
8556        /* XXX broken for overlapping NUMA groups */
8557        sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
8558                                                / sds.total_capacity;
8559
8560        /*
8561         * If the busiest group is imbalanced the below checks don't
8562         * work because they assume all things are equal, which typically
8563         * isn't true due to cpus_ptr constraints and the like.
8564         */
8565        if (busiest->group_type == group_imbalanced)
8566                goto force_balance;
8567
8568        /*
8569         * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
8570         * capacities from resulting in underutilization due to avg_load.
8571         */
8572        if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
8573            busiest->group_no_capacity)
8574                goto force_balance;
8575
8576        /* Misfit tasks should be dealt with regardless of the avg load */
8577        if (busiest->group_type == group_misfit_task)
8578                goto force_balance;
8579
8580        /*
8581         * If the local group is busier than the selected busiest group
8582         * don't try and pull any tasks.
8583         */
8584        if (local->avg_load >= busiest->avg_load)
8585                goto out_balanced;
8586
8587        /*
8588         * Don't pull any tasks if this group is already above the domain
8589         * average load.
8590         */
8591        if (local->avg_load >= sds.avg_load)
8592                goto out_balanced;
8593
8594        if (env->idle == CPU_IDLE) {
8595                /*
8596                 * This CPU is idle. If the busiest group is not overloaded
8597                 * and there is no imbalance between this and busiest group
8598                 * wrt idle CPUs, it is balanced. The imbalance becomes
8599                 * significant only if the difference is greater than 1; otherwise we
8600                 * might end up just moving the imbalance to another group.
8601                 */
8602                if ((busiest->group_type != group_overloaded) &&
8603                                (local->idle_cpus <= (busiest->idle_cpus + 1)))
8604                        goto out_balanced;
8605        } else {
8606                /*
8607                 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
8608                 * imbalance_pct to be conservative.
8609                 */
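                /*
                 * E.g. with an assumed imbalance_pct of 125, a busiest
                 * avg_load of 1200 against a local avg_load of 1000 is
                 * still treated as balanced (120000 <= 125000); only a gap
                 * of more than 25% lets the balance proceed.
                 */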
8610                if (100 * busiest->avg_load <=
8611                                env->sd->imbalance_pct * local->avg_load)
8612                        goto out_balanced;
8613        }
8614
8615force_balance:
8616        /* Looks like there is an imbalance. Compute it */
8617        env->src_grp_type = busiest->group_type;
8618        calculate_imbalance(env, &sds);
8619        return env->imbalance ? sds.busiest : NULL;
8620
8621out_balanced:
8622        env->imbalance = 0;
8623        return NULL;
8624}
8625
8626/*
8627 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
8628 */
8629static struct rq *find_busiest_queue(struct lb_env *env,
8630                                     struct sched_group *group)
8631{
8632        struct rq *busiest = NULL, *rq;
8633        unsigned long busiest_load = 0, busiest_capacity = 1;
8634        int i;
8635
8636        for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8637                unsigned long capacity, load;
8638                enum fbq_type rt;
8639
8640                rq = cpu_rq(i);
8641                rt = fbq_classify_rq(rq);
8642
8643                /*
8644                 * We classify groups/runqueues into three groups:
8645                 *  - regular: there are !numa tasks
8646                 *  - remote:  there are numa tasks that run on the 'wrong' node
8647                 *  - all:     there is no distinction
8648                 *
8649                 * In order to avoid migrating ideally placed numa tasks,
8650                 * ignore those when there are better options.
8651                 *
8652                 * If we ignore the actual busiest queue to migrate another
8653                 * task, the next balance pass can still reduce the busiest
8654                 * queue by moving tasks around inside the node.
8655                 *
8656                 * If we cannot move enough load due to this classification
8657                 * the next pass will adjust the group classification and
8658                 * allow migration of more tasks.
8659                 *
8660                 * Both cases only affect the total convergence complexity.
8661                 */
8662                if (rt > env->fbq_type)
8663                        continue;
8664
8665                /*
8666                 * For ASYM_CPUCAPACITY domains with misfit tasks we simply
8667                 * seek the "biggest" misfit task.
8668                 */
8669                if (env->src_grp_type == group_misfit_task) {
8670                        if (rq->misfit_task_load > busiest_load) {
8671                                busiest_load = rq->misfit_task_load;
8672                                busiest = rq;
8673                        }
8674
8675                        continue;
8676                }
8677
8678                capacity = capacity_of(i);
8679
8680                /*
8681                 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
8682                 * eventually lead to active_balancing high->low capacity.
8683                 * Higher per-CPU capacity is considered better than balancing
8684                 * average load.
8685                 */
8686                if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
8687                    capacity_of(env->dst_cpu) < capacity &&
8688                    rq->nr_running == 1)
8689                        continue;
8690
8691                load = cpu_runnable_load(rq);
8692
8693                /*
8694                 * When comparing with imbalance, use cpu_runnable_load()
8695                 * which is not scaled with the CPU capacity.
8696                 */
8697
8698                if (rq->nr_running == 1 && load > env->imbalance &&
8699                    !check_cpu_capacity(rq, env->sd))
8700                        continue;
8701
8702                /*
8703                 * For the load comparisons with the other CPUs, consider
8704                 * the cpu_runnable_load() scaled with the CPU capacity, so
8705                 * that the load can be moved away from the CPU that is
8706                 * potentially running at a lower capacity.
8707                 *
8708                 * Thus we're looking for max(load_i / capacity_i); crosswise
8709                 * multiplication to rid ourselves of the division works out
8710                 * to: load_i * capacity_j > load_j * capacity_i;  where j is
8711                 * our previous maximum.
8712                 */
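                /*
                 * E.g. (assumed numbers) a CPU with load 800 and capacity
                 * 512 beats a previous busiest with load 1000 and capacity
                 * 1024, because 800 * 1024 > 1000 * 512 (a load/capacity
                 * ratio of ~1.56 vs ~0.98), without doing any division.
                 */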
8713                if (load * busiest_capacity > busiest_load * capacity) {
8714                        busiest_load = load;
8715                        busiest_capacity = capacity;
8716                        busiest = rq;
8717                }
8718        }
8719
8720        return busiest;
8721}
8722
8723/*
8724 * Max backoff if we encounter pinned tasks. Pretty arbitrary value; it just
8725 * needs to be large enough.
8726 */
8727#define MAX_PINNED_INTERVAL     512
8728
8729static inline bool
8730asym_active_balance(struct lb_env *env)
8731{
8732        /*
8733         * ASYM_PACKING needs to force migrate tasks from busy but
8734         * lower priority CPUs in order to pack all tasks in the
8735         * highest priority CPUs.
8736         */
8737        return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
8738               sched_asym_prefer(env->dst_cpu, env->src_cpu);
8739}
8740
8741static inline bool
8742voluntary_active_balance(struct lb_env *env)
8743{
8744        struct sched_domain *sd = env->sd;
8745
8746        if (asym_active_balance(env))
8747                return 1;
8748
8749        /*
8750         * The dst_cpu is idle and the src CPU has only 1 CFS task.
8751         * It's worth migrating the task if the src_cpu's capacity is reduced
8752         * because of other sched_class activity or IRQs, provided more
8753         * capacity stays available on dst_cpu.
8754         */
8755        if ((env->idle != CPU_NOT_IDLE) &&
8756            (env->src_rq->cfs.h_nr_running == 1)) {
8757                if ((check_cpu_capacity(env->src_rq, sd)) &&
8758                    (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
8759                        return 1;
8760        }
8761
8762        if (env->src_grp_type == group_misfit_task)
8763                return 1;
8764
8765        return 0;
8766}
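/*
 * For the capacity case above, with assumed numbers: a lone CFS task on a
 * src CPU whose capacity has dropped to 600 (out of a capacity_orig of
 * 1024) is worth actively migrating to an idle dst CPU of capacity 1024
 * when imbalance_pct is 125, since 600 * 125 = 75000 < 1024 * 100 = 102400.
 */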
8767
8768static int need_active_balance(struct lb_env *env)
8769{
8770        struct sched_domain *sd = env->sd;
8771
8772        if (voluntary_active_balance(env))
8773                return 1;
8774
8775        return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
8776}
8777
8778static int active_load_balance_cpu_stop(void *data);
8779
8780static int should_we_balance(struct lb_env *env)
8781{
8782        struct sched_group *sg = env->sd->groups;
8783        int cpu, balance_cpu = -1;
8784
8785        /*
8786         * Ensure the balancing environment is consistent; inconsistency can
8787         * happen when the softirq triggers 'during' hotplug.
8788         */
8789        if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
8790                return 0;
8791
8792        /*
8793         * In the newly idle case, we will allow all the CPUs
8794         * to do the newly idle load balance.
8795         */
8796        if (env->idle == CPU_NEWLY_IDLE)
8797                return 1;
8798
8799        /* Try to find first idle CPU */
8800        for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
8801                if (!idle_cpu(cpu))
8802                        continue;
8803
8804                balance_cpu = cpu;
8805                break;
8806        }
8807
8808        if (balance_cpu == -1)
8809                balance_cpu = group_balance_cpu(sg);
8810
8811        /*
8812         * First idle CPU or the first CPU (busiest) in this sched group
8813         * is eligible for doing load balancing at this and above domains.
8814         */
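        /*
         * E.g. (hypothetical) in a balance mask spanning CPUs 0-3 with CPUs
         * 0-1 busy and 2-3 idle, balance_cpu is 2, so only the instance of
         * this function running with dst_cpu == 2 returns 1 and a single
         * CPU ends up balancing this domain.
         */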
8815        return balance_cpu == env->dst_cpu;
8816}
8817
8818/*
8819 * Check this_cpu to ensure it is balanced within domain. Attempt to move
8820 * tasks if there is an imbalance.
8821 */
8822static int load_balance(int this_cpu, struct rq *this_rq,
8823                        struct sched_domain *sd, enum cpu_idle_type idle,
8824                        int *continue_balancing)
8825{
8826        int ld_moved, cur_ld_moved, active_balance = 0;
8827        struct sched_domain *sd_parent = sd->parent;
8828        struct sched_group *group;
8829        struct rq *busiest;
8830        struct rq_flags rf;
8831        struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
8832
8833        struct lb_env env = {
8834                .sd             = sd,
8835                .dst_cpu        = this_cpu,
8836                .dst_rq         = this_rq,
8837                .dst_grpmask    = sched_group_span(sd->groups),
8838                .idle           = idle,
8839                .loop_break     = sched_nr_migrate_break,
8840                .cpus           = cpus,
8841                .fbq_type       = all,
8842                .tasks          = LIST_HEAD_INIT(env.tasks),
8843        };
8844
8845        cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
8846
8847        schedstat_inc(sd->lb_count[idle]);
8848
8849redo:
8850        if (!should_we_balance(&env)) {
8851                *continue_balancing = 0;
8852                goto out_balanced;
8853        }
8854
8855        group = find_busiest_group(&env);
8856        if (!group) {
8857                schedstat_inc(sd->lb_nobusyg[idle]);
8858                goto out_balanced;
8859        }
8860
8861        busiest = find_busiest_queue(&env, group);
8862        if (!busiest) {
8863                schedstat_inc(sd->lb_nobusyq[idle]);
8864                goto out_balanced;
8865        }
8866
8867        BUG_ON(busiest == env.dst_rq);
8868
8869        schedstat_add(sd->lb_imbalance[idle], env.imbalance);
8870
8871        env.src_cpu = busiest->cpu;
8872        env.src_rq = busiest;
8873
8874        ld_moved = 0;
8875        if (busiest->nr_running > 1) {
8876                /*
8877                 * Attempt to move tasks. If find_busiest_group has found
8878                 * an imbalance but busiest->nr_running <= 1, the group is
8879                 * still unbalanced. ld_moved simply stays zero, so it is
8880                 * correctly treated as an imbalance.
8881                 */
8882                env.flags |= LBF_ALL_PINNED;
8883                env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
8884
8885more_balance:
8886                rq_lock_irqsave(busiest, &rf);
8887                update_rq_clock(busiest);
8888
8889                /*
8890                 * cur_ld_moved - load moved in current iteration
8891                 * ld_moved     - cumulative load moved across iterations
8892                 */
8893                cur_ld_moved = detach_tasks(&env);
8894
8895                /*
8896                 * We've detached some tasks from busiest_rq. Every
8897                 * task is marked TASK_ON_RQ_MIGRATING, so we can safely
8898                 * unlock busiest->lock and be sure that nobody can
8899                 * manipulate the tasks in parallel.
8900                 * See task_rq_lock() family for the details.
8901                 */
8902
8903                rq_unlock(busiest, &rf);
8904
8905                if (cur_ld_moved) {
8906                        attach_tasks(&env);
8907                        ld_moved += cur_ld_moved;
8908                }
8909
8910                local_irq_restore(rf.flags);
8911
8912                if (env.flags & LBF_NEED_BREAK) {
8913                        env.flags &= ~LBF_NEED_BREAK;
8914                        goto more_balance;
8915                }
8916
8917                /*
8918                 * Revisit (affine) tasks on src_cpu that couldn't be moved to
8919                 * us and move them to an alternate dst_cpu in our sched_group
8920                 * where they can run. The upper limit on how many times we
8921                 * iterate on the same src_cpu depends on the number of CPUs in our
8922                 * sched_group.
8923                 *
8924                 * This changes load balance semantics a bit on who can move
8925                 * load to a given_cpu. In addition to the given_cpu itself
8926                 * (or an ilb_cpu acting on its behalf where given_cpu is
8927                 * nohz-idle), we now have balance_cpu in a position to move
8928                 * load to given_cpu. In rare situations, this may cause
8929                 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
8930                 * _independently_ and at _same_ time to move some load to
8931                 * given_cpu), causing excess load to be moved to given_cpu.
8932                 * This, however, should not happen often in practice, and
8933                 * moreover subsequent load balance cycles should correct the
8934                 * excess load moved.
8935                 */
8936                if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
8937
8938                        /* Prevent re-selecting dst_cpu via env's CPUs */
8939                        __cpumask_clear_cpu(env.dst_cpu, env.cpus);
8940
8941                        env.dst_rq       = cpu_rq(env.new_dst_cpu);
8942                        env.dst_cpu      = env.new_dst_cpu;
8943                        env.flags       &= ~LBF_DST_PINNED;
8944                        env.loop         = 0;
8945                        env.loop_break   = sched_nr_migrate_break;
8946
8947                        /*
8948                         * Go back to "more_balance" rather than "redo" since we
8949                         * need to continue with same src_cpu.
8950                         */
8951                        goto more_balance;
8952                }
8953
8954                /*
8955                 * We failed to reach balance because of affinity.
8956                 */
8957                if (sd_parent) {
8958                        int *group_imbalance = &sd_parent->groups->sgc->imbalance;
8959
8960                        if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
8961                                *group_imbalance = 1;
8962                }
8963
8964                /* All tasks on this runqueue were pinned by CPU affinity */
8965                if (unlikely(env.flags & LBF_ALL_PINNED)) {
8966                        __cpumask_clear_cpu(cpu_of(busiest), cpus);
8967                        /*
8968                         * Attempting to continue load balancing at the current
8969                         * sched_domain level only makes sense if there are
8970                         * active CPUs remaining as possible busiest CPUs to
8971                         * pull load from, i.e. CPUs that are not contained in
8972                         * the destination group that is receiving any migrated
8973                         * load.
8974                         */
8975                        if (!cpumask_subset(cpus, env.dst_grpmask)) {
8976                                env.loop = 0;
8977                                env.loop_break = sched_nr_migrate_break;
8978                                goto redo;
8979                        }
8980                        goto out_all_pinned;
8981                }
8982        }
8983
8984        if (!ld_moved) {
8985                schedstat_inc(sd->lb_failed[idle]);
8986                /*
8987                 * Increment the failure counter only on periodic balance.
8988                 * We do not want newidle balance, which can be very
8989                 * frequent, to pollute the failure counter and cause
8990                 * excessive cache_hot migrations and active balances.
8991                 */
8992                if (idle != CPU_NEWLY_IDLE)
8993                        sd->nr_balance_failed++;
8994
8995                if (need_active_balance(&env)) {
8996                        unsigned long flags;
8997
8998                        raw_spin_lock_irqsave(&busiest->lock, flags);
8999
9000                        /*
9001                         * Don't kick the active_load_balance_cpu_stop if
9002                         * the curr task on the busiest CPU can't be
9003                         * moved to this_cpu:
9004                         */
9005                        if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
9006                                raw_spin_unlock_irqrestore(&busiest->lock,
9007                                                            flags);
9008                                env.flags |= LBF_ALL_PINNED;
9009                                goto out_one_pinned;
9010                        }
9011
9012                        /*
9013                         * ->active_balance synchronizes accesses to
9014                         * ->active_balance_work.  Once set, it's cleared
9015                         * only after active load balance is finished.
9016                         */
9017                        if (!busiest->active_balance) {
9018                                busiest->active_balance = 1;
9019                                busiest->push_cpu = this_cpu;
9020                                active_balance = 1;
9021                        }
9022                        raw_spin_unlock_irqrestore(&busiest->lock, flags);
9023
9024                        if (active_balance) {
9025                                stop_one_cpu_nowait(cpu_of(busiest),
9026                                        active_load_balance_cpu_stop, busiest,
9027                                        &busiest->active_balance_work);
9028                        }
9029
9030                        /* We've kicked active balancing, force task migration. */
9031                        sd->nr_balance_failed = sd->cache_nice_tries+1;
9032                }
9033        } else
9034                sd->nr_balance_failed = 0;
9035
9036        if (likely(!active_balance) || voluntary_active_balance(&env)) {
9037                /* We were unbalanced, so reset the balancing interval */
9038                sd->balance_interval = sd->min_interval;
9039        } else {
9040                /*
9041                 * If we've begun active balancing, start to back off. This
9042                 * case may not be covered by the all_pinned logic if there
9043                 * is only 1 task on the busy runqueue (because we don't call
9044                 * detach_tasks).
9045                 */
9046                if (sd->balance_interval < sd->max_interval)
9047                        sd->balance_interval *= 2;
9048        }
9049
9050        goto out;
9051
9052out_balanced:
9053        /*
9054         * We reach balance although we may have faced some affinity
9055         * constraints. Clear the imbalance flag if it was set.
9056         */
9057        if (sd_parent) {
9058                int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9059
9060                if (*group_imbalance)
9061                        *group_imbalance = 0;
9062        }
9063
9064out_all_pinned:
9065        /*
9066         * We reach balance because all tasks are pinned at this level, so
9067         * we can't migrate them. Leave the imbalance flag set so the parent
9068         * level can try to migrate them.
9069         */
9070        schedstat_inc(sd->lb_balanced[idle]);
9071
9072        sd->nr_balance_failed = 0;
9073
9074out_one_pinned:
9075        ld_moved = 0;
9076
9077        /*
9078         * idle_balance() disregards balance intervals, so we could repeatedly
9079         * reach this code, which would lead to balance_interval skyrocketing
9080         * in a short amount of time. Skip the balance_interval increase logic
9081         * to avoid that.
9082         */
9083        if (env.idle == CPU_NEWLY_IDLE)
9084                goto out;
9085
9086        /* tune up the balancing interval */
9087        if ((env.flags & LBF_ALL_PINNED &&
9088             sd->balance_interval < MAX_PINNED_INTERVAL) ||
9089            sd->balance_interval < sd->max_interval)
9090                sd->balance_interval *= 2;
9091out:
9092        return ld_moved;
9093}
9094
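/*
 * Worked example (illustrative numbers, not taken from this file): with
 * sd->balance_interval == 8 and sd->busy_factor == 32, a busy CPU asks for
 * 8 * 32 = 256ms, i.e. msecs_to_jiffies(256) == 64 jiffies at HZ=250,
 * which is then clamped to [1, max_load_balance_interval].
 */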
9095static inline unsigned long
9096get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
9097{
9098        unsigned long interval = sd->balance_interval;
9099
9100        if (cpu_busy)
9101                interval *= sd->busy_factor;
9102
9103        /* scale ms to jiffies */
9104        interval = msecs_to_jiffies(interval);
9105        interval = clamp(interval, 1UL, max_load_balance_interval);
9106
9107        return interval;
9108}
9109
9110static inline void
9111update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
9112{
9113        unsigned long interval, next;
9114
9115        /* used by idle balance, so cpu_busy = 0 */
9116        interval = get_sd_balance_interval(sd, 0);
9117        next = sd->last_balance + interval;
9118
9119        if (time_after(*next_balance, next))
9120                *next_balance = next;
9121}
9122
9123/*
9124 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
9125 * running tasks off the busiest CPU onto idle CPUs. It requires at
9126 * least 1 task to be running on each physical CPU where possible, and
9127 * avoids physical / logical imbalances.
9128 */
9129static int active_load_balance_cpu_stop(void *data)
9130{
9131        struct rq *busiest_rq = data;
9132        int busiest_cpu = cpu_of(busiest_rq);
9133        int target_cpu = busiest_rq->push_cpu;
9134        struct rq *target_rq = cpu_rq(target_cpu);
9135        struct sched_domain *sd;
9136        struct task_struct *p = NULL;
9137        struct rq_flags rf;
9138
9139        rq_lock_irq(busiest_rq, &rf);
9140        /*
9141         * Between queueing the stop-work and running it is a hole in which
9142         * CPUs can become inactive. We should not move tasks from or to
9143         * inactive CPUs.
9144         */
9145        if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
9146                goto out_unlock;
9147
9148        /* Make sure the requested CPU hasn't gone down in the meantime: */
9149        if (unlikely(busiest_cpu != smp_processor_id() ||
9150                     !busiest_rq->active_balance))
9151                goto out_unlock;
9152
9153        /* Is there any task to move? */
9154        if (busiest_rq->nr_running <= 1)
9155                goto out_unlock;
9156
9157        /*
9158         * This condition is "impossible"; if it occurs
9159         * we need to fix it. Originally reported by
9160         * Bjorn Helgaas on a 128-CPU setup.
9161         */
9162        BUG_ON(busiest_rq == target_rq);
9163
9164        /* Search for an sd spanning us and the target CPU. */
9165        rcu_read_lock();
9166        for_each_domain(target_cpu, sd) {
9167                if ((sd->flags & SD_LOAD_BALANCE) &&
9168                    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
9169                                break;
9170        }
9171
9172        if (likely(sd)) {
9173                struct lb_env env = {
9174                        .sd             = sd,
9175                        .dst_cpu        = target_cpu,
9176                        .dst_rq         = target_rq,
9177                        .src_cpu        = busiest_rq->cpu,
9178                        .src_rq         = busiest_rq,
9179                        .idle           = CPU_IDLE,
9180                        /*
9181                         * can_migrate_task() doesn't need to compute new_dst_cpu
9182                         * for active balancing. Since we have CPU_IDLE but no
9183                         * @dst_grpmask, we need to make that test go away by
9184                         * lying about DST_PINNED.
9185                         */
9186                        .flags          = LBF_DST_PINNED,
9187                };
9188
9189                schedstat_inc(sd->alb_count);
9190                update_rq_clock(busiest_rq);
9191
9192                p = detach_one_task(&env);
9193                if (p) {
9194                        schedstat_inc(sd->alb_pushed);
9195                        /* Active balancing done, reset the failure counter. */
9196                        sd->nr_balance_failed = 0;
9197                } else {
9198                        schedstat_inc(sd->alb_failed);
9199                }
9200        }
9201        rcu_read_unlock();
9202out_unlock:
9203        busiest_rq->active_balance = 0;
9204        rq_unlock(busiest_rq, &rf);
9205
9206        if (p)
9207                attach_one_task(target_rq, p);
9208
9209        local_irq_enable();
9210
9211        return 0;
9212}
9213
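/*
 * Serializes balancing of SD_SERIALIZE domains (e.g. NUMA domains); see the
 * spin_trylock(&balancing) in rebalance_domains() below.
 */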
9214static DEFINE_SPINLOCK(balancing);
9215
9216/*
9217 * Scale the max load_balance interval with the number of CPUs in the system.
9218 * This trades load-balance latency on larger machines for less cross talk.
9219 */
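/*
 * For example (illustrative, assuming HZ == 250): with 16 CPUs online,
 * max_load_balance_interval = 250 * 16 / 10 = 400 jiffies, i.e. ~1.6s.
 */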
9220void update_max_interval(void)
9221{
9222        max_load_balance_interval = HZ*num_online_cpus()/10;
9223}
9224
9225/*
9226 * Check each scheduling domain to see if it is due to be balanced,
9227 * and initiate a balancing operation if so.
9228 *
9229 * Balancing parameters are set up in init_sched_domains.
9230 */
9231static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
9232{
9233        int continue_balancing = 1;
9234        int cpu = rq->cpu;
9235        unsigned long interval;
9236        struct sched_domain *sd;
9237        /* Earliest time when we have to do rebalance again */
9238        unsigned long next_balance = jiffies + 60*HZ;
9239        int update_next_balance = 0;
9240        int need_serialize, need_decay = 0;
9241        u64 max_cost = 0;
9242
9243        rcu_read_lock();
9244        for_each_domain(cpu, sd) {
9245                /*
9246                 * Decay the newidle max times here because this is a regular
9247                 * visit to all the domains. Decay ~1% per second (253/256 ~= 0.988).
9248                 */
9249                if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
9250                        sd->max_newidle_lb_cost =
9251                                (sd->max_newidle_lb_cost * 253) / 256;
9252                        sd->next_decay_max_lb_cost = jiffies + HZ;
9253                        need_decay = 1;
9254                }
9255                max_cost += sd->max_newidle_lb_cost;
9256
9257                if (!(sd->flags & SD_LOAD_BALANCE))
9258                        continue;
9259
9260                /*
9261                 * Stop the load balance at this level. There is another
9262                 * CPU in our sched group which is doing load balancing more
9263                 * actively.
9264                 */
9265                if (!continue_balancing) {
9266                        if (need_decay)
9267                                continue;
9268                        break;
9269                }
9270
9271                interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9272
9273                need_serialize = sd->flags & SD_SERIALIZE;
9274                if (need_serialize) {
9275                        if (!spin_trylock(&balancing))
9276                                goto out;
9277                }
9278
9279                if (time_after_eq(jiffies, sd->last_balance + interval)) {
9280                        if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
9281                                /*
9282                                 * The LBF_DST_PINNED logic could have changed
9283                                 * env->dst_cpu, so we can't know our idle
9284                                 * state even if we migrated tasks. Update it.
9285                                 */
9286                                idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
9287                        }
9288                        sd->last_balance = jiffies;
9289                        interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9290                }
9291                if (need_serialize)
9292                        spin_unlock(&balancing);
9293out:
9294                if (time_after(next_balance, sd->last_balance + interval)) {
9295                        next_balance = sd->last_balance + interval;
9296                        update_next_balance = 1;
9297                }
9298        }
9299        if (need_decay) {
9300                /*
9301                 * Ensure the rq-wide value also decays but keep it at a
9302                 * reasonable floor to avoid funnies with rq->avg_idle.
9303                 */
9304                rq->max_idle_balance_cost =
9305                        max((u64)sysctl_sched_migration_cost, max_cost);
9306        }
9307        rcu_read_unlock();
9308
9309        /*
9310         * next_balance will be updated only when there is a need.
9311         * When the CPU is attached to the NULL domain, for example, it will
9312         * not be updated.
9313         */
9314        if (likely(update_next_balance)) {
9315                rq->next_balance = next_balance;
9316
9317#ifdef CONFIG_NO_HZ_COMMON
9318                /*
9319                 * If this CPU has been elected to perform the nohz idle
9320                 * balance, the other idle CPUs have already rebalanced with
9321                 * nohz_idle_balance() and nohz.next_balance has been
9322                 * updated accordingly. This CPU is now running the idle
9323                 * load balance for itself and needs to update
9324                 * nohz.next_balance accordingly.
9325                 */
9326                if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
9327                        nohz.next_balance = rq->next_balance;
9328#endif
9329        }
9330}
9331
9332static inline int on_null_domain(struct rq *rq)
9333{
9334        return unlikely(!rcu_dereference_sched(rq->sd));
9335}
9336
9337#ifdef CONFIG_NO_HZ_COMMON
9338/*
9339 * idle load balancing details
9340 * - When one of the busy CPUs notices that an idle rebalance may be
9341 *   needed, it kicks the idle load balancer, which then does idle
9342 *   load balancing for all the idle CPUs.
9343 * - HK_FLAG_MISC CPUs are used for this task because HK_FLAG_SCHED is not
9344 *   set anywhere yet.
9345 */
9346
9347static inline int find_new_ilb(void)
9348{
9349        int ilb;
9350
9351        for_each_cpu_and(ilb, nohz.idle_cpus_mask,
9352                              housekeeping_cpumask(HK_FLAG_MISC)) {
9353                if (idle_cpu(ilb))
9354                        return ilb;
9355        }
9356
9357        return nr_cpu_ids;
9358}
9359
9360/*
9361 * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
9362 * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
9363 */
9364static void kick_ilb(unsigned int flags)
9365{
9366        int ilb_cpu;
9367
9368        nohz.next_balance++;
9369
9370        ilb_cpu = find_new_ilb();
9371
9372        if (ilb_cpu >= nr_cpu_ids)
9373                return;
9374
9375        flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
9376        if (flags & NOHZ_KICK_MASK)
9377                return;
9378
9379        /*
9380         * Use smp_send_reschedule() instead of resched_cpu().
9381         * This way we generate a sched IPI on the target CPU which
9382         * is idle. And the softirq performing nohz idle load balance
9383         * will be run before returning from the IPI.
9384         */
9385        smp_send_reschedule(ilb_cpu);
9386}
9387
9388/*
9389 * Current decision point for kicking the idle load balancer in the presence
9390 * of idle CPUs in the system.
9391 */
9392static void nohz_balancer_kick(struct rq *rq)
9393{
9394        unsigned long now = jiffies;
9395        struct sched_domain_shared *sds;
9396        struct sched_domain *sd;
9397        int nr_busy, i, cpu = rq->cpu;
9398        unsigned int flags = 0;
9399
9400        if (unlikely(rq->idle_balance))
9401                return;
9402
9403        /*
9404         * We may have recently been in ticked or tickless idle mode. At
9405         * the first busy tick after returning from idle, update the busy stats.
9406         */
9407        nohz_balance_exit_idle(rq);
9408
9409        /*
9410         * None are in tickless mode and hence no need for NOHZ idle load
9411         * balancing.
9412         */
9413        if (likely(!atomic_read(&nohz.nr_cpus)))
9414                return;
9415
9416        if (READ_ONCE(nohz.has_blocked) &&
9417            time_after(now, READ_ONCE(nohz.next_blocked)))
9418                flags = NOHZ_STATS_KICK;
9419
9420        if (time_before(now, nohz.next_balance))
9421                goto out;
9422
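        /*
         * More than one runnable task on this CPU while other CPUs are
         * nohz-idle: always worth kicking the ILB.
         */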
9423        if (rq->nr_running >= 2) {
9424                flags = NOHZ_KICK_MASK;
9425                goto out;
9426        }
9427
9428        rcu_read_lock();
9429
9430        sd = rcu_dereference(rq->sd);
9431        if (sd) {
9432                /*
9433                 * If there's a CFS task and the current CPU has reduced
9434                 * capacity, kick the ILB to see if there's a better CPU to run
9435                 * on.
9436                 */
9437                if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
9438                        flags = NOHZ_KICK_MASK;
9439                        goto unlock;
9440                }
9441        }
9442
9443        sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
9444        if (sd) {
9445                /*
9446                 * With ASYM_PACKING, see if there's a more preferred CPU
9447                 * currently idle; in that case, kick the ILB to move tasks
9448                 * around.
9449                 */
9450                for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
9451                        if (sched_asym_prefer(i, cpu)) {
9452                                flags = NOHZ_KICK_MASK;
9453                                goto unlock;
9454                        }
9455                }
9456        }
9457
9458        sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
9459        if (sd) {
9460                /*
9461                 * With ASYM_CPUCAPACITY, see if there's a higher capacity CPU
9462                 * to run the misfit task on.
9463                 */
9464                if (check_misfit_status(rq, sd)) {
9465                        flags = NOHZ_KICK_MASK;
9466                        goto unlock;
9467                }
9468
9469                /*
9470                 * For asymmetric systems, we do not want to nicely balance
9471                 * cache use, instead we want to embrace asymmetry and only
9472                 * ensure tasks have enough CPU capacity.
9473                 *
9474                 * Skip the LLC logic because it's not relevant in that case.
9475                 */
9476                goto unlock;
9477        }
9478
9479        sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
9480        if (sds) {
9481                /*
9482                 * If there is an imbalance between LLC domains (IOW we could
9483                 * increase the overall cache use), we need some less-loaded LLC
9484                 * domain to pull some load. Likewise, we may need to spread
9485                 * load within the current LLC domain (e.g. packed SMT cores but
9486                 * other CPUs are idle). We can't really know from here how busy
9487                 * the others are - so just get a nohz balance going if it looks
9488                 * like this LLC domain has tasks we could move.
9489                 */
9490                nr_busy = atomic_read(&sds->nr_busy_cpus);
9491                if (nr_busy > 1) {
9492                        flags = NOHZ_KICK_MASK;
9493                        goto unlock;
9494                }
9495        }
9496unlock:
9497        rcu_read_unlock();
9498out:
9499        if (flags)
9500                kick_ilb(flags);
9501}
9502
9503static void set_cpu_sd_state_busy(int cpu)
9504{
9505        struct sched_domain *sd;
9506
9507        rcu_read_lock();
9508        sd = rcu_dereference(per_cpu(sd_llc, cpu));
9509
9510        if (!sd || !sd->nohz_idle)
9511                goto unlock;
9512        sd->nohz_idle = 0;
9513
9514        atomic_inc(&sd->shared->nr_busy_cpus);
9515unlock:
9516        rcu_read_unlock();
9517}
9518
9519void nohz_balance_exit_idle(struct rq *rq)
9520{
9521        SCHED_WARN_ON(rq != this_rq());
9522
9523        if (likely(!rq->nohz_tick_stopped))
9524                return;
9525
9526        rq->nohz_tick_stopped = 0;
9527        cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
9528        atomic_dec(&nohz.nr_cpus);
9529
9530        set_cpu_sd_state_busy(rq->cpu);
9531}
9532
9533static void set_cpu_sd_state_idle(int cpu)
9534{
9535        struct sched_domain *sd;
9536
9537        rcu_read_lock();
9538        sd = rcu_dereference(per_cpu(sd_llc, cpu));
9539
9540        if (!sd || sd->nohz_idle)
9541                goto unlock;
9542        sd->nohz_idle = 1;
9543
9544        atomic_dec(&sd->shared->nr_busy_cpus);
9545unlock:
9546        rcu_read_unlock();
9547}
9548
9549/*
9550 * This routine will record that the CPU is going idle with tick stopped.
9551 * This info will be used in performing idle load balancing in the future.
9552 */
9553void nohz_balance_enter_idle(int cpu)
9554{
9555        struct rq *rq = cpu_rq(cpu);
9556
9557        SCHED_WARN_ON(cpu != smp_processor_id());
9558
9559        /* If this CPU is going down, then nothing needs to be done: */
9560        if (!cpu_active(cpu))
9561                return;
9562
9563        /* Spare idle load balancing on CPUs that don't want to be disturbed: */
9564        if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
9565                return;
9566
9567        /*
9568         * Can be set safely without rq->lock held.
9569         * If a clear happens, it will have observed the last additions,
9570         * because rq->lock is held during both the check and the clear.
9571         */
9572        rq->has_blocked_load = 1;
9573
9574        /*
9575         * The tick is still stopped but load could have been added in the
9576         * meantime. We set the nohz.has_blocked flag to trigger a check of
9577         * the *_avg. The CPU is already part of nohz.idle_cpus_mask, so the
9578         * clear of nohz.has_blocked can only happen after checking the new load.
9579         */
9580        if (rq->nohz_tick_stopped)
9581                goto out;
9582
9583        /* If we're a completely isolated CPU, we don't play: */
9584        if (on_null_domain(rq))
9585                return;
9586
9587        rq->nohz_tick_stopped = 1;
9588
9589        cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9590        atomic_inc(&nohz.nr_cpus);
9591
9592        /*
9593         * Ensures that if nohz_idle_balance() fails to observe our
9594         * @idle_cpus_mask store, it must observe the @has_blocked
9595         * store.
9596         */
9597        smp_mb__after_atomic();
9598
9599        set_cpu_sd_state_idle(cpu);
9600
9601out:
9602        /*
9603         * Each time a CPU enters idle, we assume it has blocked load and
9604         * enable the periodic update of the load of idle CPUs.
9605         */
9606        WRITE_ONCE(nohz.has_blocked, 1);
9607}
9608
9609/*
9610 * Internal function that runs load balance for all idle CPUs. The load
9611 * balance can be a simple update of blocked load or a complete load balance
9612 * with task movement, depending on the flags.
9613 * The function returns false if the loop has stopped before running
9614 * through all idle CPUs.
9615 */
9616static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
9617                               enum cpu_idle_type idle)
9618{
9619        /* Earliest time when we have to do rebalance again */
9620        unsigned long now = jiffies;
9621        unsigned long next_balance = now + 60*HZ;
9622        bool has_blocked_load = false;
9623        int update_next_balance = 0;
9624        int this_cpu = this_rq->cpu;
9625        int balance_cpu;
9626        int ret = false;
9627        struct rq *rq;
9628
9629        SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
9630
9631        /*
9632         * We assume there will be no idle load after this update and clear
9633         * the has_blocked flag. If a CPU enters idle in the meantime, it will
9634         * set the has_blocked flag and trigger another update of idle load.
9635         * Because a CPU that becomes idle is added to idle_cpus_mask before
9636         * setting the flag, we are sure not to clear the state while
9637         * skipping the load of such an idle CPU.
9638         */
9639        WRITE_ONCE(nohz.has_blocked, 0);
9640
9641        /*
9642         * Ensures that if we miss the CPU, we must see the has_blocked
9643         * store from nohz_balance_enter_idle().
9644         */
9645        smp_mb();
9646
9647        for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
9648                if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
9649                        continue;
9650
9651                /*
9652                 * If this CPU gets work to do, stop the load balancing
9653                 * work being done for other CPUs. Next load
9654                 * balancing owner will pick it up.
9655                 */
9656                if (need_resched()) {
9657                        has_blocked_load = true;
9658                        goto abort;
9659                }
9660
9661                rq = cpu_rq(balance_cpu);
9662
9663                has_blocked_load |= update_nohz_stats(rq, true);
9664
9665                /*
9666                 * If time for next balance is due,
9667                 * do the balance.
9668                 */
9669                if (time_after_eq(jiffies, rq->next_balance)) {
9670                        struct rq_flags rf;
9671
9672                        rq_lock_irqsave(rq, &rf);
9673                        update_rq_clock(rq);
9674                        rq_unlock_irqrestore(rq, &rf);
9675
9676                        if (flags & NOHZ_BALANCE_KICK)
9677                                rebalance_domains(rq, CPU_IDLE);
9678                }
9679
9680                if (time_after(next_balance, rq->next_balance)) {
9681                        next_balance = rq->next_balance;
9682                        update_next_balance = 1;
9683                }
9684        }
9685
9686        /* Newly idle CPU doesn't need an update */
9687        if (idle != CPU_NEWLY_IDLE) {
9688                update_blocked_averages(this_cpu);
9689                has_blocked_load |= this_rq->has_blocked_load;
9690        }
9691
9692        if (flags & NOHZ_BALANCE_KICK)
9693                rebalance_domains(this_rq, CPU_IDLE);
9694
9695        WRITE_ONCE(nohz.next_blocked,
9696                now + msecs_to_jiffies(LOAD_AVG_PERIOD));
9697
9698        /* The full idle balance loop has been done */
9699        ret = true;
9700
9701abort:
9702        /* There is still blocked load, enable periodic update */
9703        if (has_blocked_load)
9704                WRITE_ONCE(nohz.has_blocked, 1);
9705
9706        /*
9707         * next_balance will be updated only when there is a need.
9708         * When the CPU is attached to the NULL domain, for example, it will
9709         * not be updated.
9710         */
9711        if (likely(update_next_balance))
9712                nohz.next_balance = next_balance;
9713
9714        return ret;
9715}
9716
9717/*
9718 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
9719 * rebalancing for all the cpus for whom scheduler ticks are stopped.
9720 */
9721static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9722{
9723        int this_cpu = this_rq->cpu;
9724        unsigned int flags;
9725
9726        if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
9727                return false;
9728
9729        if (idle != CPU_IDLE) {
9730                atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
9731                return false;
9732        }
9733
9734        /* could be _relaxed() */
9735        flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
9736        if (!(flags & NOHZ_KICK_MASK))
9737                return false;
9738
9739        _nohz_idle_balance(this_rq, flags, idle);
9740
9741        return true;
9742}
9743
9744static void nohz_newidle_balance(struct rq *this_rq)
9745{
9746        int this_cpu = this_rq->cpu;
9747
9748        /*
9749         * This CPU doesn't want to be disturbed by scheduler
9750         * housekeeping
9751         */
9752        if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
9753                return;
9754
9755        /* Will wake up very soon. No time for doing anything else. */
9756        if (this_rq->avg_idle < sysctl_sched_migration_cost)
9757                return;
9758
9759        /* No need to update the blocked load of idle CPUs. */
9760        if (!READ_ONCE(nohz.has_blocked) ||
9761            time_before(jiffies, READ_ONCE(nohz.next_blocked)))
9762                return;
9763
9764        raw_spin_unlock(&this_rq->lock);
9765        /*
9766         * This CPU is going to be idle and the blocked load of idle CPUs
9767         * needs to be updated. Run the ilb locally as it is a good
9768         * candidate for ilb instead of waking up another idle CPU.
9769         * Kick a normal ilb if we failed to do the update.
9770         */
9771        if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
9772                kick_ilb(NOHZ_STATS_KICK);
9773        raw_spin_lock(&this_rq->lock);
9774}
9775
9776#else /* !CONFIG_NO_HZ_COMMON */
9777static inline void nohz_balancer_kick(struct rq *rq) { }
9778
9779static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9780{
9781        return false;
9782}
9783
9784static inline void nohz_newidle_balance(struct rq *this_rq) { }
9785#endif /* CONFIG_NO_HZ_COMMON */
9786
9787/*
9788 * idle_balance is called by schedule() if this_cpu is about to become
9789 * idle. Attempts to pull tasks from other CPUs.
9790 */
9791static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
9792{
9793        unsigned long next_balance = jiffies + HZ;
9794        int this_cpu = this_rq->cpu;
9795        struct sched_domain *sd;
9796        int pulled_task = 0;
9797        u64 curr_cost = 0;
9798
9799        /*
9800         * We must set idle_stamp _before_ calling idle_balance(), such that we
9801         * measure the duration of idle_balance() as idle time.
9802         */
9803        this_rq->idle_stamp = rq_clock(this_rq);
9804
9805        /*
9806         * Do not pull tasks towards !active CPUs...
9807         */
9808        if (!cpu_active(this_cpu))
9809                return 0;
9810
9811        /*
9812         * This is OK, because current is on_cpu, which avoids it being picked
9813         * for load-balance and preemption/IRQs are still disabled avoiding
9814         * further scheduler activity on it and we're being very careful to
9815         * re-start the picking loop.
9816         */
9817        rq_unpin_lock(this_rq, rf);
9818
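        /*
         * Bail out early when this CPU's average idle time is shorter than
         * the cost of a migration, or the root domain isn't overloaded: a
         * full newidle balance is unlikely to pay off. Just record when the
         * next periodic balance would be due and, if needed, let
         * nohz_newidle_balance() refresh the blocked load of nohz-idle CPUs.
         */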
9819        if (this_rq->avg_idle < sysctl_sched_migration_cost ||
9820            !READ_ONCE(this_rq->rd->overload)) {
9821
9822                rcu_read_lock();
9823                sd = rcu_dereference_check_sched_domain(this_rq->sd);
9824                if (sd)
9825                        update_next_balance(sd, &next_balance);
9826                rcu_read_unlock();
9827
9828                nohz_newidle_balance(this_rq);
9829
9830                goto out;
9831        }
9832
9833        raw_spin_unlock(&this_rq->lock);
9834
9835        update_blocked_averages(this_cpu);
9836        rcu_read_lock();
9837        for_each_domain(this_cpu, sd) {
9838                int continue_balancing = 1;
9839                u64 t0, domain_cost;
9840
9841                if (!(sd->flags & SD_LOAD_BALANCE))
9842                        continue;
9843
9844                if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
9845                        update_next_balance(sd, &next_balance);
9846                        break;
9847                }
9848
9849                if (sd->flags & SD_BALANCE_NEWIDLE) {
9850                        t0 = sched_clock_cpu(this_cpu);
9851
9852                        pulled_task = load_balance(this_cpu, this_rq,
9853                                                   sd, CPU_NEWLY_IDLE,
9854                                                   &continue_balancing);
9855
9856                        domain_cost = sched_clock_cpu(this_cpu) - t0;
9857                        if (domain_cost > sd->max_newidle_lb_cost)
9858                                sd->max_newidle_lb_cost = domain_cost;
9859
9860                        curr_cost += domain_cost;
9861                }
9862
9863                update_next_balance(sd, &next_balance);
9864
9865                /*
9866                 * Stop searching for tasks to pull if there are
9867                 * now runnable tasks on this rq.
9868                 */
9869                if (pulled_task || this_rq->nr_running > 0)
9870                        break;
9871        }
9872        rcu_read_unlock();
9873
9874        raw_spin_lock(&this_rq->lock);
9875
9876        if (curr_cost > this_rq->max_idle_balance_cost)
9877                this_rq->max_idle_balance_cost = curr_cost;
9878
9879out:
9880        /*
9881         * While browsing the domains we released the rq lock; a task could
9882         * have been enqueued in the meantime. Since we're not going idle,
9883         * pretend we pulled a task.
9884         */
9885        if (this_rq->cfs.h_nr_running && !pulled_task)
9886                pulled_task = 1;
9887
9888        /* Move the next balance forward */
9889        if (time_after(this_rq->next_balance, next_balance))
9890                this_rq->next_balance = next_balance;
9891
9892        /* Is there a task of a high priority class? */
9893        if (this_rq->nr_running != this_rq->cfs.h_nr_running)
9894                pulled_task = -1;
9895
9896        if (pulled_task)
9897                this_rq->idle_stamp = 0;
9898
9899        rq_repin_lock(this_rq, rf);
9900
9901        return pulled_task;
9902}
9903
9904/*
9905 * run_rebalance_domains is triggered when needed from the scheduler tick.
9906 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
9907 */
9908static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
9909{
9910        struct rq *this_rq = this_rq();
9911        enum cpu_idle_type idle = this_rq->idle_balance ?
9912                                                CPU_IDLE : CPU_NOT_IDLE;
9913
9914        /*
9915         * If this CPU has a pending nohz_balance_kick, then do the
9916         * balancing on behalf of the other idle CPUs whose ticks are
9917         * stopped. Do nohz_idle_balance *before* rebalance_domains to
9918         * give the idle CPUs a chance to load balance. Else we may
9919         * load balance only within the local sched_domain hierarchy
9920         * and abort nohz_idle_balance altogether if we pull some load.
9921         */
9922        if (nohz_idle_balance(this_rq, idle))
9923                return;
9924
9925        /* normal load balance */
9926        update_blocked_averages(this_rq->cpu);
9927        rebalance_domains(this_rq, idle);
9928}
9929
9930/*
9931 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
9932 */
9933void trigger_load_balance(struct rq *rq)
9934{
9935        /* Don't need to rebalance while attached to NULL domain */
9936        if (unlikely(on_null_domain(rq)))
9937                return;
9938
9939        if (time_after_eq(jiffies, rq->next_balance))
9940                raise_softirq(SCHED_SOFTIRQ);
9941
9942        nohz_balancer_kick(rq);
9943}
9944
9945static void rq_online_fair(struct rq *rq)
9946{
9947        update_sysctl();
9948
9949        update_runtime_enabled(rq);
9950}
9951
9952static void rq_offline_fair(struct rq *rq)
9953{
9954        update_sysctl();
9955
9956        /* Ensure any throttled groups are reachable by pick_next_task */
9957        unthrottle_offline_cfs_rqs(rq);
9958}
9959
9960#endif /* CONFIG_SMP */
9961
9962/*
9963 * scheduler tick hitting a task of our scheduling class.
9964 *
9965 * NOTE: This function can be called remotely by the tick offload that
9966 * goes along full dynticks. Therefore no local assumption can be made
9967 * and everything must be accessed through the @rq and @curr passed in
9968 * parameters.
9969 */
9970static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
9971{
9972        struct cfs_rq *cfs_rq;
9973        struct sched_entity *se = &curr->se;
9974
9975        for_each_sched_entity(se) {
9976                cfs_rq = cfs_rq_of(se);
9977                entity_tick(cfs_rq, se, queued);
9978        }
9979
9980        if (static_branch_unlikely(&sched_numa_balancing))
9981                task_tick_numa(rq, curr);
9982
9983        update_misfit_status(curr, rq);
9984        update_overutilized_status(task_rq(curr));
9985}
9986
9987/*
9988 * called on fork with the child task as argument from the parent's context
9989 *  - child not yet on the tasklist
9990 *  - preemption disabled
9991 */
9992static void task_fork_fair(struct task_struct *p)
9993{
9994        struct cfs_rq *cfs_rq;
9995        struct sched_entity *se = &p->se, *curr;
9996        struct rq *rq = this_rq();
9997        struct rq_flags rf;
9998
9999        rq_lock(rq, &rf);
10000        update_rq_clock(rq);
10001
10002        cfs_rq = task_cfs_rq(current);
10003        curr = cfs_rq->curr;
10004        if (curr) {
10005                update_curr(cfs_rq);
10006                se->vruntime = curr->vruntime;
10007        }
10008        place_entity(cfs_rq, se, 1);
10009
10010        if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
10011                /*
10012                 * Upon rescheduling, sched_class::put_prev_task() will place
10013                 * 'current' within the tree based on its new key value.
10014                 */
10015                swap(curr->vruntime, se->vruntime);
10016                resched_curr(rq);
10017        }
10018
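        /*
         * Make the child's vruntime relative to this cfs_rq's min_vruntime:
         * wake_up_new_task() may queue the child on a different CPU, where
         * enqueue_entity() adds that cfs_rq's min_vruntime back in.
         */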
10019        se->vruntime -= cfs_rq->min_vruntime;
10020        rq_unlock(rq, &rf);
10021}
10022
10023/*
10024 * Priority of the task has changed. Check to see if we preempt
10025 * the current task.
10026 */
10027static void
10028prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
10029{
10030        if (!task_on_rq_queued(p))
10031                return;
10032
10033        /*
10034         * Reschedule if we are currently running on this runqueue and
10035         * our priority decreased, or if we are not currently running on
10036         * this runqueue and our priority is higher than the current's
10037         */
10038        if (rq->curr == p) {
10039                if (p->prio > oldprio)
10040                        resched_curr(rq);
10041        } else
10042                check_preempt_curr(rq, p, 0);
10043}
10044
10045static inline bool vruntime_normalized(struct task_struct *p)
10046{
10047        struct sched_entity *se = &p->se;
10048
10049        /*
10050         * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
10051         * the dequeue_entity(.flags=0) will already have normalized the
10052         * vruntime.
10053         */
10054        if (p->on_rq)
10055                return true;
10056
10057        /*
10058         * When !on_rq, vruntime of the task has usually NOT been normalized.
10059         * But there are some cases where it has already been normalized:
10060         *
10061         * - A forked child which is waiting to be woken up by
10062         *   wake_up_new_task().
10063         * - A task which has been woken up by try_to_wake_up() and is
10064         *   waiting to actually be woken up by sched_ttwu_pending().
10065         */
10066        if (!se->sum_exec_runtime ||
10067            (p->state == TASK_WAKING && p->sched_remote_wakeup))
10068                return true;
10069
10070        return false;
10071}
10072
10073#ifdef CONFIG_FAIR_GROUP_SCHED
10074/*
10075 * Propagate the changes of the sched_entity across the tg tree to make them
10076 * visible to the root
10077 */
10078static void propagate_entity_cfs_rq(struct sched_entity *se)
10079{
10080        struct cfs_rq *cfs_rq;
10081
10082        /* Start to propagate at parent */
10083        se = se->parent;
10084
10085        for_each_sched_entity(se) {
10086                cfs_rq = cfs_rq_of(se);
10087
10088                if (cfs_rq_throttled(cfs_rq))
10089                        break;
10090
10091                update_load_avg(cfs_rq, se, UPDATE_TG);
10092        }
10093}
10094#else
10095static void propagate_entity_cfs_rq(struct sched_entity *se) { }
10096#endif
10097
10098static void detach_entity_cfs_rq(struct sched_entity *se)
10099{
10100        struct cfs_rq *cfs_rq = cfs_rq_of(se);
10101
10102        /* Catch up with the cfs_rq and remove our load when we leave */
10103        update_load_avg(cfs_rq, se, 0);
10104        detach_entity_load_avg(cfs_rq, se);
10105        update_tg_load_avg(cfs_rq, false);
10106        propagate_entity_cfs_rq(se);
10107}
10108
10109static void attach_entity_cfs_rq(struct sched_entity *se)
10110{
10111        struct cfs_rq *cfs_rq = cfs_rq_of(se);
10112
10113#ifdef CONFIG_FAIR_GROUP_SCHED
10114        /*
10115         * Since the real depth could have been changed (only the FAIR
10116         * class maintains a depth value), reset the depth properly.
10117         */
10118        se->depth = se->parent ? se->parent->depth + 1 : 0;
10119#endif
10120
10121        /* Synchronize entity with its cfs_rq */
10122        update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
10123        attach_entity_load_avg(cfs_rq, se, 0);
10124        update_tg_load_avg(cfs_rq, false);
10125        propagate_entity_cfs_rq(se);
10126}
10127
10128static void detach_task_cfs_rq(struct task_struct *p)
10129{
10130        struct sched_entity *se = &p->se;
10131        struct cfs_rq *cfs_rq = cfs_rq_of(se);
10132
10133        if (!vruntime_normalized(p)) {
10134                /*
10135                 * Fix up our vruntime so that the current sleep doesn't
10136                 * cause 'unlimited' sleep bonus.
10137                 */
10138                place_entity(cfs_rq, se, 0);
10139                se->vruntime -= cfs_rq->min_vruntime;
10140        }
10141
10142        detach_entity_cfs_rq(se);
10143}
10144
10145static void attach_task_cfs_rq(struct task_struct *p)
10146{
10147        struct sched_entity *se = &p->se;
10148        struct cfs_rq *cfs_rq = cfs_rq_of(se);
10149
10150        attach_entity_cfs_rq(se);
10151
10152        if (!vruntime_normalized(p))
10153                se->vruntime += cfs_rq->min_vruntime;
10154}
10155
10156static void switched_from_fair(struct rq *rq, struct task_struct *p)
10157{
10158        detach_task_cfs_rq(p);
10159}
10160
10161static void switched_to_fair(struct rq *rq, struct task_struct *p)
10162{
10163        attach_task_cfs_rq(p);
10164
10165        if (task_on_rq_queued(p)) {
10166                /*
10167                 * We were most likely switched from sched_rt, so
10168                 * kick off the schedule if running, otherwise just see
10169                 * if we can still preempt the current task.
10170                 */
10171                if (rq->curr == p)
10172                        resched_curr(rq);
10173                else
10174                        check_preempt_curr(rq, p, 0);
10175        }
10176}
10177
10178/* Account for a task changing its policy or group.
10179 *
10180 * This routine is mostly called to set cfs_rq->curr field when a task
10181 * migrates between groups/classes.
10182 */
10183static void set_curr_task_fair(struct rq *rq)
10184{
10185        struct sched_entity *se = &rq->curr->se;
10186
10187        for_each_sched_entity(se) {
10188                struct cfs_rq *cfs_rq = cfs_rq_of(se);
10189
10190                set_next_entity(cfs_rq, se);
10191                /* ensure bandwidth has been allocated on our new cfs_rq */
10192                account_cfs_rq_runtime(cfs_rq, 0);
10193        }
10194}
10195
10196void init_cfs_rq(struct cfs_rq *cfs_rq)
10197{
10198        cfs_rq->tasks_timeline = RB_ROOT_CACHED;
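        /*
         * Note (observation, not from this file): starting min_vruntime just
         * below the u64 wrap point, rather than at 0, presumably exercises
         * the signed wrap-around handling in vruntime comparisons early.
         */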
10199        cfs_rq->min_vruntime = (u64)(-(1LL << 20));
10200#ifndef CONFIG_64BIT
10201        cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
10202#endif
10203#ifdef CONFIG_SMP
10204        raw_spin_lock_init(&cfs_rq->removed.lock);
10205#endif
10206}
10207
10208#ifdef CONFIG_FAIR_GROUP_SCHED
10209static void task_set_group_fair(struct task_struct *p)
10210{
10211        struct sched_entity *se = &p->se;
10212
10213        set_task_rq(p, task_cpu(p));
10214        se->depth = se->parent ? se->parent->depth + 1 : 0;
10215}
10216
10217static void task_move_group_fair(struct task_struct *p)
10218{
10219        detach_task_cfs_rq(p);
10220        set_task_rq(p, task_cpu(p));
10221
10222#ifdef CONFIG_SMP
10223        /* Tell load tracking that se's cfs_rq has changed -- the task migrated */
10224        p->se.avg.last_update_time = 0;
10225#endif
10226        attach_task_cfs_rq(p);
10227}
10228
10229static void task_change_group_fair(struct task_struct *p, int type)
10230{
10231        switch (type) {
10232        case TASK_SET_GROUP:
10233                task_set_group_fair(p);
10234                break;
10235
10236        case TASK_MOVE_GROUP:
10237                task_move_group_fair(p);
10238                break;
10239        }
10240}
10241
10242void free_fair_sched_group(struct task_group *tg)
10243{
10244        int i;
10245
10246        destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
10247
10248        for_each_possible_cpu(i) {
10249                if (tg->cfs_rq)
10250                        kfree(tg->cfs_rq[i]);
10251                if (tg->se)
10252                        kfree(tg->se[i]);
10253        }
10254
10255        kfree(tg->cfs_rq);
10256        kfree(tg->se);
10257}
10258
10259int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10260{
10261        struct sched_entity *se;
10262        struct cfs_rq *cfs_rq;
10263        int i;
10264
10265        tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
10266        if (!tg->cfs_rq)
10267                goto err;
10268        tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
10269        if (!tg->se)
10270                goto err;
10271
10272        tg->shares = NICE_0_LOAD;
10273
10274        init_cfs_bandwidth(tg_cfs_bandwidth(tg));
10275
10276        for_each_possible_cpu(i) {
10277                cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
10278                                      GFP_KERNEL, cpu_to_node(i));
10279                if (!cfs_rq)
10280                        goto err;
10281
10282                se = kzalloc_node(sizeof(struct sched_entity),
10283                                  GFP_KERNEL, cpu_to_node(i));
10284                if (!se)
10285                        goto err_free_rq;
10286
10287                init_cfs_rq(cfs_rq);
10288                init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
10289                init_entity_runnable_average(se);
10290        }
10291
10292        return 1;
10293
10294err_free_rq:
10295        kfree(cfs_rq);
10296err:
10297        return 0;
10298}
10299
10300void online_fair_sched_group(struct task_group *tg)
10301{
10302        struct sched_entity *se;
10303        struct rq *rq;
10304        int i;
10305
10306        for_each_possible_cpu(i) {
10307                rq = cpu_rq(i);
10308                se = tg->se[i];
10309
10310                raw_spin_lock_irq(&rq->lock);
10311                update_rq_clock(rq);
10312                attach_entity_cfs_rq(se);
10313                sync_throttle(tg, i);
10314                raw_spin_unlock_irq(&rq->lock);
10315        }
10316}
10317
10318void unregister_fair_sched_group(struct task_group *tg)
10319{
10320        unsigned long flags;
10321        struct rq *rq;
10322        int cpu;
10323
10324        for_each_possible_cpu(cpu) {
10325                if (tg->se[cpu])
10326                        remove_entity_load_avg(tg->se[cpu]);
10327
10328                /*
10329                 * Only empty task groups can be destroyed, so we can speculatively
10330                 * check on_list without danger of it being re-added.
10331                 */
10332                if (!tg->cfs_rq[cpu]->on_list)
10333                        continue;
10334
10335                rq = cpu_rq(cpu);
10336
10337                raw_spin_lock_irqsave(&rq->lock, flags);
10338                list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
10339                raw_spin_unlock_irqrestore(&rq->lock, flags);
10340        }
10341}
10342
10343void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
10344                        struct sched_entity *se, int cpu,
10345                        struct sched_entity *parent)
10346{
10347        struct rq *rq = cpu_rq(cpu);
10348
10349        cfs_rq->tg = tg;
10350        cfs_rq->rq = rq;
10351        init_cfs_rq_runtime(cfs_rq);
10352
10353        tg->cfs_rq[cpu] = cfs_rq;
10354        tg->se[cpu] = se;
10355
10356        /* se could be NULL for root_task_group */
10357        if (!se)
10358                return;
10359
10360        if (!parent) {
10361                se->cfs_rq = &rq->cfs;
10362                se->depth = 0;
10363        } else {
10364                se->cfs_rq = parent->my_q;
10365                se->depth = parent->depth + 1;
10366        }
10367
10368        se->my_q = cfs_rq;
10369        /* guarantee group entities always have weight */
10370        update_load_set(&se->load, NICE_0_LOAD);
10371        se->parent = parent;
10372}
10373
10374static DEFINE_MUTEX(shares_mutex);
10375
10376int sched_group_set_shares(struct task_group *tg, unsigned long shares)
10377{
10378        int i;
10379
10380        /*
10381         * We can't change the weight of the root cgroup.
10382         */
10383        if (!tg->se[0])
10384                return -EINVAL;
10385
10386        shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
10387
10388        mutex_lock(&shares_mutex);
10389        if (tg->shares == shares)
10390                goto done;
10391
10392        tg->shares = shares;
10393        for_each_possible_cpu(i) {
10394                struct rq *rq = cpu_rq(i);
10395                struct sched_entity *se = tg->se[i];
10396                struct rq_flags rf;
10397
10398                /* Propagate contribution to hierarchy */
10399                rq_lock_irqsave(rq, &rf);
10400                update_rq_clock(rq);
10401                for_each_sched_entity(se) {
10402                        update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
10403                        update_cfs_group(se);
10404                }
10405                rq_unlock_irqrestore(rq, &rf);
10406        }
10407
10408done:
10409        mutex_unlock(&shares_mutex);
10410        return 0;
10411}
10412#else /* CONFIG_FAIR_GROUP_SCHED */
10413
10414void free_fair_sched_group(struct task_group *tg) { }
10415
10416int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10417{
10418        return 1;
10419}
10420
10421void online_fair_sched_group(struct task_group *tg) { }
10422
10423void unregister_fair_sched_group(struct task_group *tg) { }
10424
10425#endif /* CONFIG_FAIR_GROUP_SCHED */
10426
10427
10428static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
10429{
10430        struct sched_entity *se = &task->se;
10431        unsigned int rr_interval = 0;
10432
10433        /*
10434         * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
10435         * idle runqueue:
10436         */
10437        if (rq->cfs.load.weight)
10438                rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
10439
10440        return rr_interval;
10441}
10442
10443/*
10444 * All the scheduling class methods:
10445 */
10446const struct sched_class fair_sched_class = {
10447        .next                   = &idle_sched_class,
10448        .enqueue_task           = enqueue_task_fair,
10449        .dequeue_task           = dequeue_task_fair,
10450        .yield_task             = yield_task_fair,
10451        .yield_to_task          = yield_to_task_fair,
10452
10453        .check_preempt_curr     = check_preempt_wakeup,
10454
10455        .pick_next_task         = pick_next_task_fair,
10456        .put_prev_task          = put_prev_task_fair,
10457
10458#ifdef CONFIG_SMP
10459        .select_task_rq         = select_task_rq_fair,
10460        .migrate_task_rq        = migrate_task_rq_fair,
10461
10462        .rq_online              = rq_online_fair,
10463        .rq_offline             = rq_offline_fair,
10464
10465        .task_dead              = task_dead_fair,
10466        .set_cpus_allowed       = set_cpus_allowed_common,
10467#endif
10468
10469        .set_curr_task          = set_curr_task_fair,
10470        .task_tick              = task_tick_fair,
10471        .task_fork              = task_fork_fair,
10472
10473        .prio_changed           = prio_changed_fair,
10474        .switched_from          = switched_from_fair,
10475        .switched_to            = switched_to_fair,
10476
10477        .get_rr_interval        = get_rr_interval_fair,
10478
10479        .update_curr            = update_curr_fair,
10480
10481#ifdef CONFIG_FAIR_GROUP_SCHED
10482        .task_change_group      = task_change_group_fair,
10483#endif
10484
10485#ifdef CONFIG_UCLAMP_TASK
10486        .uclamp_enabled         = 1,
10487#endif
10488};
10489
10490#ifdef CONFIG_SCHED_DEBUG
10491void print_cfs_stats(struct seq_file *m, int cpu)
10492{
10493        struct cfs_rq *cfs_rq, *pos;
10494
10495        rcu_read_lock();
10496        for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
10497                print_cfs_rq(m, cpu, cfs_rq);
10498        rcu_read_unlock();
10499}
10500
10501#ifdef CONFIG_NUMA_BALANCING
10502void show_numa_stats(struct task_struct *p, struct seq_file *m)
10503{
10504        int node;
10505        unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
10506        struct numa_group *ng;
10507
10508        rcu_read_lock();
10509        ng = rcu_dereference(p->numa_group);
10510        for_each_online_node(node) {
10511                if (p->numa_faults) {
10512                        tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
10513                        tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
10514                }
10515                if (ng) {
10516                        gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
10517                        gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
10518                }
10519                print_numa_stats(m, node, tsf, tpf, gsf, gpf);
10520        }
10521        rcu_read_unlock();
10522}
10523#endif /* CONFIG_NUMA_BALANCING */
10524#endif /* CONFIG_SCHED_DEBUG */
10525
10526__init void init_sched_fair_class(void)
10527{
10528#ifdef CONFIG_SMP
10529        open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
10530
10531#ifdef CONFIG_NO_HZ_COMMON
10532        nohz.next_balance = jiffies;
10533        nohz.next_blocked = jiffies;
10534        zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
10535#endif
10536#endif /* CONFIG_SMP */
10537
10538}
10539
10540/*
10541 * Helper functions to facilitate extracting info from tracepoints.
10542 */
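/*
 * Illustrative usage (a sketch, not part of this file): a module that has
 * registered a probe on one of the bare pelt tracepoints could decode the
 * cfs_rq pointer it receives with these helpers, e.g.:
 *
 *        static void probe_pelt_cfs(void *data, struct cfs_rq *cfs_rq)
 *        {
 *                const struct sched_avg *avg = sched_trace_cfs_rq_avg(cfs_rq);
 *                int cpu = sched_trace_cfs_rq_cpu(cfs_rq);
 *
 *                if (avg)
 *                        trace_printk("cpu%d util_avg=%lu\n", cpu, avg->util_avg);
 *        }
 *
 * The probe name and registration details are assumptions; only the helper
 * calls are defined below.
 */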
10543
10544const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
10545{
10546#ifdef CONFIG_SMP
10547        return cfs_rq ? &cfs_rq->avg : NULL;
10548#else
10549        return NULL;
10550#endif
10551}
10552EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
10553
10554char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
10555{
10556        if (!cfs_rq) {
10557                if (str)
10558                        strlcpy(str, "(null)", len);
10559                else
10560                        return NULL;
10561        }
10562
10563        cfs_rq_tg_path(cfs_rq, str, len);
10564        return str;
10565}
10566EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
10567
10568int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
10569{
10570        return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
10571}
10572EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
10573
10574const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
10575{
10576#ifdef CONFIG_SMP
10577        return rq ? &rq->avg_rt : NULL;
10578#else
10579        return NULL;
10580#endif
10581}
10582EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
10583
10584const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
10585{
10586#ifdef CONFIG_SMP
10587        return rq ? &rq->avg_dl : NULL;
10588#else
10589        return NULL;
10590#endif
10591}
10592EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
10593
10594const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
10595{
10596#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
10597        return rq ? &rq->avg_irq : NULL;
10598#else
10599        return NULL;
10600#endif
10601}
10602EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
10603
10604int sched_trace_rq_cpu(struct rq *rq)
10605{
10606        return rq ? cpu_of(rq) : -1;
10607}
10608EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
10609
10610const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
10611{
10612#ifdef CONFIG_SMP
10613        return rd ? rd->span : NULL;
10614#else
10615        return NULL;
10616#endif
10617}
10618EXPORT_SYMBOL_GPL(sched_trace_rd_span);
10619