linux/kernel/sched/fair.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   4 *
   5 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   6 *
   7 *  Interactivity improvements by Mike Galbraith
   8 *  (C) 2007 Mike Galbraith <efault@gmx.de>
   9 *
  10 *  Various enhancements by Dmitry Adamushko.
  11 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  12 *
  13 *  Group scheduling enhancements by Srivatsa Vaddagiri
  14 *  Copyright IBM Corporation, 2007
  15 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  16 *
  17 *  Scaled math optimizations by Thomas Gleixner
  18 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  19 *
  20 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  21 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  22 */
  23#include "sched.h"
  24
  25#include <trace/events/sched.h>
  26
  27/*
  28 * Targeted preemption latency for CPU-bound tasks:
  29 *
  30 * NOTE: this latency value is not the same as the concept of
  31 * 'timeslice length' - timeslices in CFS are of variable length
  32 * and have no persistent notion like in traditional, time-slice
  33 * based scheduling concepts.
  34 *
  35 * (to see the precise effective timeslice length of your workload,
  36 *  run vmstat and monitor the context-switches (cs) field)
  37 *
  38 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  39 */
  40unsigned int sysctl_sched_latency                       = 6000000ULL;
  41static unsigned int normalized_sysctl_sched_latency     = 6000000ULL;
  42
  43/*
  44 * The initial- and re-scaling of tunables is configurable
  45 *
  46 * Options are:
  47 *
  48 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
  49 *   SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
  50 *   SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
  51 *
  52 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
  53 */
  54enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
  55
  56/*
  57 * Minimal preemption granularity for CPU-bound tasks:
  58 *
  59 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  60 */
  61unsigned int sysctl_sched_min_granularity                       = 750000ULL;
  62static unsigned int normalized_sysctl_sched_min_granularity     = 750000ULL;
  63
  64/*
  65 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
  66 */
  67static unsigned int sched_nr_latency = 8;
  68
  69/*
  70 * After fork, child runs first. If set to 0 (default) then
  71 * parent will (try to) run first.
  72 */
  73unsigned int sysctl_sched_child_runs_first __read_mostly;
  74
  75/*
  76 * SCHED_OTHER wake-up granularity.
  77 *
  78 * This option delays the preemption effects of decoupled workloads
  79 * and reduces their over-scheduling. Synchronous workloads will still
  80 * have immediate wakeup/sleep latencies.
  81 *
  82 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  83 */
  84unsigned int sysctl_sched_wakeup_granularity                    = 1000000UL;
  85static unsigned int normalized_sysctl_sched_wakeup_granularity  = 1000000UL;
  86
  87const_debug unsigned int sysctl_sched_migration_cost    = 500000UL;
  88
  89#ifdef CONFIG_SMP
  90/*
  91 * For asym packing, by default the lower numbered CPU has higher priority.
  92 */
  93int __weak arch_asym_cpu_priority(int cpu)
  94{
  95        return -cpu;
  96}
  97
  98/*
  99 * The margin used when comparing utilization with CPU capacity.
 100 *
 101 * (default: ~20%)
 102 */
 103#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
 104
 105#endif
 106
 107#ifdef CONFIG_CFS_BANDWIDTH
 108/*
 109 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 110 * each time a cfs_rq requests quota.
 111 *
 112 * Note: in the case that the slice exceeds the runtime remaining (either due
 113 * to consumption or the quota being specified to be smaller than the slice)
 114 * we will always only issue the remaining available time.
 115 *
 116 * (default: 5 msec, units: microseconds)
 117 */
 118unsigned int sysctl_sched_cfs_bandwidth_slice           = 5000UL;
 119#endif
 120
 121static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 122{
 123        lw->weight += inc;
 124        lw->inv_weight = 0;
 125}
 126
 127static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 128{
 129        lw->weight -= dec;
 130        lw->inv_weight = 0;
 131}
 132
 133static inline void update_load_set(struct load_weight *lw, unsigned long w)
 134{
 135        lw->weight = w;
 136        lw->inv_weight = 0;
 137}
 138
 139/*
 140 * Increase the granularity value when there are more CPUs,
 141 * because with more CPUs the 'effective latency' as visible
 142 * to users decreases. But the relationship is not linear,
 143 * so pick a second-best guess by going with the log2 of the
 144 * number of CPUs.
 145 *
 146 * This idea comes from the SD scheduler of Con Kolivas:
 147 */
 148static unsigned int get_update_sysctl_factor(void)
 149{
 150        unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
 151        unsigned int factor;
 152
 153        switch (sysctl_sched_tunable_scaling) {
 154        case SCHED_TUNABLESCALING_NONE:
 155                factor = 1;
 156                break;
 157        case SCHED_TUNABLESCALING_LINEAR:
 158                factor = cpus;
 159                break;
 160        case SCHED_TUNABLESCALING_LOG:
 161        default:
 162                factor = 1 + ilog2(cpus);
 163                break;
 164        }
 165
 166        return factor;
 167}
 168
 169static void update_sysctl(void)
 170{
 171        unsigned int factor = get_update_sysctl_factor();
 172
 173#define SET_SYSCTL(name) \
 174        (sysctl_##name = (factor) * normalized_sysctl_##name)
 175        SET_SYSCTL(sched_min_granularity);
 176        SET_SYSCTL(sched_latency);
 177        SET_SYSCTL(sched_wakeup_granularity);
 178#undef SET_SYSCTL
 179}
 180
 181void sched_init_granularity(void)
 182{
 183        update_sysctl();
 184}
 185
 186#define WMULT_CONST     (~0U)
 187#define WMULT_SHIFT     32
 188
 189static void __update_inv_weight(struct load_weight *lw)
 190{
 191        unsigned long w;
 192
 193        if (likely(lw->inv_weight))
 194                return;
 195
 196        w = scale_load_down(lw->weight);
 197
 198        if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
 199                lw->inv_weight = 1;
 200        else if (unlikely(!w))
 201                lw->inv_weight = WMULT_CONST;
 202        else
 203                lw->inv_weight = WMULT_CONST / w;
 204}
 205
 206/*
 207 * delta_exec * weight / lw.weight
 208 *   OR
 209 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 210 *
 211 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
 212 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 213 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 214 *
 215 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
 216 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 217 */
 218static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 219{
 220        u64 fact = scale_load_down(weight);
 221        int shift = WMULT_SHIFT;
 222
 223        __update_inv_weight(lw);
 224
 225        if (unlikely(fact >> 32)) {
 226                while (fact >> 32) {
 227                        fact >>= 1;
 228                        shift--;
 229                }
 230        }
 231
 232        /* hint to use a 32x32->64 mul */
 233        fact = (u64)(u32)fact * lw->inv_weight;
 234
 235        while (fact >> 32) {
 236                fact >>= 1;
 237                shift--;
 238        }
 239
 240        return mul_u64_u32_shr(delta_exec, fact, shift);
 241}
 242
 243
 244const struct sched_class fair_sched_class;
 245
 246/**************************************************************
 247 * CFS operations on generic schedulable entities:
 248 */
 249
 250#ifdef CONFIG_FAIR_GROUP_SCHED
 251static inline struct task_struct *task_of(struct sched_entity *se)
 252{
 253        SCHED_WARN_ON(!entity_is_task(se));
 254        return container_of(se, struct task_struct, se);
 255}
 256
 257/* Walk up scheduling entities hierarchy */
 258#define for_each_sched_entity(se) \
 259                for (; se; se = se->parent)
 260
 261static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 262{
 263        return p->se.cfs_rq;
 264}
 265
 266/* runqueue on which this entity is (to be) queued */
 267static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 268{
 269        return se->cfs_rq;
 270}
 271
 272/* runqueue "owned" by this group */
 273static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 274{
 275        return grp->my_q;
 276}
 277
 278static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
 279{
 280        if (!path)
 281                return;
 282
 283        if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
 284                autogroup_path(cfs_rq->tg, path, len);
 285        else if (cfs_rq && cfs_rq->tg->css.cgroup)
 286                cgroup_path(cfs_rq->tg->css.cgroup, path, len);
 287        else
 288                strlcpy(path, "(null)", len);
 289}
 290
 291static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 292{
 293        struct rq *rq = rq_of(cfs_rq);
 294        int cpu = cpu_of(rq);
 295
 296        if (cfs_rq->on_list)
 297                return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
 298
 299        cfs_rq->on_list = 1;
 300
 301        /*
 302         * Ensure we either appear before our parent (if already
 303         * enqueued) or force our parent to appear after us when it is
 304         * enqueued. The fact that we always enqueue bottom-up
 305         * reduces this to two cases and a special case for the root
 306         * cfs_rq. Furthermore, it also means that we will always reset
 307         * tmp_alone_branch either when the branch is connected
 308         * to a tree or when we reach the top of the tree
 309         */
 310        if (cfs_rq->tg->parent &&
 311            cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
 312                /*
 313                 * If parent is already on the list, we add the child
 314                 * just before. Thanks to circular linked property of
 315                 * the list, this means to put the child at the tail
 316                 * of the list that starts by parent.
 317                 */
 318                list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 319                        &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
 320                /*
 321                 * The branch is now connected to its tree so we can
 322                 * reset tmp_alone_branch to the beginning of the
 323                 * list.
 324                 */
 325                rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 326                return true;
 327        }
 328
 329        if (!cfs_rq->tg->parent) {
 330                /*
 331                 * cfs rq without parent should be put
 332                 * at the tail of the list.
 333                 */
 334                list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 335                        &rq->leaf_cfs_rq_list);
 336                /*
 337                 * We have reach the top of a tree so we can reset
 338                 * tmp_alone_branch to the beginning of the list.
 339                 */
 340                rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 341                return true;
 342        }
 343
 344        /*
 345         * The parent has not already been added so we want to
 346         * make sure that it will be put after us.
 347         * tmp_alone_branch points to the begin of the branch
 348         * where we will add parent.
 349         */
 350        list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
 351        /*
 352         * update tmp_alone_branch to points to the new begin
 353         * of the branch
 354         */
 355        rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
 356        return false;
 357}
 358
 359static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 360{
 361        if (cfs_rq->on_list) {
 362                struct rq *rq = rq_of(cfs_rq);
 363
 364                /*
 365                 * With cfs_rq being unthrottled/throttled during an enqueue,
 366                 * it can happen the tmp_alone_branch points the a leaf that
 367                 * we finally want to del. In this case, tmp_alone_branch moves
 368                 * to the prev element but it will point to rq->leaf_cfs_rq_list
 369                 * at the end of the enqueue.
 370                 */
 371                if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
 372                        rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
 373
 374                list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 375                cfs_rq->on_list = 0;
 376        }
 377}
 378
 379static inline void assert_list_leaf_cfs_rq(struct rq *rq)
 380{
 381        SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
 382}
 383
 384/* Iterate thr' all leaf cfs_rq's on a runqueue */
 385#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)                      \
 386        list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,    \
 387                                 leaf_cfs_rq_list)
 388
 389/* Do the two (enqueued) entities belong to the same group ? */
 390static inline struct cfs_rq *
 391is_same_group(struct sched_entity *se, struct sched_entity *pse)
 392{
 393        if (se->cfs_rq == pse->cfs_rq)
 394                return se->cfs_rq;
 395
 396        return NULL;
 397}
 398
 399static inline struct sched_entity *parent_entity(struct sched_entity *se)
 400{
 401        return se->parent;
 402}
 403
 404static void
 405find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 406{
 407        int se_depth, pse_depth;
 408
 409        /*
 410         * preemption test can be made between sibling entities who are in the
 411         * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
 412         * both tasks until we find their ancestors who are siblings of common
 413         * parent.
 414         */
 415
 416        /* First walk up until both entities are at same depth */
 417        se_depth = (*se)->depth;
 418        pse_depth = (*pse)->depth;
 419
 420        while (se_depth > pse_depth) {
 421                se_depth--;
 422                *se = parent_entity(*se);
 423        }
 424
 425        while (pse_depth > se_depth) {
 426                pse_depth--;
 427                *pse = parent_entity(*pse);
 428        }
 429
 430        while (!is_same_group(*se, *pse)) {
 431                *se = parent_entity(*se);
 432                *pse = parent_entity(*pse);
 433        }
 434}
 435
 436#else   /* !CONFIG_FAIR_GROUP_SCHED */
 437
 438static inline struct task_struct *task_of(struct sched_entity *se)
 439{
 440        return container_of(se, struct task_struct, se);
 441}
 442
 443#define for_each_sched_entity(se) \
 444                for (; se; se = NULL)
 445
 446static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 447{
 448        return &task_rq(p)->cfs;
 449}
 450
 451static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 452{
 453        struct task_struct *p = task_of(se);
 454        struct rq *rq = task_rq(p);
 455
 456        return &rq->cfs;
 457}
 458
 459/* runqueue "owned" by this group */
 460static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 461{
 462        return NULL;
 463}
 464
 465static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
 466{
 467        if (path)
 468                strlcpy(path, "(null)", len);
 469}
 470
 471static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 472{
 473        return true;
 474}
 475
 476static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 477{
 478}
 479
 480static inline void assert_list_leaf_cfs_rq(struct rq *rq)
 481{
 482}
 483
 484#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)      \
 485                for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 486
 487static inline struct sched_entity *parent_entity(struct sched_entity *se)
 488{
 489        return NULL;
 490}
 491
 492static inline void
 493find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 494{
 495}
 496
 497#endif  /* CONFIG_FAIR_GROUP_SCHED */
 498
 499static __always_inline
 500void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 501
 502/**************************************************************
 503 * Scheduling class tree data structure manipulation methods:
 504 */
 505
 506static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
 507{
 508        s64 delta = (s64)(vruntime - max_vruntime);
 509        if (delta > 0)
 510                max_vruntime = vruntime;
 511
 512        return max_vruntime;
 513}
 514
 515static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 516{
 517        s64 delta = (s64)(vruntime - min_vruntime);
 518        if (delta < 0)
 519                min_vruntime = vruntime;
 520
 521        return min_vruntime;
 522}
 523
 524static inline int entity_before(struct sched_entity *a,
 525                                struct sched_entity *b)
 526{
 527        return (s64)(a->vruntime - b->vruntime) < 0;
 528}
 529
 530static void update_min_vruntime(struct cfs_rq *cfs_rq)
 531{
 532        struct sched_entity *curr = cfs_rq->curr;
 533        struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
 534
 535        u64 vruntime = cfs_rq->min_vruntime;
 536
 537        if (curr) {
 538                if (curr->on_rq)
 539                        vruntime = curr->vruntime;
 540                else
 541                        curr = NULL;
 542        }
 543
 544        if (leftmost) { /* non-empty tree */
 545                struct sched_entity *se;
 546                se = rb_entry(leftmost, struct sched_entity, run_node);
 547
 548                if (!curr)
 549                        vruntime = se->vruntime;
 550                else
 551                        vruntime = min_vruntime(vruntime, se->vruntime);
 552        }
 553
 554        /* ensure we never gain time by being placed backwards. */
 555        cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
 556#ifndef CONFIG_64BIT
 557        smp_wmb();
 558        cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 559#endif
 560}
 561
 562/*
 563 * Enqueue an entity into the rb-tree:
 564 */
 565static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 566{
 567        struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
 568        struct rb_node *parent = NULL;
 569        struct sched_entity *entry;
 570        bool leftmost = true;
 571
 572        /*
 573         * Find the right place in the rbtree:
 574         */
 575        while (*link) {
 576                parent = *link;
 577                entry = rb_entry(parent, struct sched_entity, run_node);
 578                /*
 579                 * We dont care about collisions. Nodes with
 580                 * the same key stay together.
 581                 */
 582                if (entity_before(se, entry)) {
 583                        link = &parent->rb_left;
 584                } else {
 585                        link = &parent->rb_right;
 586                        leftmost = false;
 587                }
 588        }
 589
 590        rb_link_node(&se->run_node, parent, link);
 591        rb_insert_color_cached(&se->run_node,
 592                               &cfs_rq->tasks_timeline, leftmost);
 593}
 594
 595static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 596{
 597        rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
 598}
 599
 600struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 601{
 602        struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
 603
 604        if (!left)
 605                return NULL;
 606
 607        return rb_entry(left, struct sched_entity, run_node);
 608}
 609
 610static struct sched_entity *__pick_next_entity(struct sched_entity *se)
 611{
 612        struct rb_node *next = rb_next(&se->run_node);
 613
 614        if (!next)
 615                return NULL;
 616
 617        return rb_entry(next, struct sched_entity, run_node);
 618}
 619
 620#ifdef CONFIG_SCHED_DEBUG
 621struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 622{
 623        struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
 624
 625        if (!last)
 626                return NULL;
 627
 628        return rb_entry(last, struct sched_entity, run_node);
 629}
 630
 631/**************************************************************
 632 * Scheduling class statistics methods:
 633 */
 634
 635int sched_proc_update_handler(struct ctl_table *table, int write,
 636                void __user *buffer, size_t *lenp,
 637                loff_t *ppos)
 638{
 639        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 640        unsigned int factor = get_update_sysctl_factor();
 641
 642        if (ret || !write)
 643                return ret;
 644
 645        sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 646                                        sysctl_sched_min_granularity);
 647
 648#define WRT_SYSCTL(name) \
 649        (normalized_sysctl_##name = sysctl_##name / (factor))
 650        WRT_SYSCTL(sched_min_granularity);
 651        WRT_SYSCTL(sched_latency);
 652        WRT_SYSCTL(sched_wakeup_granularity);
 653#undef WRT_SYSCTL
 654
 655        return 0;
 656}
 657#endif
 658
 659/*
 660 * delta /= w
 661 */
 662static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 663{
 664        if (unlikely(se->load.weight != NICE_0_LOAD))
 665                delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 666
 667        return delta;
 668}
 669
 670/*
 671 * The idea is to set a period in which each task runs once.
 672 *
 673 * When there are too many tasks (sched_nr_latency) we have to stretch
 674 * this period because otherwise the slices get too small.
 675 *
 676 * p = (nr <= nl) ? l : l*nr/nl
 677 */
 678static u64 __sched_period(unsigned long nr_running)
 679{
 680        if (unlikely(nr_running > sched_nr_latency))
 681                return nr_running * sysctl_sched_min_granularity;
 682        else
 683                return sysctl_sched_latency;
 684}
 685
 686/*
 687 * We calculate the wall-time slice from the period by taking a part
 688 * proportional to the weight.
 689 *
 690 * s = p*P[w/rw]
 691 */
 692static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 693{
 694        u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
 695
 696        for_each_sched_entity(se) {
 697                struct load_weight *load;
 698                struct load_weight lw;
 699
 700                cfs_rq = cfs_rq_of(se);
 701                load = &cfs_rq->load;
 702
 703                if (unlikely(!se->on_rq)) {
 704                        lw = cfs_rq->load;
 705
 706                        update_load_add(&lw, se->load.weight);
 707                        load = &lw;
 708                }
 709                slice = __calc_delta(slice, se->load.weight, load);
 710        }
 711        return slice;
 712}
 713
 714/*
 715 * We calculate the vruntime slice of a to-be-inserted task.
 716 *
 717 * vs = s/w
 718 */
 719static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 720{
 721        return calc_delta_fair(sched_slice(cfs_rq, se), se);
 722}
 723
 724#include "pelt.h"
 725#ifdef CONFIG_SMP
 726
 727static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 728static unsigned long task_h_load(struct task_struct *p);
 729static unsigned long capacity_of(int cpu);
 730
 731/* Give new sched_entity start runnable values to heavy its load in infant time */
 732void init_entity_runnable_average(struct sched_entity *se)
 733{
 734        struct sched_avg *sa = &se->avg;
 735
 736        memset(sa, 0, sizeof(*sa));
 737
 738        /*
 739         * Tasks are initialized with full load to be seen as heavy tasks until
 740         * they get a chance to stabilize to their real load level.
 741         * Group entities are initialized with zero load to reflect the fact that
 742         * nothing has been attached to the task group yet.
 743         */
 744        if (entity_is_task(se))
 745                sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
 746
 747        se->runnable_weight = se->load.weight;
 748
 749        /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 750}
 751
 752static void attach_entity_cfs_rq(struct sched_entity *se);
 753
 754/*
 755 * With new tasks being created, their initial util_avgs are extrapolated
 756 * based on the cfs_rq's current util_avg:
 757 *
 758 *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
 759 *
 760 * However, in many cases, the above util_avg does not give a desired
 761 * value. Moreover, the sum of the util_avgs may be divergent, such
 762 * as when the series is a harmonic series.
 763 *
 764 * To solve this problem, we also cap the util_avg of successive tasks to
 765 * only 1/2 of the left utilization budget:
 766 *
 767 *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
 768 *
 769 * where n denotes the nth task and cpu_scale the CPU capacity.
 770 *
 771 * For example, for a CPU with 1024 of capacity, a simplest series from
 772 * the beginning would be like:
 773 *
 774 *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
 775 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
 776 *
 777 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
 778 * if util_avg > util_avg_cap.
 779 */
 780void post_init_entity_util_avg(struct task_struct *p)
 781{
 782        struct sched_entity *se = &p->se;
 783        struct cfs_rq *cfs_rq = cfs_rq_of(se);
 784        struct sched_avg *sa = &se->avg;
 785        long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
 786        long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
 787
 788        if (cap > 0) {
 789                if (cfs_rq->avg.util_avg != 0) {
 790                        sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
 791                        sa->util_avg /= (cfs_rq->avg.load_avg + 1);
 792
 793                        if (sa->util_avg > cap)
 794                                sa->util_avg = cap;
 795                } else {
 796                        sa->util_avg = cap;
 797                }
 798        }
 799
 800        if (p->sched_class != &fair_sched_class) {
 801                /*
 802                 * For !fair tasks do:
 803                 *
 804                update_cfs_rq_load_avg(now, cfs_rq);
 805                attach_entity_load_avg(cfs_rq, se, 0);
 806                switched_from_fair(rq, p);
 807                 *
 808                 * such that the next switched_to_fair() has the
 809                 * expected state.
 810                 */
 811                se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
 812                return;
 813        }
 814
 815        attach_entity_cfs_rq(se);
 816}
 817
 818#else /* !CONFIG_SMP */
 819void init_entity_runnable_average(struct sched_entity *se)
 820{
 821}
 822void post_init_entity_util_avg(struct task_struct *p)
 823{
 824}
 825static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 826{
 827}
 828#endif /* CONFIG_SMP */
 829
 830/*
 831 * Update the current task's runtime statistics.
 832 */
 833static void update_curr(struct cfs_rq *cfs_rq)
 834{
 835        struct sched_entity *curr = cfs_rq->curr;
 836        u64 now = rq_clock_task(rq_of(cfs_rq));
 837        u64 delta_exec;
 838
 839        if (unlikely(!curr))
 840                return;
 841
 842        delta_exec = now - curr->exec_start;
 843        if (unlikely((s64)delta_exec <= 0))
 844                return;
 845
 846        curr->exec_start = now;
 847
 848        schedstat_set(curr->statistics.exec_max,
 849                      max(delta_exec, curr->statistics.exec_max));
 850
 851        curr->sum_exec_runtime += delta_exec;
 852        schedstat_add(cfs_rq->exec_clock, delta_exec);
 853
 854        curr->vruntime += calc_delta_fair(delta_exec, curr);
 855        update_min_vruntime(cfs_rq);
 856
 857        if (entity_is_task(curr)) {
 858                struct task_struct *curtask = task_of(curr);
 859
 860                trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
 861                cgroup_account_cputime(curtask, delta_exec);
 862                account_group_exec_runtime(curtask, delta_exec);
 863        }
 864
 865        account_cfs_rq_runtime(cfs_rq, delta_exec);
 866}
 867
 868static void update_curr_fair(struct rq *rq)
 869{
 870        update_curr(cfs_rq_of(&rq->curr->se));
 871}
 872
 873static inline void
 874update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 875{
 876        u64 wait_start, prev_wait_start;
 877
 878        if (!schedstat_enabled())
 879                return;
 880
 881        wait_start = rq_clock(rq_of(cfs_rq));
 882        prev_wait_start = schedstat_val(se->statistics.wait_start);
 883
 884        if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
 885            likely(wait_start > prev_wait_start))
 886                wait_start -= prev_wait_start;
 887
 888        __schedstat_set(se->statistics.wait_start, wait_start);
 889}
 890
 891static inline void
 892update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 893{
 894        struct task_struct *p;
 895        u64 delta;
 896
 897        if (!schedstat_enabled())
 898                return;
 899
 900        delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
 901
 902        if (entity_is_task(se)) {
 903                p = task_of(se);
 904                if (task_on_rq_migrating(p)) {
 905                        /*
 906                         * Preserve migrating task's wait time so wait_start
 907                         * time stamp can be adjusted to accumulate wait time
 908                         * prior to migration.
 909                         */
 910                        __schedstat_set(se->statistics.wait_start, delta);
 911                        return;
 912                }
 913                trace_sched_stat_wait(p, delta);
 914        }
 915
 916        __schedstat_set(se->statistics.wait_max,
 917                      max(schedstat_val(se->statistics.wait_max), delta));
 918        __schedstat_inc(se->statistics.wait_count);
 919        __schedstat_add(se->statistics.wait_sum, delta);
 920        __schedstat_set(se->statistics.wait_start, 0);
 921}
 922
 923static inline void
 924update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 925{
 926        struct task_struct *tsk = NULL;
 927        u64 sleep_start, block_start;
 928
 929        if (!schedstat_enabled())
 930                return;
 931
 932        sleep_start = schedstat_val(se->statistics.sleep_start);
 933        block_start = schedstat_val(se->statistics.block_start);
 934
 935        if (entity_is_task(se))
 936                tsk = task_of(se);
 937
 938        if (sleep_start) {
 939                u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
 940
 941                if ((s64)delta < 0)
 942                        delta = 0;
 943
 944                if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
 945                        __schedstat_set(se->statistics.sleep_max, delta);
 946
 947                __schedstat_set(se->statistics.sleep_start, 0);
 948                __schedstat_add(se->statistics.sum_sleep_runtime, delta);
 949
 950                if (tsk) {
 951                        account_scheduler_latency(tsk, delta >> 10, 1);
 952                        trace_sched_stat_sleep(tsk, delta);
 953                }
 954        }
 955        if (block_start) {
 956                u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
 957
 958                if ((s64)delta < 0)
 959                        delta = 0;
 960
 961                if (unlikely(delta > schedstat_val(se->statistics.block_max)))
 962                        __schedstat_set(se->statistics.block_max, delta);
 963
 964                __schedstat_set(se->statistics.block_start, 0);
 965                __schedstat_add(se->statistics.sum_sleep_runtime, delta);
 966
 967                if (tsk) {
 968                        if (tsk->in_iowait) {
 969                                __schedstat_add(se->statistics.iowait_sum, delta);
 970                                __schedstat_inc(se->statistics.iowait_count);
 971                                trace_sched_stat_iowait(tsk, delta);
 972                        }
 973
 974                        trace_sched_stat_blocked(tsk, delta);
 975
 976                        /*
 977                         * Blocking time is in units of nanosecs, so shift by
 978                         * 20 to get a milliseconds-range estimation of the
 979                         * amount of time that the task spent sleeping:
 980                         */
 981                        if (unlikely(prof_on == SLEEP_PROFILING)) {
 982                                profile_hits(SLEEP_PROFILING,
 983                                                (void *)get_wchan(tsk),
 984                                                delta >> 20);
 985                        }
 986                        account_scheduler_latency(tsk, delta >> 10, 0);
 987                }
 988        }
 989}
 990
 991/*
 992 * Task is being enqueued - update stats:
 993 */
 994static inline void
 995update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 996{
 997        if (!schedstat_enabled())
 998                return;
 999
1000        /*
1001         * Are we enqueueing a waiting task? (for current tasks
1002         * a dequeue/enqueue event is a NOP)
1003         */
1004        if (se != cfs_rq->curr)
1005                update_stats_wait_start(cfs_rq, se);
1006
1007        if (flags & ENQUEUE_WAKEUP)
1008                update_stats_enqueue_sleeper(cfs_rq, se);
1009}
1010
1011static inline void
1012update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1013{
1014
1015        if (!schedstat_enabled())
1016                return;
1017
1018        /*
1019         * Mark the end of the wait period if dequeueing a
1020         * waiting task:
1021         */
1022        if (se != cfs_rq->curr)
1023                update_stats_wait_end(cfs_rq, se);
1024
1025        if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1026                struct task_struct *tsk = task_of(se);
1027
1028                if (tsk->state & TASK_INTERRUPTIBLE)
1029                        __schedstat_set(se->statistics.sleep_start,
1030                                      rq_clock(rq_of(cfs_rq)));
1031                if (tsk->state & TASK_UNINTERRUPTIBLE)
1032                        __schedstat_set(se->statistics.block_start,
1033                                      rq_clock(rq_of(cfs_rq)));
1034        }
1035}
1036
1037/*
1038 * We are picking a new current task - update its stats:
1039 */
1040static inline void
1041update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1042{
1043        /*
1044         * We are starting a new run period:
1045         */
1046        se->exec_start = rq_clock_task(rq_of(cfs_rq));
1047}
1048
1049/**************************************************
1050 * Scheduling class queueing methods:
1051 */
1052
1053#ifdef CONFIG_NUMA_BALANCING
1054/*
1055 * Approximate time to scan a full NUMA task in ms. The task scan period is
1056 * calculated based on the tasks virtual memory size and
1057 * numa_balancing_scan_size.
1058 */
1059unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1060unsigned int sysctl_numa_balancing_scan_period_max = 60000;
1061
1062/* Portion of address space to scan in MB */
1063unsigned int sysctl_numa_balancing_scan_size = 256;
1064
1065/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1066unsigned int sysctl_numa_balancing_scan_delay = 1000;
1067
1068struct numa_group {
1069        refcount_t refcount;
1070
1071        spinlock_t lock; /* nr_tasks, tasks */
1072        int nr_tasks;
1073        pid_t gid;
1074        int active_nodes;
1075
1076        struct rcu_head rcu;
1077        unsigned long total_faults;
1078        unsigned long max_faults_cpu;
1079        /*
1080         * Faults_cpu is used to decide whether memory should move
1081         * towards the CPU. As a consequence, these stats are weighted
1082         * more by CPU use than by memory faults.
1083         */
1084        unsigned long *faults_cpu;
1085        unsigned long faults[0];
1086};
1087
1088/*
1089 * For functions that can be called in multiple contexts that permit reading
1090 * ->numa_group (see struct task_struct for locking rules).
1091 */
1092static struct numa_group *deref_task_numa_group(struct task_struct *p)
1093{
1094        return rcu_dereference_check(p->numa_group, p == current ||
1095                (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
1096}
1097
1098static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1099{
1100        return rcu_dereference_protected(p->numa_group, p == current);
1101}
1102
1103static inline unsigned long group_faults_priv(struct numa_group *ng);
1104static inline unsigned long group_faults_shared(struct numa_group *ng);
1105
1106static unsigned int task_nr_scan_windows(struct task_struct *p)
1107{
1108        unsigned long rss = 0;
1109        unsigned long nr_scan_pages;
1110
1111        /*
1112         * Calculations based on RSS as non-present and empty pages are skipped
1113         * by the PTE scanner and NUMA hinting faults should be trapped based
1114         * on resident pages
1115         */
1116        nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1117        rss = get_mm_rss(p->mm);
1118        if (!rss)
1119                rss = nr_scan_pages;
1120
1121        rss = round_up(rss, nr_scan_pages);
1122        return rss / nr_scan_pages;
1123}
1124
1125/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1126#define MAX_SCAN_WINDOW 2560
1127
1128static unsigned int task_scan_min(struct task_struct *p)
1129{
1130        unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1131        unsigned int scan, floor;
1132        unsigned int windows = 1;
1133
1134        if (scan_size < MAX_SCAN_WINDOW)
1135                windows = MAX_SCAN_WINDOW / scan_size;
1136        floor = 1000 / windows;
1137
1138        scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1139        return max_t(unsigned int, floor, scan);
1140}
1141
1142static unsigned int task_scan_start(struct task_struct *p)
1143{
1144        unsigned long smin = task_scan_min(p);
1145        unsigned long period = smin;
1146        struct numa_group *ng;
1147
1148        /* Scale the maximum scan period with the amount of shared memory. */
1149        rcu_read_lock();
1150        ng = rcu_dereference(p->numa_group);
1151        if (ng) {
1152                unsigned long shared = group_faults_shared(ng);
1153                unsigned long private = group_faults_priv(ng);
1154
1155                period *= refcount_read(&ng->refcount);
1156                period *= shared + 1;
1157                period /= private + shared + 1;
1158        }
1159        rcu_read_unlock();
1160
1161        return max(smin, period);
1162}
1163
1164static unsigned int task_scan_max(struct task_struct *p)
1165{
1166        unsigned long smin = task_scan_min(p);
1167        unsigned long smax;
1168        struct numa_group *ng;
1169
1170        /* Watch for min being lower than max due to floor calculations */
1171        smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1172
1173        /* Scale the maximum scan period with the amount of shared memory. */
1174        ng = deref_curr_numa_group(p);
1175        if (ng) {
1176                unsigned long shared = group_faults_shared(ng);
1177                unsigned long private = group_faults_priv(ng);
1178                unsigned long period = smax;
1179
1180                period *= refcount_read(&ng->refcount);
1181                period *= shared + 1;
1182                period /= private + shared + 1;
1183
1184                smax = max(smax, period);
1185        }
1186
1187        return max(smin, smax);
1188}
1189
1190static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1191{
1192        rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1193        rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1194}
1195
1196static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1197{
1198        rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1199        rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1200}
1201
1202/* Shared or private faults. */
1203#define NR_NUMA_HINT_FAULT_TYPES 2
1204
1205/* Memory and CPU locality */
1206#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1207
1208/* Averaged statistics, and temporary buffers. */
1209#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1210
1211pid_t task_numa_group_id(struct task_struct *p)
1212{
1213        struct numa_group *ng;
1214        pid_t gid = 0;
1215
1216        rcu_read_lock();
1217        ng = rcu_dereference(p->numa_group);
1218        if (ng)
1219                gid = ng->gid;
1220        rcu_read_unlock();
1221
1222        return gid;
1223}
1224
1225/*
1226 * The averaged statistics, shared & private, memory & CPU,
1227 * occupy the first half of the array. The second half of the
1228 * array is for current counters, which are averaged into the
1229 * first set by task_numa_placement.
1230 */
1231static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1232{
1233        return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1234}
1235
1236static inline unsigned long task_faults(struct task_struct *p, int nid)
1237{
1238        if (!p->numa_faults)
1239                return 0;
1240
1241        return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1242                p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1243}
1244
1245static inline unsigned long group_faults(struct task_struct *p, int nid)
1246{
1247        struct numa_group *ng = deref_task_numa_group(p);
1248
1249        if (!ng)
1250                return 0;
1251
1252        return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1253                ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1254}
1255
1256static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1257{
1258        return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1259                group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1260}
1261
1262static inline unsigned long group_faults_priv(struct numa_group *ng)
1263{
1264        unsigned long faults = 0;
1265        int node;
1266
1267        for_each_online_node(node) {
1268                faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1269        }
1270
1271        return faults;
1272}
1273
1274static inline unsigned long group_faults_shared(struct numa_group *ng)
1275{
1276        unsigned long faults = 0;
1277        int node;
1278
1279        for_each_online_node(node) {
1280                faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1281        }
1282
1283        return faults;
1284}
1285
1286/*
1287 * A node triggering more than 1/3 as many NUMA faults as the maximum is
1288 * considered part of a numa group's pseudo-interleaving set. Migrations
1289 * between these nodes are slowed down, to allow things to settle down.
1290 */
1291#define ACTIVE_NODE_FRACTION 3
1292
1293static bool numa_is_active_node(int nid, struct numa_group *ng)
1294{
1295        return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1296}
1297
1298/* Handle placement on systems where not all nodes are directly connected. */
1299static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1300                                        int maxdist, bool task)
1301{
1302        unsigned long score = 0;
1303        int node;
1304
1305        /*
1306         * All nodes are directly connected, and the same distance
1307         * from each other. No need for fancy placement algorithms.
1308         */
1309        if (sched_numa_topology_type == NUMA_DIRECT)
1310                return 0;
1311
1312        /*
1313         * This code is called for each node, introducing N^2 complexity,
1314         * which should be ok given the number of nodes rarely exceeds 8.
1315         */
1316        for_each_online_node(node) {
1317                unsigned long faults;
1318                int dist = node_distance(nid, node);
1319
1320                /*
1321                 * The furthest away nodes in the system are not interesting
1322                 * for placement; nid was already counted.
1323                 */
1324                if (dist == sched_max_numa_distance || node == nid)
1325                        continue;
1326
1327                /*
1328                 * On systems with a backplane NUMA topology, compare groups
1329                 * of nodes, and move tasks towards the group with the most
1330                 * memory accesses. When comparing two nodes at distance
1331                 * "hoplimit", only nodes closer by than "hoplimit" are part
1332                 * of each group. Skip other nodes.
1333                 */
1334                if (sched_numa_topology_type == NUMA_BACKPLANE &&
1335                                        dist >= maxdist)
1336                        continue;
1337
1338                /* Add up the faults from nearby nodes. */
1339                if (task)
1340                        faults = task_faults(p, node);
1341                else
1342                        faults = group_faults(p, node);
1343
1344                /*
1345                 * On systems with a glueless mesh NUMA topology, there are
1346                 * no fixed "groups of nodes". Instead, nodes that are not
1347                 * directly connected bounce traffic through intermediate
1348                 * nodes; a numa_group can occupy any set of nodes.
1349                 * The further away a node is, the less the faults count.
1350                 * This seems to result in good task placement.
1351                 */
1352                if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1353                        faults *= (sched_max_numa_distance - dist);
1354                        faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1355                }
1356
1357                score += faults;
1358        }
1359
1360        return score;
1361}
1362
1363/*
1364 * These return the fraction of accesses done by a particular task, or
1365 * task group, on a particular numa node.  The group weight is given a
1366 * larger multiplier, in order to group tasks together that are almost
1367 * evenly spread out between numa nodes.
1368 */
1369static inline unsigned long task_weight(struct task_struct *p, int nid,
1370                                        int dist)
1371{
1372        unsigned long faults, total_faults;
1373
1374        if (!p->numa_faults)
1375                return 0;
1376
1377        total_faults = p->total_numa_faults;
1378
1379        if (!total_faults)
1380                return 0;
1381
1382        faults = task_faults(p, nid);
1383        faults += score_nearby_nodes(p, nid, dist, true);
1384
1385        return 1000 * faults / total_faults;
1386}
1387
1388static inline unsigned long group_weight(struct task_struct *p, int nid,
1389                                         int dist)
1390{
1391        struct numa_group *ng = deref_task_numa_group(p);
1392        unsigned long faults, total_faults;
1393
1394        if (!ng)
1395                return 0;
1396
1397        total_faults = ng->total_faults;
1398
1399        if (!total_faults)
1400                return 0;
1401
1402        faults = group_faults(p, nid);
1403        faults += score_nearby_nodes(p, nid, dist, false);
1404
1405        return 1000 * faults / total_faults;
1406}
1407
1408bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1409                                int src_nid, int dst_cpu)
1410{
1411        struct numa_group *ng = deref_curr_numa_group(p);
1412        int dst_nid = cpu_to_node(dst_cpu);
1413        int last_cpupid, this_cpupid;
1414
1415        this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1416        last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1417
1418        /*
1419         * Allow first faults or private faults to migrate immediately early in
1420         * the lifetime of a task. The magic number 4 is based on waiting for
1421         * two full passes of the "multi-stage node selection" test that is
1422         * executed below.
1423         */
1424        if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1425            (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1426                return true;
1427
1428        /*
1429         * Multi-stage node selection is used in conjunction with a periodic
1430         * migration fault to build a temporal task<->page relation. By using
1431         * a two-stage filter we remove short/unlikely relations.
1432         *
1433         * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1434         * a task's usage of a particular page (n_p) per total usage of this
1435         * page (n_t) (in a given time-span) to a probability.
1436         *
1437         * Our periodic faults will sample this probability and getting the
1438         * same result twice in a row, given these samples are fully
1439         * independent, is then given by P(n)^2, provided our sample period
1440         * is sufficiently short compared to the usage pattern.
1441         *
1442         * This quadric squishes small probabilities, making it less likely we
1443         * act on an unlikely task<->page relation.
1444         */
1445        if (!cpupid_pid_unset(last_cpupid) &&
1446                                cpupid_to_nid(last_cpupid) != dst_nid)
1447                return false;
1448
1449        /* Always allow migrate on private faults */
1450        if (cpupid_match_pid(p, last_cpupid))
1451                return true;
1452
1453        /* A shared fault, but p->numa_group has not been set up yet. */
1454        if (!ng)
1455                return true;
1456
1457        /*
1458         * Destination node is much more heavily used than the source
1459         * node? Allow migration.
1460         */
1461        if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1462                                        ACTIVE_NODE_FRACTION)
1463                return true;
1464
1465        /*
1466         * Distribute memory according to CPU & memory use on each node,
1467         * with 3/4 hysteresis to avoid unnecessary memory migrations:
1468         *
1469         * faults_cpu(dst)   3   faults_cpu(src)
1470         * --------------- * - > ---------------
1471         * faults_mem(dst)   4   faults_mem(src)
1472         */
1473        return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1474               group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1475}
1476
1477static unsigned long cpu_runnable_load(struct rq *rq);
1478
1479/* Cached statistics for all CPUs within a node */
1480struct numa_stats {
1481        unsigned long load;
1482
1483        /* Total compute capacity of CPUs on a node */
1484        unsigned long compute_capacity;
1485};
1486
1487/*
1488 * XXX borrowed from update_sg_lb_stats
1489 */
1490static void update_numa_stats(struct numa_stats *ns, int nid)
1491{
1492        int cpu;
1493
1494        memset(ns, 0, sizeof(*ns));
1495        for_each_cpu(cpu, cpumask_of_node(nid)) {
1496                struct rq *rq = cpu_rq(cpu);
1497
1498                ns->load += cpu_runnable_load(rq);
1499                ns->compute_capacity += capacity_of(cpu);
1500        }
1501
1502}
1503
1504struct task_numa_env {
1505        struct task_struct *p;
1506
1507        int src_cpu, src_nid;
1508        int dst_cpu, dst_nid;
1509
1510        struct numa_stats src_stats, dst_stats;
1511
1512        int imbalance_pct;
1513        int dist;
1514
1515        struct task_struct *best_task;
1516        long best_imp;
1517        int best_cpu;
1518};
1519
1520static void task_numa_assign(struct task_numa_env *env,
1521                             struct task_struct *p, long imp)
1522{
1523        struct rq *rq = cpu_rq(env->dst_cpu);
1524
1525        /* Bail out if run-queue part of active NUMA balance. */
1526        if (xchg(&rq->numa_migrate_on, 1))
1527                return;
1528
1529        /*
1530         * Clear previous best_cpu/rq numa-migrate flag, since task now
1531         * found a better CPU to move/swap.
1532         */
1533        if (env->best_cpu != -1) {
1534                rq = cpu_rq(env->best_cpu);
1535                WRITE_ONCE(rq->numa_migrate_on, 0);
1536        }
1537
1538        if (env->best_task)
1539                put_task_struct(env->best_task);
1540        if (p)
1541                get_task_struct(p);
1542
1543        env->best_task = p;
1544        env->best_imp = imp;
1545        env->best_cpu = env->dst_cpu;
1546}
1547
1548static bool load_too_imbalanced(long src_load, long dst_load,
1549                                struct task_numa_env *env)
1550{
1551        long imb, old_imb;
1552        long orig_src_load, orig_dst_load;
1553        long src_capacity, dst_capacity;
1554
1555        /*
1556         * The load is corrected for the CPU capacity available on each node.
1557         *
1558         * src_load        dst_load
1559         * ------------ vs ---------
1560         * src_capacity    dst_capacity
1561         */
1562        src_capacity = env->src_stats.compute_capacity;
1563        dst_capacity = env->dst_stats.compute_capacity;
1564
1565        imb = abs(dst_load * src_capacity - src_load * dst_capacity);
1566
1567        orig_src_load = env->src_stats.load;
1568        orig_dst_load = env->dst_stats.load;
1569
1570        old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
1571
1572        /* Would this change make things worse? */
1573        return (imb > old_imb);
1574}
1575
1576/*
1577 * Maximum NUMA importance can be 1998 (2*999);
1578 * SMALLIMP @ 30 would be close to 1998/64.
1579 * Used to deter task migration.
1580 */
1581#define SMALLIMP        30
1582
1583/*
1584 * This checks if the overall compute and NUMA accesses of the system would
1585 * be improved if the source tasks was migrated to the target dst_cpu taking
1586 * into account that it might be best if task running on the dst_cpu should
1587 * be exchanged with the source task
1588 */
1589static void task_numa_compare(struct task_numa_env *env,
1590                              long taskimp, long groupimp, bool maymove)
1591{
1592        struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
1593        struct rq *dst_rq = cpu_rq(env->dst_cpu);
1594        long imp = p_ng ? groupimp : taskimp;
1595        struct task_struct *cur;
1596        long src_load, dst_load;
1597        int dist = env->dist;
1598        long moveimp = imp;
1599        long load;
1600
1601        if (READ_ONCE(dst_rq->numa_migrate_on))
1602                return;
1603
1604        rcu_read_lock();
1605        cur = rcu_dereference(dst_rq->curr);
1606        if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1607                cur = NULL;
1608
1609        /*
1610         * Because we have preemption enabled we can get migrated around and
1611         * end try selecting ourselves (current == env->p) as a swap candidate.
1612         */
1613        if (cur == env->p)
1614                goto unlock;
1615
1616        if (!cur) {
1617                if (maymove && moveimp >= env->best_imp)
1618                        goto assign;
1619                else
1620                        goto unlock;
1621        }
1622
1623        /*
1624         * "imp" is the fault differential for the source task between the
1625         * source and destination node. Calculate the total differential for
1626         * the source task and potential destination task. The more negative
1627         * the value is, the more remote accesses that would be expected to
1628         * be incurred if the tasks were swapped.
1629         */
1630        /* Skip this swap candidate if cannot move to the source cpu */
1631        if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1632                goto unlock;
1633
1634        /*
1635         * If dst and source tasks are in the same NUMA group, or not
1636         * in any group then look only at task weights.
1637         */
1638        cur_ng = rcu_dereference(cur->numa_group);
1639        if (cur_ng == p_ng) {
1640                imp = taskimp + task_weight(cur, env->src_nid, dist) -
1641                      task_weight(cur, env->dst_nid, dist);
1642                /*
1643                 * Add some hysteresis to prevent swapping the
1644                 * tasks within a group over tiny differences.
1645                 */
1646                if (cur_ng)
1647                        imp -= imp / 16;
1648        } else {
1649                /*
1650                 * Compare the group weights. If a task is all by itself
1651                 * (not part of a group), use the task weight instead.
1652                 */
1653                if (cur_ng && p_ng)
1654                        imp += group_weight(cur, env->src_nid, dist) -
1655                               group_weight(cur, env->dst_nid, dist);
1656                else
1657                        imp += task_weight(cur, env->src_nid, dist) -
1658                               task_weight(cur, env->dst_nid, dist);
1659        }
1660
1661        if (maymove && moveimp > imp && moveimp > env->best_imp) {
1662                imp = moveimp;
1663                cur = NULL;
1664                goto assign;
1665        }
1666
1667        /*
1668         * If the NUMA importance is less than SMALLIMP,
1669         * task migration might only result in ping pong
1670         * of tasks and also hurt performance due to cache
1671         * misses.
1672         */
1673        if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
1674                goto unlock;
1675
1676        /*
1677         * In the overloaded case, try and keep the load balanced.
1678         */
1679        load = task_h_load(env->p) - task_h_load(cur);
1680        if (!load)
1681                goto assign;
1682
1683        dst_load = env->dst_stats.load + load;
1684        src_load = env->src_stats.load - load;
1685
1686        if (load_too_imbalanced(src_load, dst_load, env))
1687                goto unlock;
1688
1689assign:
1690        /*
1691         * One idle CPU per node is evaluated for a task numa move.
1692         * Call select_idle_sibling to maybe find a better one.
1693         */
1694        if (!cur) {
1695                /*
1696                 * select_idle_siblings() uses an per-CPU cpumask that
1697                 * can be used from IRQ context.
1698                 */
1699                local_irq_disable();
1700                env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1701                                                   env->dst_cpu);
1702                local_irq_enable();
1703        }
1704
1705        task_numa_assign(env, cur, imp);
1706unlock:
1707        rcu_read_unlock();
1708}
1709
1710static void task_numa_find_cpu(struct task_numa_env *env,
1711                                long taskimp, long groupimp)
1712{
1713        long src_load, dst_load, load;
1714        bool maymove = false;
1715        int cpu;
1716
1717        load = task_h_load(env->p);
1718        dst_load = env->dst_stats.load + load;
1719        src_load = env->src_stats.load - load;
1720
1721        /*
1722         * If the improvement from just moving env->p direction is better
1723         * than swapping tasks around, check if a move is possible.
1724         */
1725        maymove = !load_too_imbalanced(src_load, dst_load, env);
1726
1727        for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1728                /* Skip this CPU if the source task cannot migrate */
1729                if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
1730                        continue;
1731
1732                env->dst_cpu = cpu;
1733                task_numa_compare(env, taskimp, groupimp, maymove);
1734        }
1735}
1736
1737static int task_numa_migrate(struct task_struct *p)
1738{
1739        struct task_numa_env env = {
1740                .p = p,
1741
1742                .src_cpu = task_cpu(p),
1743                .src_nid = task_node(p),
1744
1745                .imbalance_pct = 112,
1746
1747                .best_task = NULL,
1748                .best_imp = 0,
1749                .best_cpu = -1,
1750        };
1751        unsigned long taskweight, groupweight;
1752        struct sched_domain *sd;
1753        long taskimp, groupimp;
1754        struct numa_group *ng;
1755        struct rq *best_rq;
1756        int nid, ret, dist;
1757
1758        /*
1759         * Pick the lowest SD_NUMA domain, as that would have the smallest
1760         * imbalance and would be the first to start moving tasks about.
1761         *
1762         * And we want to avoid any moving of tasks about, as that would create
1763         * random movement of tasks -- counter the numa conditions we're trying
1764         * to satisfy here.
1765         */
1766        rcu_read_lock();
1767        sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1768        if (sd)
1769                env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1770        rcu_read_unlock();
1771
1772        /*
1773         * Cpusets can break the scheduler domain tree into smaller
1774         * balance domains, some of which do not cross NUMA boundaries.
1775         * Tasks that are "trapped" in such domains cannot be migrated
1776         * elsewhere, so there is no point in (re)trying.
1777         */
1778        if (unlikely(!sd)) {
1779                sched_setnuma(p, task_node(p));
1780                return -EINVAL;
1781        }
1782
1783        env.dst_nid = p->numa_preferred_nid;
1784        dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1785        taskweight = task_weight(p, env.src_nid, dist);
1786        groupweight = group_weight(p, env.src_nid, dist);
1787        update_numa_stats(&env.src_stats, env.src_nid);
1788        taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1789        groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1790        update_numa_stats(&env.dst_stats, env.dst_nid);
1791
1792        /* Try to find a spot on the preferred nid. */
1793        task_numa_find_cpu(&env, taskimp, groupimp);
1794
1795        /*
1796         * Look at other nodes in these cases:
1797         * - there is no space available on the preferred_nid
1798         * - the task is part of a numa_group that is interleaved across
1799         *   multiple NUMA nodes; in order to better consolidate the group,
1800         *   we need to check other locations.
1801         */
1802        ng = deref_curr_numa_group(p);
1803        if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
1804                for_each_online_node(nid) {
1805                        if (nid == env.src_nid || nid == p->numa_preferred_nid)
1806                                continue;
1807
1808                        dist = node_distance(env.src_nid, env.dst_nid);
1809                        if (sched_numa_topology_type == NUMA_BACKPLANE &&
1810                                                dist != env.dist) {
1811                                taskweight = task_weight(p, env.src_nid, dist);
1812                                groupweight = group_weight(p, env.src_nid, dist);
1813                        }
1814
1815                        /* Only consider nodes where both task and groups benefit */
1816                        taskimp = task_weight(p, nid, dist) - taskweight;
1817                        groupimp = group_weight(p, nid, dist) - groupweight;
1818                        if (taskimp < 0 && groupimp < 0)
1819                                continue;
1820
1821                        env.dist = dist;
1822                        env.dst_nid = nid;
1823                        update_numa_stats(&env.dst_stats, env.dst_nid);
1824                        task_numa_find_cpu(&env, taskimp, groupimp);
1825                }
1826        }
1827
1828        /*
1829         * If the task is part of a workload that spans multiple NUMA nodes,
1830         * and is migrating into one of the workload's active nodes, remember
1831         * this node as the task's preferred numa node, so the workload can
1832         * settle down.
1833         * A task that migrated to a second choice node will be better off
1834         * trying for a better one later. Do not set the preferred node here.
1835         */
1836        if (ng) {
1837                if (env.best_cpu == -1)
1838                        nid = env.src_nid;
1839                else
1840                        nid = cpu_to_node(env.best_cpu);
1841
1842                if (nid != p->numa_preferred_nid)
1843                        sched_setnuma(p, nid);
1844        }
1845
1846        /* No better CPU than the current one was found. */
1847        if (env.best_cpu == -1)
1848                return -EAGAIN;
1849
1850        best_rq = cpu_rq(env.best_cpu);
1851        if (env.best_task == NULL) {
1852                ret = migrate_task_to(p, env.best_cpu);
1853                WRITE_ONCE(best_rq->numa_migrate_on, 0);
1854                if (ret != 0)
1855                        trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1856                return ret;
1857        }
1858
1859        ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
1860        WRITE_ONCE(best_rq->numa_migrate_on, 0);
1861
1862        if (ret != 0)
1863                trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1864        put_task_struct(env.best_task);
1865        return ret;
1866}
1867
1868/* Attempt to migrate a task to a CPU on the preferred node. */
1869static void numa_migrate_preferred(struct task_struct *p)
1870{
1871        unsigned long interval = HZ;
1872
1873        /* This task has no NUMA fault statistics yet */
1874        if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
1875                return;
1876
1877        /* Periodically retry migrating the task to the preferred node */
1878        interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1879        p->numa_migrate_retry = jiffies + interval;
1880
1881        /* Success if task is already running on preferred CPU */
1882        if (task_node(p) == p->numa_preferred_nid)
1883                return;
1884
1885        /* Otherwise, try migrate to a CPU on the preferred node */
1886        task_numa_migrate(p);
1887}
1888
1889/*
1890 * Find out how many nodes on the workload is actively running on. Do this by
1891 * tracking the nodes from which NUMA hinting faults are triggered. This can
1892 * be different from the set of nodes where the workload's memory is currently
1893 * located.
1894 */
1895static void numa_group_count_active_nodes(struct numa_group *numa_group)
1896{
1897        unsigned long faults, max_faults = 0;
1898        int nid, active_nodes = 0;
1899
1900        for_each_online_node(nid) {
1901                faults = group_faults_cpu(numa_group, nid);
1902                if (faults > max_faults)
1903                        max_faults = faults;
1904        }
1905
1906        for_each_online_node(nid) {
1907                faults = group_faults_cpu(numa_group, nid);
1908                if (faults * ACTIVE_NODE_FRACTION > max_faults)
1909                        active_nodes++;
1910        }
1911
1912        numa_group->max_faults_cpu = max_faults;
1913        numa_group->active_nodes = active_nodes;
1914}
1915
1916/*
1917 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1918 * increments. The more local the fault statistics are, the higher the scan
1919 * period will be for the next scan window. If local/(local+remote) ratio is
1920 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1921 * the scan period will decrease. Aim for 70% local accesses.
1922 */
1923#define NUMA_PERIOD_SLOTS 10
1924#define NUMA_PERIOD_THRESHOLD 7
1925
1926/*
1927 * Increase the scan period (slow down scanning) if the majority of
1928 * our memory is already on our local node, or if the majority of
1929 * the page accesses are shared with other processes.
1930 * Otherwise, decrease the scan period.
1931 */
1932static void update_task_scan_period(struct task_struct *p,
1933                        unsigned long shared, unsigned long private)
1934{
1935        unsigned int period_slot;
1936        int lr_ratio, ps_ratio;
1937        int diff;
1938
1939        unsigned long remote = p->numa_faults_locality[0];
1940        unsigned long local = p->numa_faults_locality[1];
1941
1942        /*
1943         * If there were no record hinting faults then either the task is
1944         * completely idle or all activity is areas that are not of interest
1945         * to automatic numa balancing. Related to that, if there were failed
1946         * migration then it implies we are migrating too quickly or the local
1947         * node is overloaded. In either case, scan slower
1948         */
1949        if (local + shared == 0 || p->numa_faults_locality[2]) {
1950                p->numa_scan_period = min(p->numa_scan_period_max,
1951                        p->numa_scan_period << 1);
1952
1953                p->mm->numa_next_scan = jiffies +
1954                        msecs_to_jiffies(p->numa_scan_period);
1955
1956                return;
1957        }
1958
1959        /*
1960         * Prepare to scale scan period relative to the current period.
1961         *       == NUMA_PERIOD_THRESHOLD scan period stays the same
1962         *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1963         *       >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1964         */
1965        period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1966        lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1967        ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
1968
1969        if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
1970                /*
1971                 * Most memory accesses are local. There is no need to
1972                 * do fast NUMA scanning, since memory is already local.
1973                 */
1974                int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
1975                if (!slot)
1976                        slot = 1;
1977                diff = slot * period_slot;
1978        } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
1979                /*
1980                 * Most memory accesses are shared with other tasks.
1981                 * There is no point in continuing fast NUMA scanning,
1982                 * since other tasks may just move the memory elsewhere.
1983                 */
1984                int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
1985                if (!slot)
1986                        slot = 1;
1987                diff = slot * period_slot;
1988        } else {
1989                /*
1990                 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
1991                 * yet they are not on the local NUMA node. Speed up
1992                 * NUMA scanning to get the memory moved over.
1993                 */
1994                int ratio = max(lr_ratio, ps_ratio);
1995                diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1996        }
1997
1998        p->numa_scan_period = clamp(p->numa_scan_period + diff,
1999                        task_scan_min(p), task_scan_max(p));
2000        memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2001}
2002
2003/*
2004 * Get the fraction of time the task has been running since the last
2005 * NUMA placement cycle. The scheduler keeps similar statistics, but
2006 * decays those on a 32ms period, which is orders of magnitude off
2007 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
2008 * stats only if the task is so new there are no NUMA statistics yet.
2009 */
2010static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2011{
2012        u64 runtime, delta, now;
2013        /* Use the start of this time slice to avoid calculations. */
2014        now = p->se.exec_start;
2015        runtime = p->se.sum_exec_runtime;
2016
2017        if (p->last_task_numa_placement) {
2018                delta = runtime - p->last_sum_exec_runtime;
2019                *period = now - p->last_task_numa_placement;
2020
2021                /* Avoid time going backwards, prevent potential divide error: */
2022                if (unlikely((s64)*period < 0))
2023                        *period = 0;
2024        } else {
2025                delta = p->se.avg.load_sum;
2026                *period = LOAD_AVG_MAX;
2027        }
2028
2029        p->last_sum_exec_runtime = runtime;
2030        p->last_task_numa_placement = now;
2031
2032        return delta;
2033}
2034
2035/*
2036 * Determine the preferred nid for a task in a numa_group. This needs to
2037 * be done in a way that produces consistent results with group_weight,
2038 * otherwise workloads might not converge.
2039 */
2040static int preferred_group_nid(struct task_struct *p, int nid)
2041{
2042        nodemask_t nodes;
2043        int dist;
2044
2045        /* Direct connections between all NUMA nodes. */
2046        if (sched_numa_topology_type == NUMA_DIRECT)
2047                return nid;
2048
2049        /*
2050         * On a system with glueless mesh NUMA topology, group_weight
2051         * scores nodes according to the number of NUMA hinting faults on
2052         * both the node itself, and on nearby nodes.
2053         */
2054        if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2055                unsigned long score, max_score = 0;
2056                int node, max_node = nid;
2057
2058                dist = sched_max_numa_distance;
2059
2060                for_each_online_node(node) {
2061                        score = group_weight(p, node, dist);
2062                        if (score > max_score) {
2063                                max_score = score;
2064                                max_node = node;
2065                        }
2066                }
2067                return max_node;
2068        }
2069
2070        /*
2071         * Finding the preferred nid in a system with NUMA backplane
2072         * interconnect topology is more involved. The goal is to locate
2073         * tasks from numa_groups near each other in the system, and
2074         * untangle workloads from different sides of the system. This requires
2075         * searching down the hierarchy of node groups, recursively searching
2076         * inside the highest scoring group of nodes. The nodemask tricks
2077         * keep the complexity of the search down.
2078         */
2079        nodes = node_online_map;
2080        for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2081                unsigned long max_faults = 0;
2082                nodemask_t max_group = NODE_MASK_NONE;
2083                int a, b;
2084
2085                /* Are there nodes at this distance from each other? */
2086                if (!find_numa_distance(dist))
2087                        continue;
2088
2089                for_each_node_mask(a, nodes) {
2090                        unsigned long faults = 0;
2091                        nodemask_t this_group;
2092                        nodes_clear(this_group);
2093
2094                        /* Sum group's NUMA faults; includes a==b case. */
2095                        for_each_node_mask(b, nodes) {
2096                                if (node_distance(a, b) < dist) {
2097                                        faults += group_faults(p, b);
2098                                        node_set(b, this_group);
2099                                        node_clear(b, nodes);
2100                                }
2101                        }
2102
2103                        /* Remember the top group. */
2104                        if (faults > max_faults) {
2105                                max_faults = faults;
2106                                max_group = this_group;
2107                                /*
2108                                 * subtle: at the smallest distance there is
2109                                 * just one node left in each "group", the
2110                                 * winner is the preferred nid.
2111                                 */
2112                                nid = a;
2113                        }
2114                }
2115                /* Next round, evaluate the nodes within max_group. */
2116                if (!max_faults)
2117                        break;
2118                nodes = max_group;
2119        }
2120        return nid;
2121}
2122
2123static void task_numa_placement(struct task_struct *p)
2124{
2125        int seq, nid, max_nid = NUMA_NO_NODE;
2126        unsigned long max_faults = 0;
2127        unsigned long fault_types[2] = { 0, 0 };
2128        unsigned long total_faults;
2129        u64 runtime, period;
2130        spinlock_t *group_lock = NULL;
2131        struct numa_group *ng;
2132
2133        /*
2134         * The p->mm->numa_scan_seq field gets updated without
2135         * exclusive access. Use READ_ONCE() here to ensure
2136         * that the field is read in a single access:
2137         */
2138        seq = READ_ONCE(p->mm->numa_scan_seq);
2139        if (p->numa_scan_seq == seq)
2140                return;
2141        p->numa_scan_seq = seq;
2142        p->numa_scan_period_max = task_scan_max(p);
2143
2144        total_faults = p->numa_faults_locality[0] +
2145                       p->numa_faults_locality[1];
2146        runtime = numa_get_avg_runtime(p, &period);
2147
2148        /* If the task is part of a group prevent parallel updates to group stats */
2149        ng = deref_curr_numa_group(p);
2150        if (ng) {
2151                group_lock = &ng->lock;
2152                spin_lock_irq(group_lock);
2153        }
2154
2155        /* Find the node with the highest number of faults */
2156        for_each_online_node(nid) {
2157                /* Keep track of the offsets in numa_faults array */
2158                int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2159                unsigned long faults = 0, group_faults = 0;
2160                int priv;
2161
2162                for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2163                        long diff, f_diff, f_weight;
2164
2165                        mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2166                        membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2167                        cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2168                        cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2169
2170                        /* Decay existing window, copy faults since last scan */
2171                        diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2172                        fault_types[priv] += p->numa_faults[membuf_idx];
2173                        p->numa_faults[membuf_idx] = 0;
2174
2175                        /*
2176                         * Normalize the faults_from, so all tasks in a group
2177                         * count according to CPU use, instead of by the raw
2178                         * number of faults. Tasks with little runtime have
2179                         * little over-all impact on throughput, and thus their
2180                         * faults are less important.
2181                         */
2182                        f_weight = div64_u64(runtime << 16, period + 1);
2183                        f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2184                                   (total_faults + 1);
2185                        f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2186                        p->numa_faults[cpubuf_idx] = 0;
2187
2188                        p->numa_faults[mem_idx] += diff;
2189                        p->numa_faults[cpu_idx] += f_diff;
2190                        faults += p->numa_faults[mem_idx];
2191                        p->total_numa_faults += diff;
2192                        if (ng) {
2193                                /*
2194                                 * safe because we can only change our own group
2195                                 *
2196                                 * mem_idx represents the offset for a given
2197                                 * nid and priv in a specific region because it
2198                                 * is at the beginning of the numa_faults array.
2199                                 */
2200                                ng->faults[mem_idx] += diff;
2201                                ng->faults_cpu[mem_idx] += f_diff;
2202                                ng->total_faults += diff;
2203                                group_faults += ng->faults[mem_idx];
2204                        }
2205                }
2206
2207                if (!ng) {
2208                        if (faults > max_faults) {
2209                                max_faults = faults;
2210                                max_nid = nid;
2211                        }
2212                } else if (group_faults > max_faults) {
2213                        max_faults = group_faults;
2214                        max_nid = nid;
2215                }
2216        }
2217
2218        if (ng) {
2219                numa_group_count_active_nodes(ng);
2220                spin_unlock_irq(group_lock);
2221                max_nid = preferred_group_nid(p, max_nid);
2222        }
2223
2224        if (max_faults) {
2225                /* Set the new preferred node */
2226                if (max_nid != p->numa_preferred_nid)
2227                        sched_setnuma(p, max_nid);
2228        }
2229
2230        update_task_scan_period(p, fault_types[0], fault_types[1]);
2231}
2232
2233static inline int get_numa_group(struct numa_group *grp)
2234{
2235        return refcount_inc_not_zero(&grp->refcount);
2236}
2237
2238static inline void put_numa_group(struct numa_group *grp)
2239{
2240        if (refcount_dec_and_test(&grp->refcount))
2241                kfree_rcu(grp, rcu);
2242}
2243
2244static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2245                        int *priv)
2246{
2247        struct numa_group *grp, *my_grp;
2248        struct task_struct *tsk;
2249        bool join = false;
2250        int cpu = cpupid_to_cpu(cpupid);
2251        int i;
2252
2253        if (unlikely(!deref_curr_numa_group(p))) {
2254                unsigned int size = sizeof(struct numa_group) +
2255                                    4*nr_node_ids*sizeof(unsigned long);
2256
2257                grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2258                if (!grp)
2259                        return;
2260
2261                refcount_set(&grp->refcount, 1);
2262                grp->active_nodes = 1;
2263                grp->max_faults_cpu = 0;
2264                spin_lock_init(&grp->lock);
2265                grp->gid = p->pid;
2266                /* Second half of the array tracks nids where faults happen */
2267                grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2268                                                nr_node_ids;
2269
2270                for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2271                        grp->faults[i] = p->numa_faults[i];
2272
2273                grp->total_faults = p->total_numa_faults;
2274
2275                grp->nr_tasks++;
2276                rcu_assign_pointer(p->numa_group, grp);
2277        }
2278
2279        rcu_read_lock();
2280        tsk = READ_ONCE(cpu_rq(cpu)->curr);
2281
2282        if (!cpupid_match_pid(tsk, cpupid))
2283                goto no_join;
2284
2285        grp = rcu_dereference(tsk->numa_group);
2286        if (!grp)
2287                goto no_join;
2288
2289        my_grp = deref_curr_numa_group(p);
2290        if (grp == my_grp)
2291                goto no_join;
2292
2293        /*
2294         * Only join the other group if its bigger; if we're the bigger group,
2295         * the other task will join us.
2296         */
2297        if (my_grp->nr_tasks > grp->nr_tasks)
2298                goto no_join;
2299
2300        /*
2301         * Tie-break on the grp address.
2302         */
2303        if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2304                goto no_join;
2305
2306        /* Always join threads in the same process. */
2307        if (tsk->mm == current->mm)
2308                join = true;
2309
2310        /* Simple filter to avoid false positives due to PID collisions */
2311        if (flags & TNF_SHARED)
2312                join = true;
2313
2314        /* Update priv based on whether false sharing was detected */
2315        *priv = !join;
2316
2317        if (join && !get_numa_group(grp))
2318                goto no_join;
2319
2320        rcu_read_unlock();
2321
2322        if (!join)
2323                return;
2324
2325        BUG_ON(irqs_disabled());
2326        double_lock_irq(&my_grp->lock, &grp->lock);
2327
2328        for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2329                my_grp->faults[i] -= p->numa_faults[i];
2330                grp->faults[i] += p->numa_faults[i];
2331        }
2332        my_grp->total_faults -= p->total_numa_faults;
2333        grp->total_faults += p->total_numa_faults;
2334
2335        my_grp->nr_tasks--;
2336        grp->nr_tasks++;
2337
2338        spin_unlock(&my_grp->lock);
2339        spin_unlock_irq(&grp->lock);
2340
2341        rcu_assign_pointer(p->numa_group, grp);
2342
2343        put_numa_group(my_grp);
2344        return;
2345
2346no_join:
2347        rcu_read_unlock();
2348        return;
2349}
2350
2351/*
2352 * Get rid of NUMA staticstics associated with a task (either current or dead).
2353 * If @final is set, the task is dead and has reached refcount zero, so we can
2354 * safely free all relevant data structures. Otherwise, there might be
2355 * concurrent reads from places like load balancing and procfs, and we should
2356 * reset the data back to default state without freeing ->numa_faults.
2357 */
2358void task_numa_free(struct task_struct *p, bool final)
2359{
2360        /* safe: p either is current or is being freed by current */
2361        struct numa_group *grp = rcu_dereference_raw(p->numa_group);
2362        unsigned long *numa_faults = p->numa_faults;
2363        unsigned long flags;
2364        int i;
2365
2366        if (!numa_faults)
2367                return;
2368
2369        if (grp) {
2370                spin_lock_irqsave(&grp->lock, flags);
2371                for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2372                        grp->faults[i] -= p->numa_faults[i];
2373                grp->total_faults -= p->total_numa_faults;
2374
2375                grp->nr_tasks--;
2376                spin_unlock_irqrestore(&grp->lock, flags);
2377                RCU_INIT_POINTER(p->numa_group, NULL);
2378                put_numa_group(grp);
2379        }
2380
2381        if (final) {
2382                p->numa_faults = NULL;
2383                kfree(numa_faults);
2384        } else {
2385                p->total_numa_faults = 0;
2386                for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2387                        numa_faults[i] = 0;
2388        }
2389}
2390
2391/*
2392 * Got a PROT_NONE fault for a page on @node.
2393 */
2394void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2395{
2396        struct task_struct *p = current;
2397        bool migrated = flags & TNF_MIGRATED;
2398        int cpu_node = task_node(current);
2399        int local = !!(flags & TNF_FAULT_LOCAL);
2400        struct numa_group *ng;
2401        int priv;
2402
2403        if (!static_branch_likely(&sched_numa_balancing))
2404                return;
2405
2406        /* for example, ksmd faulting in a user's mm */
2407        if (!p->mm)
2408                return;
2409
2410        /* Allocate buffer to track faults on a per-node basis */
2411        if (unlikely(!p->numa_faults)) {
2412                int size = sizeof(*p->numa_faults) *
2413                           NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2414
2415                p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2416                if (!p->numa_faults)
2417                        return;
2418
2419                p->total_numa_faults = 0;
2420                memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2421        }
2422
2423        /*
2424         * First accesses are treated as private, otherwise consider accesses
2425         * to be private if the accessing pid has not changed
2426         */
2427        if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2428                priv = 1;
2429        } else {
2430                priv = cpupid_match_pid(p, last_cpupid);
2431                if (!priv && !(flags & TNF_NO_GROUP))
2432                        task_numa_group(p, last_cpupid, flags, &priv);
2433        }
2434
2435        /*
2436         * If a workload spans multiple NUMA nodes, a shared fault that
2437         * occurs wholly within the set of nodes that the workload is
2438         * actively using should be counted as local. This allows the
2439         * scan rate to slow down when a workload has settled down.
2440         */
2441        ng = deref_curr_numa_group(p);
2442        if (!priv && !local && ng && ng->active_nodes > 1 &&
2443                                numa_is_active_node(cpu_node, ng) &&
2444                                numa_is_active_node(mem_node, ng))
2445                local = 1;
2446
2447        /*
2448         * Retry to migrate task to preferred node periodically, in case it
2449         * previously failed, or the scheduler moved us.
2450         */
2451        if (time_after(jiffies, p->numa_migrate_retry)) {
2452                task_numa_placement(p);
2453                numa_migrate_preferred(p);
2454        }
2455
2456        if (migrated)
2457                p->numa_pages_migrated += pages;
2458        if (flags & TNF_MIGRATE_FAIL)
2459                p->numa_faults_locality[2] += pages;
2460
2461        p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2462        p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2463        p->numa_faults_locality[local] += pages;
2464}
2465
2466static void reset_ptenuma_scan(struct task_struct *p)
2467{
2468        /*
2469         * We only did a read acquisition of the mmap sem, so
2470         * p->mm->numa_scan_seq is written to without exclusive access
2471         * and the update is not guaranteed to be atomic. That's not
2472         * much of an issue though, since this is just used for
2473         * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2474         * expensive, to avoid any form of compiler optimizations:
2475         */
2476        WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2477        p->mm->numa_scan_offset = 0;
2478}
2479
2480/*
2481 * The expensive part of numa migration is done from task_work context.
2482 * Triggered from task_tick_numa().
2483 */
2484static void task_numa_work(struct callback_head *work)
2485{
2486        unsigned long migrate, next_scan, now = jiffies;
2487        struct task_struct *p = current;
2488        struct mm_struct *mm = p->mm;
2489        u64 runtime = p->se.sum_exec_runtime;
2490        struct vm_area_struct *vma;
2491        unsigned long start, end;
2492        unsigned long nr_pte_updates = 0;
2493        long pages, virtpages;
2494
2495        SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2496
2497        work->next = work;
2498        /*
2499         * Who cares about NUMA placement when they're dying.
2500         *
2501         * NOTE: make sure not to dereference p->mm before this check,
2502         * exit_task_work() happens _after_ exit_mm() so we could be called
2503         * without p->mm even though we still had it when we enqueued this
2504         * work.
2505         */
2506        if (p->flags & PF_EXITING)
2507                return;
2508
2509        if (!mm->numa_next_scan) {
2510                mm->numa_next_scan = now +
2511                        msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2512        }
2513
2514        /*
2515         * Enforce maximal scan/migration frequency..
2516         */
2517        migrate = mm->numa_next_scan;
2518        if (time_before(now, migrate))
2519                return;
2520
2521        if (p->numa_scan_period == 0) {
2522                p->numa_scan_period_max = task_scan_max(p);
2523                p->numa_scan_period = task_scan_start(p);
2524        }
2525
2526        next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2527        if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2528                return;
2529
2530        /*
2531         * Delay this task enough that another task of this mm will likely win
2532         * the next time around.
2533         */
2534        p->node_stamp += 2 * TICK_NSEC;
2535
2536        start = mm->numa_scan_offset;
2537        pages = sysctl_numa_balancing_scan_size;
2538        pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2539        virtpages = pages * 8;     /* Scan up to this much virtual space */
2540        if (!pages)
2541                return;
2542
2543
2544        if (!down_read_trylock(&mm->mmap_sem))
2545                return;
2546        vma = find_vma(mm, start);
2547        if (!vma) {
2548                reset_ptenuma_scan(p);
2549                start = 0;
2550                vma = mm->mmap;
2551        }
2552        for (; vma; vma = vma->vm_next) {
2553                if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2554                        is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2555                        continue;
2556                }
2557
2558                /*
2559                 * Shared library pages mapped by multiple processes are not
2560                 * migrated as it is expected they are cache replicated. Avoid
2561                 * hinting faults in read-only file-backed mappings or the vdso
2562                 * as migrating the pages will be of marginal benefit.
2563                 */
2564                if (!vma->vm_mm ||
2565                    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2566                        continue;
2567
2568                /*
2569                 * Skip inaccessible VMAs to avoid any confusion between
2570                 * PROT_NONE and NUMA hinting ptes
2571                 */
2572                if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2573                        continue;
2574
2575                do {
2576                        start = max(start, vma->vm_start);
2577                        end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2578                        end = min(end, vma->vm_end);
2579                        nr_pte_updates = change_prot_numa(vma, start, end);
2580
2581                        /*
2582                         * Try to scan sysctl_numa_balancing_size worth of
2583                         * hpages that have at least one present PTE that
2584                         * is not already pte-numa. If the VMA contains
2585                         * areas that are unused or already full of prot_numa
2586                         * PTEs, scan up to virtpages, to skip through those
2587                         * areas faster.
2588                         */
2589                        if (nr_pte_updates)
2590                                pages -= (end - start) >> PAGE_SHIFT;
2591                        virtpages -= (end - start) >> PAGE_SHIFT;
2592
2593                        start = end;
2594                        if (pages <= 0 || virtpages <= 0)
2595                                goto out;
2596
2597                        cond_resched();
2598                } while (end != vma->vm_end);
2599        }
2600
2601out:
2602        /*
2603         * It is possible to reach the end of the VMA list but the last few
2604         * VMAs are not guaranteed to the vma_migratable. If they are not, we
2605         * would find the !migratable VMA on the next scan but not reset the
2606         * scanner to the start so check it now.
2607         */
2608        if (vma)
2609                mm->numa_scan_offset = start;
2610        else
2611                reset_ptenuma_scan(p);
2612        up_read(&mm->mmap_sem);
2613
2614        /*
2615         * Make sure tasks use at least 32x as much time to run other code
2616         * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2617         * Usually update_task_scan_period slows down scanning enough; on an
2618         * overloaded system we need to limit overhead on a per task basis.
2619         */
2620        if (unlikely(p->se.sum_exec_runtime != runtime)) {
2621                u64 diff = p->se.sum_exec_runtime - runtime;
2622                p->node_stamp += 32 * diff;
2623        }
2624}
2625
2626void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2627{
2628        int mm_users = 0;
2629        struct mm_struct *mm = p->mm;
2630
2631        if (mm) {
2632                mm_users = atomic_read(&mm->mm_users);
2633                if (mm_users == 1) {
2634                        mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2635                        mm->numa_scan_seq = 0;
2636                }
2637        }
2638        p->node_stamp                   = 0;
2639        p->numa_scan_seq                = mm ? mm->numa_scan_seq : 0;
2640        p->numa_scan_period             = sysctl_numa_balancing_scan_delay;
2641        /* Protect against double add, see task_tick_numa and task_numa_work */
2642        p->numa_work.next               = &p->numa_work;
2643        p->numa_faults                  = NULL;
2644        RCU_INIT_POINTER(p->numa_group, NULL);
2645        p->last_task_numa_placement     = 0;
2646        p->last_sum_exec_runtime        = 0;
2647
2648        init_task_work(&p->numa_work, task_numa_work);
2649
2650        /* New address space, reset the preferred nid */
2651        if (!(clone_flags & CLONE_VM)) {
2652                p->numa_preferred_nid = NUMA_NO_NODE;
2653                return;
2654        }
2655
2656        /*
2657         * New thread, keep existing numa_preferred_nid which should be copied
2658         * already by arch_dup_task_struct but stagger when scans start.
2659         */
2660        if (mm) {
2661                unsigned int delay;
2662
2663                delay = min_t(unsigned int, task_scan_max(current),
2664                        current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2665                delay += 2 * TICK_NSEC;
2666                p->node_stamp = delay;
2667        }
2668}
2669
2670/*
2671 * Drive the periodic memory faults..
2672 */
2673static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2674{
2675        struct callback_head *work = &curr->numa_work;
2676        u64 period, now;
2677
2678        /*
2679         * We don't care about NUMA placement if we don't have memory.
2680         */
2681        if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2682                return;
2683
2684        /*
2685         * Using runtime rather than walltime has the dual advantage that
2686         * we (mostly) drive the selection from busy threads and that the
2687         * task needs to have done some actual work before we bother with
2688         * NUMA placement.
2689         */
2690        now = curr->se.sum_exec_runtime;
2691        period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2692
2693        if (now > curr->node_stamp + period) {
2694                if (!curr->node_stamp)
2695                        curr->numa_scan_period = task_scan_start(curr);
2696                curr->node_stamp += period;
2697
2698                if (!time_before(jiffies, curr->mm->numa_next_scan))
2699                        task_work_add(curr, work, true);
2700        }
2701}
2702
2703static void update_scan_period(struct task_struct *p, int new_cpu)
2704{
2705        int src_nid = cpu_to_node(task_cpu(p));
2706        int dst_nid = cpu_to_node(new_cpu);
2707
2708        if (!static_branch_likely(&sched_numa_balancing))
2709                return;
2710
2711        if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
2712                return;
2713
2714        if (src_nid == dst_nid)
2715                return;
2716
2717        /*
2718         * Allow resets if faults have been trapped before one scan
2719         * has completed. This is most likely due to a new task that
2720         * is pulled cross-node due to wakeups or load balancing.
2721         */
2722        if (p->numa_scan_seq) {
2723                /*
2724                 * Avoid scan adjustments if moving to the preferred
2725                 * node or if the task was not previously running on
2726                 * the preferred node.
2727                 */
2728                if (dst_nid == p->numa_preferred_nid ||
2729                    (p->numa_preferred_nid != NUMA_NO_NODE &&
2730                        src_nid != p->numa_preferred_nid))
2731                        return;
2732        }
2733
2734        p->numa_scan_period = task_scan_start(p);
2735}
2736
2737#else
2738static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2739{
2740}
2741
2742static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2743{
2744}
2745
2746static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2747{
2748}
2749
2750static inline void update_scan_period(struct task_struct *p, int new_cpu)
2751{
2752}
2753
2754#endif /* CONFIG_NUMA_BALANCING */
2755
2756static void
2757account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2758{
2759        update_load_add(&cfs_rq->load, se->load.weight);
2760#ifdef CONFIG_SMP
2761        if (entity_is_task(se)) {
2762                struct rq *rq = rq_of(cfs_rq);
2763
2764                account_numa_enqueue(rq, task_of(se));
2765                list_add(&se->group_node, &rq->cfs_tasks);
2766        }
2767#endif
2768        cfs_rq->nr_running++;
2769}
2770
2771static void
2772account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2773{
2774        update_load_sub(&cfs_rq->load, se->load.weight);
2775#ifdef CONFIG_SMP
2776        if (entity_is_task(se)) {
2777                account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2778                list_del_init(&se->group_node);
2779        }
2780#endif
2781        cfs_rq->nr_running--;
2782}
2783
2784/*
2785 * Signed add and clamp on underflow.
2786 *
2787 * Explicitly do a load-store to ensure the intermediate value never hits
2788 * memory. This allows lockless observations without ever seeing the negative
2789 * values.
2790 */
2791#define add_positive(_ptr, _val) do {                           \
2792        typeof(_ptr) ptr = (_ptr);                              \
2793        typeof(_val) val = (_val);                              \
2794        typeof(*ptr) res, var = READ_ONCE(*ptr);                \
2795                                                                \
2796        res = var + val;                                        \
2797                                                                \
2798        if (val < 0 && res > var)                               \
2799                res = 0;                                        \
2800                                                                \
2801        WRITE_ONCE(*ptr, res);                                  \
2802} while (0)
2803
2804/*
2805 * Unsigned subtract and clamp on underflow.
2806 *
2807 * Explicitly do a load-store to ensure the intermediate value never hits
2808 * memory. This allows lockless observations without ever seeing the negative
2809 * values.
2810 */
2811#define sub_positive(_ptr, _val) do {                           \
2812        typeof(_ptr) ptr = (_ptr);                              \
2813        typeof(*ptr) val = (_val);                              \
2814        typeof(*ptr) res, var = READ_ONCE(*ptr);                \
2815        res = var - val;                                        \
2816        if (res > var)                                          \
2817                res = 0;                                        \
2818        WRITE_ONCE(*ptr, res);                                  \
2819} while (0)
2820
2821/*
2822 * Remove and clamp on negative, from a local variable.
2823 *
2824 * A variant of sub_positive(), which does not use explicit load-store
2825 * and is thus optimized for local variable updates.
2826 */
2827#define lsub_positive(_ptr, _val) do {                          \
2828        typeof(_ptr) ptr = (_ptr);                              \
2829        *ptr -= min_t(typeof(*ptr), *ptr, _val);                \
2830} while (0)
2831
2832#ifdef CONFIG_SMP
2833static inline void
2834enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2835{
2836        cfs_rq->runnable_weight += se->runnable_weight;
2837
2838        cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2839        cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
2840}
2841
2842static inline void
2843dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2844{
2845        cfs_rq->runnable_weight -= se->runnable_weight;
2846
2847        sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2848        sub_positive(&cfs_rq->avg.runnable_load_sum,
2849                     se_runnable(se) * se->avg.runnable_load_sum);
2850}
2851
2852static inline void
2853enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2854{
2855        cfs_rq->avg.load_avg += se->avg.load_avg;
2856        cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
2857}
2858
2859static inline void
2860dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2861{
2862        sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2863        sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
2864}
2865#else
2866static inline void
2867enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2868static inline void
2869dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2870static inline void
2871enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2872static inline void
2873dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2874#endif
2875
2876static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2877                            unsigned long weight, unsigned long runnable)
2878{
2879        if (se->on_rq) {
2880                /* commit outstanding execution time */
2881                if (cfs_rq->curr == se)
2882                        update_curr(cfs_rq);
2883                account_entity_dequeue(cfs_rq, se);
2884                dequeue_runnable_load_avg(cfs_rq, se);
2885        }
2886        dequeue_load_avg(cfs_rq, se);
2887
2888        se->runnable_weight = runnable;
2889        update_load_set(&se->load, weight);
2890
2891#ifdef CONFIG_SMP
2892        do {
2893                u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
2894
2895                se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2896                se->avg.runnable_load_avg =
2897                        div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
2898        } while (0);
2899#endif
2900
2901        enqueue_load_avg(cfs_rq, se);
2902        if (se->on_rq) {
2903                account_entity_enqueue(cfs_rq, se);
2904                enqueue_runnable_load_avg(cfs_rq, se);
2905        }
2906}
2907
2908void reweight_task(struct task_struct *p, int prio)
2909{
2910        struct sched_entity *se = &p->se;
2911        struct cfs_rq *cfs_rq = cfs_rq_of(se);
2912        struct load_weight *load = &se->load;
2913        unsigned long weight = scale_load(sched_prio_to_weight[prio]);
2914
2915        reweight_entity(cfs_rq, se, weight, weight);
2916        load->inv_weight = sched_prio_to_wmult[prio];
2917}
2918
2919#ifdef CONFIG_FAIR_GROUP_SCHED
2920#ifdef CONFIG_SMP
2921/*
2922 * All this does is approximate the hierarchical proportion which includes that
2923 * global sum we all love to hate.
2924 *
2925 * That is, the weight of a group entity, is the proportional share of the
2926 * group weight based on the group runqueue weights. That is:
2927 *
2928 *                     tg->weight * grq->load.weight
2929 *   ge->load.weight = -----------------------------               (1)
2930 *                        \Sum grq->load.weight
2931 *
2932 * Now, because computing that sum is prohibitively expensive to compute (been
2933 * there, done that) we approximate it with this average stuff. The average
2934 * moves slower and therefore the approximation is cheaper and more stable.
2935 *
2936 * So instead of the above, we substitute:
2937 *
2938 *   grq->load.weight -> grq->avg.load_avg                         (2)
2939 *
2940 * which yields the following:
2941 *
2942 *                     tg->weight * grq->avg.load_avg
2943 *   ge->load.weight = ------------------------------              (3)
2944 *                              tg->load_avg
2945 *
2946 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
2947 *
2948 * That is shares_avg, and it is right (given the approximation (2)).
2949 *
2950 * The problem with it is that because the average is slow -- it was designed
2951 * to be exactly that of course -- this leads to transients in boundary
2952 * conditions. In specific, the case where the group was idle and we start the
2953 * one task. It takes time for our CPU's grq->avg.load_avg to build up,
2954 * yielding bad latency etc..
2955 *
2956 * Now, in that special case (1) reduces to:
2957 *
2958 *                     tg->weight * grq->load.weight
2959 *   ge->load.weight = ----------------------------- = tg->weight   (4)
2960 *                          grp->load.weight
2961 *
2962 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
2963 *
2964 * So what we do is modify our approximation (3) to approach (4) in the (near)
2965 * UP case, like:
2966 *
2967 *   ge->load.weight =
2968 *
2969 *              tg->weight * grq->load.weight
2970 *     ---------------------------------------------------         (5)
2971 *     tg->load_avg - grq->avg.load_avg + grq->load.weight
2972 *
2973 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
2974 * we need to use grq->avg.load_avg as its lower bound, which then gives:
2975 *
2976 *
2977 *                     tg->weight * grq->load.weight
2978 *   ge->load.weight = -----------------------------               (6)
2979 *                              tg_load_avg'
2980 *
2981 * Where:
2982 *
2983 *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
2984 *                  max(grq->load.weight, grq->avg.load_avg)
2985 *
2986 * And that is shares_weight and is icky. In the (near) UP case it approaches
2987 * (4) while in the normal case it approaches (3). It consistently
2988 * overestimates the ge->load.weight and therefore:
2989 *
2990 *   \Sum ge->load.weight >= tg->weight
2991 *
2992 * hence icky!
2993 */
2994static long calc_group_shares(struct cfs_rq *cfs_rq)
2995{
2996        long tg_weight, tg_shares, load, shares;
2997        struct task_group *tg = cfs_rq->tg;
2998
2999        tg_shares = READ_ONCE(tg->shares);
3000
3001        load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3002
3003        tg_weight = atomic_long_read(&tg->load_avg);
3004
3005        /* Ensure tg_weight >= load */
3006        tg_weight -= cfs_rq->tg_load_avg_contrib;
3007        tg_weight += load;
3008
3009        shares = (tg_shares * load);
3010        if (tg_weight)
3011                shares /= tg_weight;
3012
3013        /*
3014         * MIN_SHARES has to be unscaled here to support per-CPU partitioning
3015         * of a group with small tg->shares value. It is a floor value which is
3016         * assigned as a minimum load.weight to the sched_entity representing
3017         * the group on a CPU.
3018         *
3019         * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
3020         * on an 8-core system with 8 tasks each runnable on one CPU shares has
3021         * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
3022         * case no task is runnable on a CPU MIN_SHARES=2 should be returned
3023         * instead of 0.
3024         */
3025        return clamp_t(long, shares, MIN_SHARES, tg_shares);
3026}
3027
3028/*
3029 * This calculates the effective runnable weight for a group entity based on
3030 * the group entity weight calculated above.
3031 *
3032 * Because of the above approximation (2), our group entity weight is
3033 * an load_avg based ratio (3). This means that it includes blocked load and
3034 * does not represent the runnable weight.
3035 *
3036 * Approximate the group entity's runnable weight per ratio from the group
3037 * runqueue:
3038 *
3039 *                                           grq->avg.runnable_load_avg
3040 *   ge->runnable_weight = ge->load.weight * -------------------------- (7)
3041 *                                               grq->avg.load_avg
3042 *
3043 * However, analogous to above, since the avg numbers are slow, this leads to
3044 * transients in the from-idle case. Instead we use:
3045 *
3046 *   ge->runnable_weight = ge->load.weight *
3047 *
3048 *              max(grq->avg.runnable_load_avg, grq->runnable_weight)
3049 *              -----------------------------------------------------   (8)
3050 *                    max(grq->avg.load_avg, grq->load.weight)
3051 *
3052 * Where these max() serve both to use the 'instant' values to fix the slow
3053 * from-idle and avoid the /0 on to-idle, similar to (6).
3054 */
3055static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
3056{
3057        long runnable, load_avg;
3058
3059        load_avg = max(cfs_rq->avg.load_avg,
3060                       scale_load_down(cfs_rq->load.weight));
3061
3062        runnable = max(cfs_rq->avg.runnable_load_avg,
3063                       scale_load_down(cfs_rq->runnable_weight));
3064
3065        runnable *= shares;
3066        if (load_avg)
3067                runnable /= load_avg;
3068
3069        return clamp_t(long, runnable, MIN_SHARES, shares);
3070}
3071#endif /* CONFIG_SMP */
3072
3073static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3074
3075/*
3076 * Recomputes the group entity based on the current state of its group
3077 * runqueue.
3078 */
3079static void update_cfs_group(struct sched_entity *se)
3080{
3081        struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3082        long shares, runnable;
3083
3084        if (!gcfs_rq)
3085                return;
3086
3087        if (throttled_hierarchy(gcfs_rq))
3088                return;
3089
3090#ifndef CONFIG_SMP
3091        runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
3092
3093        if (likely(se->load.weight == shares))
3094                return;
3095#else
3096        shares   = calc_group_shares(gcfs_rq);
3097        runnable = calc_group_runnable(gcfs_rq, shares);
3098#endif
3099
3100        reweight_entity(cfs_rq_of(se), se, shares, runnable);
3101}
3102
3103#else /* CONFIG_FAIR_GROUP_SCHED */
3104static inline void update_cfs_group(struct sched_entity *se)
3105{
3106}
3107#endif /* CONFIG_FAIR_GROUP_SCHED */
3108
3109static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3110{
3111        struct rq *rq = rq_of(cfs_rq);
3112
3113        if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
3114                /*
3115                 * There are a few boundary cases this might miss but it should
3116                 * get called often enough that that should (hopefully) not be
3117                 * a real problem.
3118                 *
3119                 * It will not get called when we go idle, because the idle
3120                 * thread is a different class (!fair), nor will the utilization
3121                 * number include things like RT tasks.
3122                 *
3123                 * As is, the util number is not freq-invariant (we'd have to
3124                 * implement arch_scale_freq_capacity() for that).
3125                 *
3126                 * See cpu_util().
3127                 */
3128                cpufreq_update_util(rq, flags);
3129        }
3130}
3131
3132#ifdef CONFIG_SMP
3133#ifdef CONFIG_FAIR_GROUP_SCHED
3134/**
3135 * update_tg_load_avg - update the tg's load avg
3136 * @cfs_rq: the cfs_rq whose avg changed
3137 * @force: update regardless of how small the difference
3138 *
3139 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3140 * However, because tg->load_avg is a global value there are performance
3141 * considerations.
3142 *
3143 * In order to avoid having to look at the other cfs_rq's, we use a
3144 * differential update where we store the last value we propagated. This in
3145 * turn allows skipping updates if the differential is 'small'.
3146 *
3147 * Updating tg's load_avg is necessary before update_cfs_share().
3148 */
3149static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3150{
3151        long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3152
3153        /*
3154         * No need to update load_avg for root_task_group as it is not used.
3155         */
3156        if (cfs_rq->tg == &root_task_group)
3157                return;
3158
3159        if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3160                atomic_long_add(delta, &cfs_rq->tg->load_avg);
3161                cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3162        }
3163}
3164
3165/*
3166 * Called within set_task_rq() right before setting a task's CPU. The
3167 * caller only guarantees p->pi_lock is held; no other assumptions,
3168 * including the state of rq->lock, should be made.
3169 */
3170void set_task_rq_fair(struct sched_entity *se,
3171                      struct cfs_rq *prev, struct cfs_rq *next)
3172{
3173        u64 p_last_update_time;
3174        u64 n_last_update_time;
3175
3176        if (!sched_feat(ATTACH_AGE_LOAD))
3177                return;
3178
3179        /*
3180         * We are supposed to update the task to "current" time, then its up to
3181         * date and ready to go to new CPU/cfs_rq. But we have difficulty in
3182         * getting what current time is, so simply throw away the out-of-date
3183         * time. This will result in the wakee task is less decayed, but giving
3184         * the wakee more load sounds not bad.
3185         */
3186        if (!(se->avg.last_update_time && prev))
3187                return;
3188
3189#ifndef CONFIG_64BIT
3190        {
3191                u64 p_last_update_time_copy;
3192                u64 n_last_update_time_copy;
3193
3194                do {
3195                        p_last_update_time_copy = prev->load_last_update_time_copy;
3196                        n_last_update_time_copy = next->load_last_update_time_copy;
3197
3198                        smp_rmb();
3199
3200                        p_last_update_time = prev->avg.last_update_time;
3201                        n_last_update_time = next->avg.last_update_time;
3202
3203                } while (p_last_update_time != p_last_update_time_copy ||
3204                         n_last_update_time != n_last_update_time_copy);
3205        }
3206#else
3207        p_last_update_time = prev->avg.last_update_time;
3208        n_last_update_time = next->avg.last_update_time;
3209#endif
3210        __update_load_avg_blocked_se(p_last_update_time, se);
3211        se->avg.last_update_time = n_last_update_time;
3212}
3213
3214
3215/*
3216 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
3217 * propagate its contribution. The key to this propagation is the invariant
3218 * that for each group:
3219 *
3220 *   ge->avg == grq->avg                                                (1)
3221 *
3222 * _IFF_ we look at the pure running and runnable sums. Because they
3223 * represent the very same entity, just at different points in the hierarchy.
3224 *
3225 * Per the above update_tg_cfs_util() is trivial and simply copies the running
3226 * sum over (but still wrong, because the group entity and group rq do not have
3227 * their PELT windows aligned).
3228 *
3229 * However, update_tg_cfs_runnable() is more complex. So we have:
3230 *
3231 *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg          (2)
3232 *
3233 * And since, like util, the runnable part should be directly transferable,
3234 * the following would _appear_ to be the straight forward approach:
3235 *
3236 *   grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg       (3)
3237 *
3238 * And per (1) we have:
3239 *
3240 *   ge->avg.runnable_avg == grq->avg.runnable_avg
3241 *
3242 * Which gives:
3243 *
3244 *                      ge->load.weight * grq->avg.load_avg
3245 *   ge->avg.load_avg = -----------------------------------             (4)
3246 *                               grq->load.weight
3247 *
3248 * Except that is wrong!
3249 *
3250 * Because while for entities historical weight is not important and we
3251 * really only care about our future and therefore can consider a pure
3252 * runnable sum, runqueues can NOT do this.
3253 *
3254 * We specifically want runqueues to have a load_avg that includes
3255 * historical weights. Those represent the blocked load, the load we expect
3256 * to (shortly) return to us. This only works by keeping the weights as
3257 * integral part of the sum. We therefore cannot decompose as per (3).
3258 *
3259 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
3260 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
3261 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
3262 * runnable section of these tasks overlap (or not). If they were to perfectly
3263 * align the rq as a whole would be runnable 2/3 of the time. If however we
3264 * always have at least 1 runnable task, the rq as a whole is always runnable.
3265 *
3266 * So we'll have to approximate.. :/
3267 *
3268 * Given the constraint:
3269 *
3270 *   ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
3271 *
3272 * We can construct a rule that adds runnable to a rq by assuming minimal
3273 * overlap.
3274 *
3275 * On removal, we'll assume each task is equally runnable; which yields:
3276 *
3277 *   grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
3278 *
3279 * XXX: only do this for the part of runnable > running ?
3280 *
3281 */
3282
3283static inline void
3284update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3285{
3286        long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3287
3288        /* Nothing to update */
3289        if (!delta)
3290                return;
3291
3292        /*
3293         * The relation between sum and avg is:
3294         *
3295         *   LOAD_AVG_MAX - 1024 + sa->period_contrib
3296         *
3297         * however, the PELT windows are not aligned between grq and gse.
3298         */
3299
3300        /* Set new sched_entity's utilization */
3301        se->avg.util_avg = gcfs_rq->avg.util_avg;
3302        se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3303
3304        /* Update parent cfs_rq utilization */
3305        add_positive(&cfs_rq->avg.util_avg, delta);
3306        cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3307}
3308
3309static inline void
3310update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3311{
3312        long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3313        unsigned long runnable_load_avg, load_avg;
3314        u64 runnable_load_sum, load_sum = 0;
3315        s64 delta_sum;
3316
3317        if (!runnable_sum)
3318                return;
3319
3320        gcfs_rq->prop_runnable_sum = 0;
3321
3322        if (runnable_sum >= 0) {
3323                /*
3324                 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
3325                 * the CPU is saturated running == runnable.
3326                 */
3327                runnable_sum += se->avg.load_sum;
3328                runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3329        } else {
3330                /*
3331                 * Estimate the new unweighted runnable_sum of the gcfs_rq by
3332                 * assuming all tasks are equally runnable.
3333                 */
3334                if (scale_load_down(gcfs_rq->load.weight)) {
3335                        load_sum = div_s64(gcfs_rq->avg.load_sum,
3336                                scale_load_down(gcfs_rq->load.weight));
3337                }
3338
3339                /* But make sure to not inflate se's runnable */
3340                runnable_sum = min(se->avg.load_sum, load_sum);
3341        }
3342
3343        /*
3344         * runnable_sum can't be lower than running_sum
3345         * Rescale running sum to be in the same range as runnable sum
3346         * running_sum is in [0 : LOAD_AVG_MAX <<  SCHED_CAPACITY_SHIFT]
3347         * runnable_sum is in [0 : LOAD_AVG_MAX]
3348         */
3349        running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
3350        runnable_sum = max(runnable_sum, running_sum);
3351
3352        load_sum = (s64)se_weight(se) * runnable_sum;
3353        load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3354
3355        delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
3356        delta_avg = load_avg - se->avg.load_avg;
3357
3358        se->avg.load_sum = runnable_sum;
3359        se->avg.load_avg = load_avg;
3360        add_positive(&cfs_rq->avg.load_avg, delta_avg);
3361        add_positive(&cfs_rq->avg.load_sum, delta_sum);
3362
3363        runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3364        runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3365        delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3366        delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3367
3368        se->avg.runnable_load_sum = runnable_sum;
3369        se->avg.runnable_load_avg = runnable_load_avg;
3370
3371        if (se->on_rq) {
3372                add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3373                add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3374        }
3375}
3376
3377static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3378{
3379        cfs_rq->propagate = 1;
3380        cfs_rq->prop_runnable_sum += runnable_sum;
3381}
3382
3383/* Update task and its cfs_rq load average */
3384static inline int propagate_entity_load_avg(struct sched_entity *se)
3385{
3386        struct cfs_rq *cfs_rq, *gcfs_rq;
3387
3388        if (entity_is_task(se))
3389                return 0;
3390
3391        gcfs_rq = group_cfs_rq(se);
3392        if (!gcfs_rq->propagate)
3393                return 0;
3394
3395        gcfs_rq->propagate = 0;
3396
3397        cfs_rq = cfs_rq_of(se);
3398
3399        add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
3400
3401        update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3402        update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3403
3404        trace_pelt_cfs_tp(cfs_rq);
3405        trace_pelt_se_tp(se);
3406
3407        return 1;
3408}
3409
3410/*
3411 * Check if we need to update the load and the utilization of a blocked
3412 * group_entity:
3413 */
3414static inline bool skip_blocked_update(struct sched_entity *se)
3415{
3416        struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3417
3418        /*
3419         * If sched_entity still have not zero load or utilization, we have to
3420         * decay it:
3421         */
3422        if (se->avg.load_avg || se->avg.util_avg)
3423                return false;
3424
3425        /*
3426         * If there is a pending propagation, we have to update the load and
3427         * the utilization of the sched_entity:
3428         */
3429        if (gcfs_rq->propagate)
3430                return false;
3431
3432        /*
3433         * Otherwise, the load and the utilization of the sched_entity is
3434         * already zero and there is no pending propagation, so it will be a
3435         * waste of time to try to decay it:
3436         */
3437        return true;
3438}
3439
3440#else /* CONFIG_FAIR_GROUP_SCHED */
3441
3442static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3443
3444static inline int propagate_entity_load_avg(struct sched_entity *se)
3445{
3446        return 0;
3447}
3448
3449static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
3450
3451#endif /* CONFIG_FAIR_GROUP_SCHED */
3452
3453/**
3454 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
3455 * @now: current time, as per cfs_rq_clock_pelt()
3456 * @cfs_rq: cfs_rq to update
3457 *
3458 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3459 * avg. The immediate corollary is that all (fair) tasks must be attached, see
3460 * post_init_entity_util_avg().
3461 *
3462 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
3463 *
3464 * Returns true if the load decayed or we removed load.
3465 *
3466 * Since both these conditions indicate a changed cfs_rq->avg.load we should
3467 * call update_tg_load_avg() when this function returns true.
3468 */
3469static inline int
3470update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3471{
3472        unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
3473        struct sched_avg *sa = &cfs_rq->avg;
3474        int decayed = 0;
3475
3476        if (cfs_rq->removed.nr) {
3477                unsigned long r;
3478                u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3479
3480                raw_spin_lock(&cfs_rq->removed.lock);
3481                swap(cfs_rq->removed.util_avg, removed_util);
3482                swap(cfs_rq->removed.load_avg, removed_load);
3483                swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
3484                cfs_rq->removed.nr = 0;
3485                raw_spin_unlock(&cfs_rq->removed.lock);
3486
3487                r = removed_load;
3488                sub_positive(&sa->load_avg, r);
3489                sub_positive(&sa->load_sum, r * divider);
3490
3491                r = removed_util;
3492                sub_positive(&sa->util_avg, r);
3493                sub_positive(&sa->util_sum, r * divider);
3494
3495                add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
3496
3497                decayed = 1;
3498        }
3499
3500        decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
3501
3502#ifndef CONFIG_64BIT
3503        smp_wmb();
3504        cfs_rq->load_last_update_time_copy = sa->last_update_time;
3505#endif
3506
3507        if (decayed)
3508                cfs_rq_util_change(cfs_rq, 0);
3509
3510        return decayed;
3511}
3512
3513/**
3514 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
3515 * @cfs_rq: cfs_rq to attach to
3516 * @se: sched_entity to attach
3517 * @flags: migration hints
3518 *
3519 * Must call update_cfs_rq_load_avg() before this, since we rely on
3520 * cfs_rq->avg.last_update_time being current.
3521 */
3522static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3523{
3524        u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3525
3526        /*
3527         * When we attach the @se to the @cfs_rq, we must align the decay
3528         * window because without that, really weird and wonderful things can
3529         * happen.
3530         *
3531         * XXX illustrate
3532         */
3533        se->avg.last_update_time = cfs_rq->avg.last_update_time;
3534        se->avg.period_contrib = cfs_rq->avg.period_contrib;
3535
3536        /*
3537         * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
3538         * period_contrib. This isn't strictly correct, but since we're
3539         * entirely outside of the PELT hierarchy, nobody cares if we truncate
3540         * _sum a little.
3541         */
3542        se->avg.util_sum = se->avg.util_avg * divider;
3543
3544        se->avg.load_sum = divider;
3545        if (se_weight(se)) {
3546                se->avg.load_sum =
3547                        div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3548        }
3549
3550        se->avg.runnable_load_sum = se->avg.load_sum;
3551
3552        enqueue_load_avg(cfs_rq, se);
3553        cfs_rq->avg.util_avg += se->avg.util_avg;
3554        cfs_rq->avg.util_sum += se->avg.util_sum;
3555
3556        add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3557
3558        cfs_rq_util_change(cfs_rq, flags);
3559
3560        trace_pelt_cfs_tp(cfs_rq);
3561}
3562
3563/**
3564 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3565 * @cfs_rq: cfs_rq to detach from
3566 * @se: sched_entity to detach
3567 *
3568 * Must call update_cfs_rq_load_avg() before this, since we rely on
3569 * cfs_rq->avg.last_update_time being current.
3570 */
3571static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3572{
3573        dequeue_load_avg(cfs_rq, se);
3574        sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3575        sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3576
3577        add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3578
3579        cfs_rq_util_change(cfs_rq, 0);
3580
3581        trace_pelt_cfs_tp(cfs_rq);
3582}
3583
3584/*
3585 * Optional action to be done while updating the load average
3586 */
3587#define UPDATE_TG       0x1
3588#define SKIP_AGE_LOAD   0x2
3589#define DO_ATTACH       0x4
3590
3591/* Update task and its cfs_rq load average */
3592static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3593{
3594        u64 now = cfs_rq_clock_pelt(cfs_rq);
3595        int decayed;
3596
3597        /*
3598         * Track task load average for carrying it to new CPU after migrated, and
3599         * track group sched_entity load average for task_h_load calc in migration
3600         */
3601        if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3602                __update_load_avg_se(now, cfs_rq, se);
3603
3604        decayed  = update_cfs_rq_load_avg(now, cfs_rq);
3605        decayed |= propagate_entity_load_avg(se);
3606
3607        if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3608
3609                /*
3610                 * DO_ATTACH means we're here from enqueue_entity().
3611                 * !last_update_time means we've passed through
3612                 * migrate_task_rq_fair() indicating we migrated.
3613                 *
3614                 * IOW we're enqueueing a task on a new CPU.
3615                 */
3616                attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
3617                update_tg_load_avg(cfs_rq, 0);
3618
3619        } else if (decayed && (flags & UPDATE_TG))
3620                update_tg_load_avg(cfs_rq, 0);
3621}
3622
3623#ifndef CONFIG_64BIT
3624static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3625{
3626        u64 last_update_time_copy;
3627        u64 last_update_time;
3628
3629        do {
3630                last_update_time_copy = cfs_rq->load_last_update_time_copy;
3631                smp_rmb();
3632                last_update_time = cfs_rq->avg.last_update_time;
3633        } while (last_update_time != last_update_time_copy);
3634
3635        return last_update_time;
3636}
3637#else
3638static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3639{
3640        return cfs_rq->avg.last_update_time;
3641}
3642#endif
3643
3644/*
3645 * Synchronize entity load avg of dequeued entity without locking
3646 * the previous rq.
3647 */
3648static void sync_entity_load_avg(struct sched_entity *se)
3649{
3650        struct cfs_rq *cfs_rq = cfs_rq_of(se);
3651        u64 last_update_time;
3652
3653        last_update_time = cfs_rq_last_update_time(cfs_rq);
3654        __update_load_avg_blocked_se(last_update_time, se);
3655}
3656
3657/*
3658 * Task first catches up with cfs_rq, and then subtract
3659 * itself from the cfs_rq (task must be off the queue now).
3660 */
3661static void remove_entity_load_avg(struct sched_entity *se)
3662{
3663        struct cfs_rq *cfs_rq = cfs_rq_of(se);
3664        unsigned long flags;
3665
3666        /*
3667         * tasks cannot exit without having gone through wake_up_new_task() ->
3668         * post_init_entity_util_avg() which will have added things to the
3669         * cfs_rq, so we can remove unconditionally.
3670         */
3671
3672        sync_entity_load_avg(se);
3673
3674        raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3675        ++cfs_rq->removed.nr;
3676        cfs_rq->removed.util_avg        += se->avg.util_avg;
3677        cfs_rq->removed.load_avg        += se->avg.load_avg;
3678        cfs_rq->removed.runnable_sum    += se->avg.load_sum; /* == runnable_sum */
3679        raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
3680}
3681
3682static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3683{
3684        return cfs_rq->avg.runnable_load_avg;
3685}
3686
3687static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3688{
3689        return cfs_rq->avg.load_avg;
3690}
3691
3692static inline unsigned long task_util(struct task_struct *p)
3693{
3694        return READ_ONCE(p->se.avg.util_avg);
3695}
3696
3697static inline unsigned long _task_util_est(struct task_struct *p)
3698{
3699        struct util_est ue = READ_ONCE(p->se.avg.util_est);
3700
3701        return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
3702}
3703
3704static inline unsigned long task_util_est(struct task_struct *p)
3705{
3706        return max(task_util(p), _task_util_est(p));
3707}
3708
3709static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3710                                    struct task_struct *p)
3711{
3712        unsigned int enqueued;
3713
3714        if (!sched_feat(UTIL_EST))
3715                return;
3716
3717        /* Update root cfs_rq's estimated utilization */
3718        enqueued  = cfs_rq->avg.util_est.enqueued;
3719        enqueued += _task_util_est(p);
3720        WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3721}
3722
3723/*
3724 * Check if a (signed) value is within a specified (unsigned) margin,
3725 * based on the observation that:
3726 *
3727 *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
3728 *
3729 * NOTE: this only works when value + maring < INT_MAX.
3730 */
3731static inline bool within_margin(int value, int margin)
3732{
3733        return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
3734}
3735
3736static void
3737util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
3738{
3739        long last_ewma_diff;
3740        struct util_est ue;
3741        int cpu;
3742
3743        if (!sched_feat(UTIL_EST))
3744                return;
3745
3746        /* Update root cfs_rq's estimated utilization */
3747        ue.enqueued  = cfs_rq->avg.util_est.enqueued;
3748        ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
3749        WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3750
3751        /*
3752         * Skip update of task's estimated utilization when the task has not
3753         * yet completed an activation, e.g. being migrated.
3754         */
3755        if (!task_sleep)
3756                return;
3757
3758        /*
3759         * If the PELT values haven't changed since enqueue time,
3760         * skip the util_est update.
3761         */
3762        ue = p->se.avg.util_est;
3763        if (ue.enqueued & UTIL_AVG_UNCHANGED)
3764                return;
3765
3766        /*
3767         * Skip update of task's estimated utilization when its EWMA is
3768         * already ~1% close to its last activation value.
3769         */
3770        ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
3771        last_ewma_diff = ue.enqueued - ue.ewma;
3772        if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
3773                return;
3774
3775        /*
3776         * To avoid overestimation of actual task utilization, skip updates if
3777         * we cannot grant there is idle time in this CPU.
3778         */
3779        cpu = cpu_of(rq_of(cfs_rq));
3780        if (task_util(p) > capacity_orig_of(cpu))
3781                return;
3782
3783        /*
3784         * Update Task's estimated utilization
3785         *
3786         * When *p completes an activation we can consolidate another sample
3787         * of the task size. This is done by storing the current PELT value
3788         * as ue.enqueued and by using this value to update the Exponential
3789         * Weighted Moving Average (EWMA):
3790         *
3791         *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
3792         *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
3793         *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
3794         *          = w * (      last_ewma_diff            ) +     ewma(t-1)
3795         *          = w * (last_ewma_diff  +  ewma(t-1) / w)
3796         *
3797         * Where 'w' is the weight of new samples, which is configured to be
3798         * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
3799         */
3800        ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
3801        ue.ewma  += last_ewma_diff;
3802        ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
3803        WRITE_ONCE(p->se.avg.util_est, ue);
3804}
3805
3806static inline int task_fits_capacity(struct task_struct *p, long capacity)
3807{
3808        return fits_capacity(task_util_est(p), capacity);
3809}
3810
3811static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
3812{
3813        if (!static_branch_unlikely(&sched_asym_cpucapacity))
3814                return;
3815
3816        if (!p) {
3817                rq->misfit_task_load = 0;
3818                return;
3819        }
3820
3821        if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
3822                rq->misfit_task_load = 0;
3823                return;
3824        }
3825
3826        rq->misfit_task_load = task_h_load(p);
3827}
3828
3829#else /* CONFIG_SMP */
3830
3831#define UPDATE_TG       0x0
3832#define SKIP_AGE_LOAD   0x0
3833#define DO_ATTACH       0x0
3834
3835static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
3836{
3837        cfs_rq_util_change(cfs_rq, 0);
3838}
3839
3840static inline void remove_entity_load_avg(struct sched_entity *se) {}
3841
3842static inline void
3843attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
3844static inline void
3845detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3846
3847static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
3848{
3849        return 0;
3850}
3851
3852static inline void
3853util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
3854
3855static inline void
3856util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
3857                 bool task_sleep) {}
3858static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
3859
3860#endif /* CONFIG_SMP */
3861
3862static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3863{
3864#ifdef CONFIG_SCHED_DEBUG
3865        s64 d = se->vruntime - cfs_rq->min_vruntime;
3866
3867        if (d < 0)
3868                d = -d;
3869
3870        if (d > 3*sysctl_sched_latency)
3871                schedstat_inc(cfs_rq->nr_spread_over);
3872#endif
3873}
3874
3875static void
3876place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3877{
3878        u64 vruntime = cfs_rq->min_vruntime;
3879
3880        /*
3881         * The 'current' period is already promised to the current tasks,
3882         * however the extra weight of the new task will slow them down a
3883         * little, place the new task so that it fits in the slot that
3884         * stays open at the end.
3885         */
3886        if (initial && sched_feat(START_DEBIT))
3887                vruntime += sched_vslice(cfs_rq, se);
3888
3889        /* sleeps up to a single latency don't count. */
3890        if (!initial) {
3891                unsigned long thresh = sysctl_sched_latency;
3892
3893                /*
3894                 * Halve their sleep time's effect, to allow
3895                 * for a gentler effect of sleepers:
3896                 */
3897                if (sched_feat(GENTLE_FAIR_SLEEPERS))
3898                        thresh >>= 1;
3899
3900                vruntime -= thresh;
3901        }
3902
3903        /* ensure we never gain time by being placed backwards. */
3904        se->vruntime = max_vruntime(se->vruntime, vruntime);
3905}
3906
3907static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3908
3909static inline void check_schedstat_required(void)
3910{
3911#ifdef CONFIG_SCHEDSTATS
3912        if (schedstat_enabled())
3913                return;
3914
3915        /* Force schedstat enabled if a dependent tracepoint is active */
3916        if (trace_sched_stat_wait_enabled()    ||
3917                        trace_sched_stat_sleep_enabled()   ||
3918                        trace_sched_stat_iowait_enabled()  ||
3919                        trace_sched_stat_blocked_enabled() ||
3920                        trace_sched_stat_runtime_enabled())  {
3921                printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3922                             "stat_blocked and stat_runtime require the "
3923                             "kernel parameter schedstats=enable or "
3924                             "kernel.sched_schedstats=1\n");
3925        }
3926#endif
3927}
3928
3929
3930/*
3931 * MIGRATION
3932 *
3933 *      dequeue
3934 *        update_curr()
3935 *          update_min_vruntime()
3936 *        vruntime -= min_vruntime
3937 *
3938 *      enqueue
3939 *        update_curr()
3940 *          update_min_vruntime()
3941 *        vruntime += min_vruntime
3942 *
3943 * this way the vruntime transition between RQs is done when both
3944 * min_vruntime are up-to-date.
3945 *
3946 * WAKEUP (remote)
3947 *
3948 *      ->migrate_task_rq_fair() (p->state == TASK_WAKING)
3949 *        vruntime -= min_vruntime
3950 *
3951 *      enqueue
3952 *        update_curr()
3953 *          update_min_vruntime()
3954 *        vruntime += min_vruntime
3955 *
3956 * this way we don't have the most up-to-date min_vruntime on the originating
3957 * CPU and an up-to-date min_vruntime on the destination CPU.
3958 */
3959
3960static void
3961enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3962{
3963        bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
3964        bool curr = cfs_rq->curr == se;
3965
3966        /*
3967         * If we're the current task, we must renormalise before calling
3968         * update_curr().
3969         */
3970        if (renorm && curr)
3971                se->vruntime += cfs_rq->min_vruntime;
3972
3973        update_curr(cfs_rq);
3974
3975        /*
3976         * Otherwise, renormalise after, such that we're placed at the current
3977         * moment in time, instead of some random moment in the past. Being
3978         * placed in the past could significantly boost this task to the
3979         * fairness detriment of existing tasks.
3980         */
3981        if (renorm && !curr)
3982                se->vruntime += cfs_rq->min_vruntime;
3983
3984        /*
3985         * When enqueuing a sched_entity, we must:
3986         *   - Update loads to have both entity and cfs_rq synced with now.
3987         *   - Add its load to cfs_rq->runnable_avg
3988         *   - For group_entity, update its weight to reflect the new share of
3989         *     its group cfs_rq
3990         *   - Add its new weight to cfs_rq->load.weight
3991         */
3992        update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
3993        update_cfs_group(se);
3994        enqueue_runnable_load_avg(cfs_rq, se);
3995        account_entity_enqueue(cfs_rq, se);
3996
3997        if (flags & ENQUEUE_WAKEUP)
3998                place_entity(cfs_rq, se, 0);
3999
4000        check_schedstat_required();
4001        update_stats_enqueue(cfs_rq, se, flags);
4002        check_spread(cfs_rq, se);
4003        if (!curr)
4004                __enqueue_entity(cfs_rq, se);
4005        se->on_rq = 1;
4006
4007        if (cfs_rq->nr_running == 1) {
4008                list_add_leaf_cfs_rq(cfs_rq);
4009                check_enqueue_throttle(cfs_rq);
4010        }
4011}
4012
4013static void __clear_buddies_last(struct sched_entity *se)
4014{
4015        for_each_sched_entity(se) {
4016                struct cfs_rq *cfs_rq = cfs_rq_of(se);
4017                if (cfs_rq->last != se)
4018                        break;
4019
4020                cfs_rq->last = NULL;
4021        }
4022}
4023
4024static void __clear_buddies_next(struct sched_entity *se)
4025{
4026        for_each_sched_entity(se) {
4027                struct cfs_rq *cfs_rq = cfs_rq_of(se);
4028                if (cfs_rq->next != se)
4029                        break;
4030
4031                cfs_rq->next = NULL;
4032        }
4033}
4034
4035static void __clear_buddies_skip(struct sched_entity *se)
4036{
4037        for_each_sched_entity(se) {
4038                struct cfs_rq *cfs_rq = cfs_rq_of(se);
4039                if (cfs_rq->skip != se)
4040                        break;
4041
4042                cfs_rq->skip = NULL;
4043        }
4044}
4045
4046static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4047{
4048        if (cfs_rq->last == se)
4049                __clear_buddies_last(se);
4050
4051        if (cfs_rq->next == se)
4052                __clear_buddies_next(se);
4053
4054        if (cfs_rq->skip == se)
4055                __clear_buddies_skip(se);
4056}
4057
4058static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4059
4060static void
4061dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4062{
4063        /*
4064         * Update run-time statistics of the 'current'.
4065         */
4066        update_curr(cfs_rq);
4067
4068        /*
4069         * When dequeuing a sched_entity, we must:
4070         *   - Update loads to have both entity and cfs_rq synced with now.
4071         *   - Subtract its load from the cfs_rq->runnable_avg.
4072         *   - Subtract its previous weight from cfs_rq->load.weight.
4073         *   - For group entity, update its weight to reflect the new share
4074         *     of its group cfs_rq.
4075         */
4076        update_load_avg(cfs_rq, se, UPDATE_TG);
4077        dequeue_runnable_load_avg(cfs_rq, se);
4078
4079        update_stats_dequeue(cfs_rq, se, flags);
4080
4081        clear_buddies(cfs_rq, se);
4082
4083        if (se != cfs_rq->curr)
4084                __dequeue_entity(cfs_rq, se);
4085        se->on_rq = 0;
4086        account_entity_dequeue(cfs_rq, se);
4087
4088        /*
4089         * Normalize after update_curr(); which will also have moved
4090         * min_vruntime if @se is the one holding it back. But before doing
4091         * update_min_vruntime() again, which will discount @se's position and
4092         * can move min_vruntime forward still more.
4093         */
4094        if (!(flags & DEQUEUE_SLEEP))
4095                se->vruntime -= cfs_rq->min_vruntime;
4096
4097        /* return excess runtime on last dequeue */
4098        return_cfs_rq_runtime(cfs_rq);
4099
4100        update_cfs_group(se);
4101
4102        /*
4103         * Now advance min_vruntime if @se was the entity holding it back,
4104         * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
4105         * put back on, and if we advance min_vruntime, we'll be placed back
4106         * further than we started -- ie. we'll be penalized.
4107         */
4108        if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
4109                update_min_vruntime(cfs_rq);
4110}
4111
4112/*
4113 * Preempt the current task with a newly woken task if needed:
4114 */
4115static void
4116check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4117{
4118        unsigned long ideal_runtime, delta_exec;
4119        struct sched_entity *se;
4120        s64 delta;
4121
4122        ideal_runtime = sched_slice(cfs_rq, curr);
4123        delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4124        if (delta_exec > ideal_runtime) {
4125                resched_curr(rq_of(cfs_rq));
4126                /*
4127                 * The current task ran long enough, ensure it doesn't get
4128                 * re-elected due to buddy favours.
4129                 */
4130                clear_buddies(cfs_rq, curr);
4131                return;
4132        }
4133
4134        /*
4135         * Ensure that a task that missed wakeup preemption by a
4136         * narrow margin doesn't have to wait for a full slice.
4137         * This also mitigates buddy induced latencies under load.
4138         */
4139        if (delta_exec < sysctl_sched_min_granularity)
4140                return;
4141
4142        se = __pick_first_entity(cfs_rq);
4143        delta = curr->vruntime - se->vruntime;
4144
4145        if (delta < 0)
4146                return;
4147
4148        if (delta > ideal_runtime)
4149                resched_curr(rq_of(cfs_rq));
4150}
4151
4152static void
4153set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4154{
4155        /* 'current' is not kept within the tree. */
4156        if (se->on_rq) {
4157                /*
4158                 * Any task has to be enqueued before it get to execute on
4159                 * a CPU. So account for the time it spent waiting on the
4160                 * runqueue.
4161                 */
4162                update_stats_wait_end(cfs_rq, se);
4163                __dequeue_entity(cfs_rq, se);
4164                update_load_avg(cfs_rq, se, UPDATE_TG);
4165        }
4166
4167        update_stats_curr_start(cfs_rq, se);
4168        cfs_rq->curr = se;
4169
4170        /*
4171         * Track our maximum slice length, if the CPU's load is at
4172         * least twice that of our own weight (i.e. dont track it
4173         * when there are only lesser-weight tasks around):
4174         */
4175        if (schedstat_enabled() &&
4176            rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
4177                schedstat_set(se->statistics.slice_max,
4178                        max((u64)schedstat_val(se->statistics.slice_max),
4179                            se->sum_exec_runtime - se->prev_sum_exec_runtime));
4180        }
4181
4182        se->prev_sum_exec_runtime = se->sum_exec_runtime;
4183}
4184
4185static int
4186wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4187
4188/*
4189 * Pick the next process, keeping these things in mind, in this order:
4190 * 1) keep things fair between processes/task groups
4191 * 2) pick the "next" process, since someone really wants that to run
4192 * 3) pick the "last" process, for cache locality
4193 * 4) do not run the "skip" process, if something else is available
4194 */
4195static struct sched_entity *
4196pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4197{
4198        struct sched_entity *left = __pick_first_entity(cfs_rq);
4199        struct sched_entity *se;
4200
4201        /*
4202         * If curr is set we have to see if its left of the leftmost entity
4203         * still in the tree, provided there was anything in the tree at all.
4204         */
4205        if (!left || (curr && entity_before(curr, left)))
4206                left = curr;
4207
4208        se = left; /* ideally we run the leftmost entity */
4209
4210        /*
4211         * Avoid running the skip buddy, if running something else can
4212         * be done without getting too unfair.
4213         */
4214        if (cfs_rq->skip == se) {
4215                struct sched_entity *second;
4216
4217                if (se == curr) {
4218                        second = __pick_first_entity(cfs_rq);
4219                } else {
4220                        second = __pick_next_entity(se);
4221                        if (!second || (curr && entity_before(curr, second)))
4222                                second = curr;
4223                }
4224
4225                if (second && wakeup_preempt_entity(second, left) < 1)
4226                        se = second;
4227        }
4228
4229        /*
4230         * Prefer last buddy, try to return the CPU to a preempted task.
4231         */
4232        if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4233                se = cfs_rq->last;
4234
4235        /*
4236         * Someone really wants this to run. If it's not unfair, run it.
4237         */
4238        if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4239                se = cfs_rq->next;
4240
4241        clear_buddies(cfs_rq, se);
4242
4243        return se;
4244}
4245
4246static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4247
4248static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4249{
4250        /*
4251         * If still on the runqueue then deactivate_task()
4252         * was not called and update_curr() has to be done:
4253         */
4254        if (prev->on_rq)
4255                update_curr(cfs_rq);
4256
4257        /* throttle cfs_rqs exceeding runtime */
4258        check_cfs_rq_runtime(cfs_rq);
4259
4260        check_spread(cfs_rq, prev);
4261
4262        if (prev->on_rq) {
4263                update_stats_wait_start(cfs_rq, prev);
4264                /* Put 'current' back into the tree. */
4265                __enqueue_entity(cfs_rq, prev);
4266                /* in !on_rq case, update occurred at dequeue */
4267                update_load_avg(cfs_rq, prev, 0);
4268        }
4269        cfs_rq->curr = NULL;
4270}
4271
4272static void
4273entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4274{
4275        /*
4276         * Update run-time statistics of the 'current'.
4277         */
4278        update_curr(cfs_rq);
4279
4280        /*
4281         * Ensure that runnable average is periodically updated.
4282         */
4283        update_load_avg(cfs_rq, curr, UPDATE_TG);
4284        update_cfs_group(curr);
4285
4286#ifdef CONFIG_SCHED_HRTICK
4287        /*
4288         * queued ticks are scheduled to match the slice, so don't bother
4289         * validating it and just reschedule.
4290         */
4291        if (queued) {
4292                resched_curr(rq_of(cfs_rq));
4293                return;
4294        }
4295        /*
4296         * don't let the period tick interfere with the hrtick preemption
4297         */
4298        if (!sched_feat(DOUBLE_TICK) &&
4299                        hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4300                return;
4301#endif
4302
4303        if (cfs_rq->nr_running > 1)
4304                check_preempt_tick(cfs_rq, curr);
4305}
4306
4307
4308/**************************************************
4309 * CFS bandwidth control machinery
4310 */
4311
4312#ifdef CONFIG_CFS_BANDWIDTH
4313
4314#ifdef CONFIG_JUMP_LABEL
4315static struct static_key __cfs_bandwidth_used;
4316
4317static inline bool cfs_bandwidth_used(void)
4318{
4319        return static_key_false(&__cfs_bandwidth_used);
4320}
4321
4322void cfs_bandwidth_usage_inc(void)
4323{
4324        static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4325}
4326
4327void cfs_bandwidth_usage_dec(void)
4328{
4329        static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4330}
4331#else /* CONFIG_JUMP_LABEL */
4332static bool cfs_bandwidth_used(void)
4333{
4334        return true;
4335}
4336
4337void cfs_bandwidth_usage_inc(void) {}
4338void cfs_bandwidth_usage_dec(void) {}
4339#endif /* CONFIG_JUMP_LABEL */
4340
4341/*
4342 * default period for cfs group bandwidth.
4343 * default: 0.1s, units: nanoseconds
4344 */
4345static inline u64 default_cfs_period(void)
4346{
4347        return 100000000ULL;
4348}
4349
4350static inline u64 sched_cfs_bandwidth_slice(void)
4351{
4352        return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4353}
4354
4355/*
4356 * Replenish runtime according to assigned quota. We use sched_clock_cpu
4357 * directly instead of rq->clock to avoid adding additional synchronization
4358 * around rq->lock.
4359 *
4360 * requires cfs_b->lock
4361 */
4362void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4363{
4364        if (cfs_b->quota != RUNTIME_INF)
4365                cfs_b->runtime = cfs_b->quota;
4366}
4367
4368static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4369{
4370        return &tg->cfs_bandwidth;
4371}
4372
4373/* returns 0 on failure to allocate runtime */
4374static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4375{
4376        struct task_group *tg = cfs_rq->tg;
4377        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4378        u64 amount = 0, min_amount;
4379
4380        /* note: this is a positive sum as runtime_remaining <= 0 */
4381        min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4382
4383        raw_spin_lock(&cfs_b->lock);
4384        if (cfs_b->quota == RUNTIME_INF)
4385                amount = min_amount;
4386        else {
4387                start_cfs_bandwidth(cfs_b);
4388
4389                if (cfs_b->runtime > 0) {
4390                        amount = min(cfs_b->runtime, min_amount);
4391                        cfs_b->runtime -= amount;
4392                        cfs_b->idle = 0;
4393                }
4394        }
4395        raw_spin_unlock(&cfs_b->lock);
4396
4397        cfs_rq->runtime_remaining += amount;
4398
4399        return cfs_rq->runtime_remaining > 0;
4400}
4401
4402static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4403{
4404        /* dock delta_exec before expiring quota (as it could span periods) */
4405        cfs_rq->runtime_remaining -= delta_exec;
4406
4407        if (likely(cfs_rq->runtime_remaining > 0))
4408                return;
4409
4410        if (cfs_rq->throttled)
4411                return;
4412        /*
4413         * if we're unable to extend our runtime we resched so that the active
4414         * hierarchy can be throttled
4415         */
4416        if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4417                resched_curr(rq_of(cfs_rq));
4418}
4419
4420static __always_inline
4421void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4422{
4423        if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
4424                return;
4425
4426        __account_cfs_rq_runtime(cfs_rq, delta_exec);
4427}
4428
4429static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4430{
4431        return cfs_bandwidth_used() && cfs_rq->throttled;
4432}
4433
4434/* check whether cfs_rq, or any parent, is throttled */
4435static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4436{
4437        return cfs_bandwidth_used() && cfs_rq->throttle_count;
4438}
4439
4440/*
4441 * Ensure that neither of the group entities corresponding to src_cpu or
4442 * dest_cpu are members of a throttled hierarchy when performing group
4443 * load-balance operations.
4444 */
4445static inline int throttled_lb_pair(struct task_group *tg,
4446                                    int src_cpu, int dest_cpu)
4447{
4448        struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4449
4450        src_cfs_rq = tg->cfs_rq[src_cpu];
4451        dest_cfs_rq = tg->cfs_rq[dest_cpu];
4452
4453        return throttled_hierarchy(src_cfs_rq) ||
4454               throttled_hierarchy(dest_cfs_rq);
4455}
4456
4457static int tg_unthrottle_up(struct task_group *tg, void *data)
4458{
4459        struct rq *rq = data;
4460        struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4461
4462        cfs_rq->throttle_count--;
4463        if (!cfs_rq->throttle_count) {
4464                cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4465                                             cfs_rq->throttled_clock_task;
4466
4467                /* Add cfs_rq with already running entity in the list */
4468                if (cfs_rq->nr_running >= 1)
4469                        list_add_leaf_cfs_rq(cfs_rq);
4470        }
4471
4472        return 0;
4473}
4474
4475static int tg_throttle_down(struct task_group *tg, void *data)
4476{
4477        struct rq *rq = data;
4478        struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4479
4480        /* group is entering throttled state, stop time */
4481        if (!cfs_rq->throttle_count) {
4482                cfs_rq->throttled_clock_task = rq_clock_task(rq);
4483                list_del_leaf_cfs_rq(cfs_rq);
4484        }
4485        cfs_rq->throttle_count++;
4486
4487        return 0;
4488}
4489
4490static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4491{
4492        struct rq *rq = rq_of(cfs_rq);
4493        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4494        struct sched_entity *se;
4495        long task_delta, idle_task_delta, dequeue = 1;
4496        bool empty;
4497
4498        se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4499
4500        /* freeze hierarchy runnable averages while throttled */
4501        rcu_read_lock();
4502        walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4503        rcu_read_unlock();
4504
4505        task_delta = cfs_rq->h_nr_running;
4506        idle_task_delta = cfs_rq->idle_h_nr_running;
4507        for_each_sched_entity(se) {
4508                struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4509                /* throttled entity or throttle-on-deactivate */
4510                if (!se->on_rq)
4511                        break;
4512
4513                if (dequeue)
4514                        dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4515                qcfs_rq->h_nr_running -= task_delta;
4516                qcfs_rq->idle_h_nr_running -= idle_task_delta;
4517
4518                if (qcfs_rq->load.weight)
4519                        dequeue = 0;
4520        }
4521
4522        if (!se)
4523                sub_nr_running(rq, task_delta);
4524
4525        cfs_rq->throttled = 1;
4526        cfs_rq->throttled_clock = rq_clock(rq);
4527        raw_spin_lock(&cfs_b->lock);
4528        empty = list_empty(&cfs_b->throttled_cfs_rq);
4529
4530        /*
4531         * Add to the _head_ of the list, so that an already-started
4532         * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
4533         * not running add to the tail so that later runqueues don't get starved.
4534         */
4535        if (cfs_b->distribute_running)
4536                list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4537        else
4538                list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4539
4540        /*
4541         * If we're the first throttled task, make sure the bandwidth
4542         * timer is running.
4543         */
4544        if (empty)
4545                start_cfs_bandwidth(cfs_b);
4546
4547        raw_spin_unlock(&cfs_b->lock);
4548}
4549
4550void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4551{
4552        struct rq *rq = rq_of(cfs_rq);
4553        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4554        struct sched_entity *se;
4555        int enqueue = 1;
4556        long task_delta, idle_task_delta;
4557
4558        se = cfs_rq->tg->se[cpu_of(rq)];
4559
4560        cfs_rq->throttled = 0;
4561
4562        update_rq_clock(rq);
4563
4564        raw_spin_lock(&cfs_b->lock);
4565        cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
4566        list_del_rcu(&cfs_rq->throttled_list);
4567        raw_spin_unlock(&cfs_b->lock);
4568
4569        /* update hierarchical throttle state */
4570        walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4571
4572        if (!cfs_rq->load.weight)
4573                return;
4574
4575        task_delta = cfs_rq->h_nr_running;
4576        idle_task_delta = cfs_rq->idle_h_nr_running;
4577        for_each_sched_entity(se) {
4578                if (se->on_rq)
4579                        enqueue = 0;
4580
4581                cfs_rq = cfs_rq_of(se);
4582                if (enqueue)
4583                        enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4584                cfs_rq->h_nr_running += task_delta;
4585                cfs_rq->idle_h_nr_running += idle_task_delta;
4586
4587                if (cfs_rq_throttled(cfs_rq))
4588                        break;
4589        }
4590
4591        assert_list_leaf_cfs_rq(rq);
4592
4593        if (!se)
4594                add_nr_running(rq, task_delta);
4595
4596        /* Determine whether we need to wake up potentially idle CPU: */
4597        if (rq->curr == rq->idle && rq->cfs.nr_running)
4598                resched_curr(rq);
4599}
4600
4601static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
4602{
4603        struct cfs_rq *cfs_rq;
4604        u64 runtime;
4605        u64 starting_runtime = remaining;
4606
4607        rcu_read_lock();
4608        list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4609                                throttled_list) {
4610                struct rq *rq = rq_of(cfs_rq);
4611                struct rq_flags rf;
4612
4613                rq_lock_irqsave(rq, &rf);
4614                if (!cfs_rq_throttled(cfs_rq))
4615                        goto next;
4616
4617                /* By the above check, this should never be true */
4618                SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
4619
4620                runtime = -cfs_rq->runtime_remaining + 1;
4621                if (runtime > remaining)
4622                        runtime = remaining;
4623                remaining -= runtime;
4624
4625                cfs_rq->runtime_remaining += runtime;
4626
4627                /* we check whether we're throttled above */
4628                if (cfs_rq->runtime_remaining > 0)
4629                        unthrottle_cfs_rq(cfs_rq);
4630
4631next:
4632                rq_unlock_irqrestore(rq, &rf);
4633
4634                if (!remaining)
4635                        break;
4636        }
4637        rcu_read_unlock();
4638
4639        return starting_runtime - remaining;
4640}
4641
4642/*
4643 * Responsible for refilling a task_group's bandwidth and unthrottling its
4644 * cfs_rqs as appropriate. If there has been no activity within the last
4645 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
4646 * used to track this state.
4647 */
4648static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
4649{
4650        u64 runtime;
4651        int throttled;
4652
4653        /* no need to continue the timer with no bandwidth constraint */
4654        if (cfs_b->quota == RUNTIME_INF)
4655                goto out_deactivate;
4656
4657        throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4658        cfs_b->nr_periods += overrun;
4659
4660        /*
4661         * idle depends on !throttled (for the case of a large deficit), and if
4662         * we're going inactive then everything else can be deferred
4663         */
4664        if (cfs_b->idle && !throttled)
4665                goto out_deactivate;
4666
4667        __refill_cfs_bandwidth_runtime(cfs_b);
4668
4669        if (!throttled) {
4670                /* mark as potentially idle for the upcoming period */
4671                cfs_b->idle = 1;
4672                return 0;
4673        }
4674
4675        /* account preceding periods in which throttling occurred */
4676        cfs_b->nr_throttled += overrun;
4677
4678        /*
4679         * This check is repeated as we are holding onto the new bandwidth while
4680         * we unthrottle. This can potentially race with an unthrottled group
4681         * trying to acquire new bandwidth from the global pool. This can result
4682         * in us over-using our runtime if it is all used during this loop, but
4683         * only by limited amounts in that extreme case.
4684         */
4685        while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4686                runtime = cfs_b->runtime;
4687                cfs_b->distribute_running = 1;
4688                raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4689                /* we can't nest cfs_b->lock while distributing bandwidth */
4690                runtime = distribute_cfs_runtime(cfs_b, runtime);
4691                raw_spin_lock_irqsave(&cfs_b->lock, flags);
4692
4693                cfs_b->distribute_running = 0;
4694                throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4695
4696                lsub_positive(&cfs_b->runtime, runtime);
4697        }
4698
4699        /*
4700         * While we are ensured activity in the period following an
4701         * unthrottle, this also covers the case in which the new bandwidth is
4702         * insufficient to cover the existing bandwidth deficit.  (Forcing the
4703         * timer to remain active while there are any throttled entities.)
4704         */
4705        cfs_b->idle = 0;
4706
4707        return 0;
4708
4709out_deactivate:
4710        return 1;
4711}
4712
4713/* a cfs_rq won't donate quota below this amount */
4714static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4715/* minimum remaining period time to redistribute slack quota */
4716static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4717/* how long we wait to gather additional slack before distributing */
4718static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4719
4720/*
4721 * Are we near the end of the current quota period?
4722 *
4723 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
4724 * hrtimer base being cleared by hrtimer_start. In the case of
4725 * migrate_hrtimers, base is never cleared, so we are fine.
4726 */
4727static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4728{
4729        struct hrtimer *refresh_timer = &cfs_b->period_timer;
4730        u64 remaining;
4731
4732        /* if the call-back is running a quota refresh is already occurring */
4733        if (hrtimer_callback_running(refresh_timer))
4734                return 1;
4735
4736        /* is a quota refresh about to occur? */
4737        remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4738        if (remaining < min_expire)
4739                return 1;
4740
4741        return 0;
4742}
4743
4744static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4745{
4746        u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4747
4748        /* if there's a quota refresh soon don't bother with slack */
4749        if (runtime_refresh_within(cfs_b, min_left))
4750                return;
4751
4752        /* don't push forwards an existing deferred unthrottle */
4753        if (cfs_b->slack_started)
4754                return;
4755        cfs_b->slack_started = true;
4756
4757        hrtimer_start(&cfs_b->slack_timer,
4758                        ns_to_ktime(cfs_bandwidth_slack_period),
4759                        HRTIMER_MODE_REL);
4760}
4761
4762/* we know any runtime found here is valid as update_curr() precedes return */
4763static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4764{
4765        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4766        s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4767
4768        if (slack_runtime <= 0)
4769                return;
4770
4771        raw_spin_lock(&cfs_b->lock);
4772        if (cfs_b->quota != RUNTIME_INF) {
4773                cfs_b->runtime += slack_runtime;
4774
4775                /* we are under rq->lock, defer unthrottling using a timer */
4776                if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4777                    !list_empty(&cfs_b->throttled_cfs_rq))
4778                        start_cfs_slack_bandwidth(cfs_b);
4779        }
4780        raw_spin_unlock(&cfs_b->lock);
4781
4782        /* even if it's not valid for return we don't want to try again */
4783        cfs_rq->runtime_remaining -= slack_runtime;
4784}
4785
4786static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4787{
4788        if (!cfs_bandwidth_used())
4789                return;
4790
4791        if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
4792                return;
4793
4794        __return_cfs_rq_runtime(cfs_rq);
4795}
4796
4797/*
4798 * This is done with a timer (instead of inline with bandwidth return) since
4799 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
4800 */
4801static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4802{
4803        u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4804        unsigned long flags;
4805
4806        /* confirm we're still not at a refresh boundary */
4807        raw_spin_lock_irqsave(&cfs_b->lock, flags);
4808        cfs_b->slack_started = false;
4809        if (cfs_b->distribute_running) {
4810                raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4811                return;
4812        }
4813
4814        if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4815                raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4816                return;
4817        }
4818
4819        if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
4820                runtime = cfs_b->runtime;
4821
4822        if (runtime)
4823                cfs_b->distribute_running = 1;
4824
4825        raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4826
4827        if (!runtime)
4828                return;
4829
4830        runtime = distribute_cfs_runtime(cfs_b, runtime);
4831
4832        raw_spin_lock_irqsave(&cfs_b->lock, flags);
4833        lsub_positive(&cfs_b->runtime, runtime);
4834        cfs_b->distribute_running = 0;
4835        raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4836}
4837
4838/*
4839 * When a group wakes up we want to make sure that its quota is not already
4840 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4841 * runtime as update_curr() throttling can not not trigger until it's on-rq.
4842 */
4843static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4844{
4845        if (!cfs_bandwidth_used())
4846                return;
4847
4848        /* an active group must be handled by the update_curr()->put() path */
4849        if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4850                return;
4851
4852        /* ensure the group is not already throttled */
4853        if (cfs_rq_throttled(cfs_rq))
4854                return;
4855
4856        /* update runtime allocation */
4857        account_cfs_rq_runtime(cfs_rq, 0);
4858        if (cfs_rq->runtime_remaining <= 0)
4859                throttle_cfs_rq(cfs_rq);
4860}
4861
4862static void sync_throttle(struct task_group *tg, int cpu)
4863{
4864        struct cfs_rq *pcfs_rq, *cfs_rq;
4865
4866        if (!cfs_bandwidth_used())
4867                return;
4868
4869        if (!tg->parent)
4870                return;
4871
4872        cfs_rq = tg->cfs_rq[cpu];
4873        pcfs_rq = tg->parent->cfs_rq[cpu];
4874
4875        cfs_rq->throttle_count = pcfs_rq->throttle_count;
4876        cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
4877}
4878
4879/* conditionally throttle active cfs_rq's from put_prev_entity() */
4880static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4881{
4882        if (!cfs_bandwidth_used())
4883                return false;
4884
4885        if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4886                return false;
4887
4888        /*
4889         * it's possible for a throttled entity to be forced into a running
4890         * state (e.g. set_curr_task), in this case we're finished.
4891         */
4892        if (cfs_rq_throttled(cfs_rq))
4893                return true;
4894
4895        throttle_cfs_rq(cfs_rq);
4896        return true;
4897}
4898
4899static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4900{
4901        struct cfs_bandwidth *cfs_b =
4902                container_of(timer, struct cfs_bandwidth, slack_timer);
4903
4904        do_sched_cfs_slack_timer(cfs_b);
4905
4906        return HRTIMER_NORESTART;
4907}
4908
4909extern const u64 max_cfs_quota_period;
4910
4911static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4912{
4913        struct cfs_bandwidth *cfs_b =
4914                container_of(timer, struct cfs_bandwidth, period_timer);
4915        unsigned long flags;
4916        int overrun;
4917        int idle = 0;
4918        int count = 0;
4919
4920        raw_spin_lock_irqsave(&cfs_b->lock, flags);
4921        for (;;) {
4922                overrun = hrtimer_forward_now(timer, cfs_b->period);
4923                if (!overrun)
4924                        break;
4925
4926                if (++count > 3) {
4927                        u64 new, old = ktime_to_ns(cfs_b->period);
4928
4929                        /*
4930                         * Grow period by a factor of 2 to avoid losing precision.
4931                         * Precision loss in the quota/period ratio can cause __cfs_schedulable
4932                         * to fail.
4933                         */
4934                        new = old * 2;
4935                        if (new < max_cfs_quota_period) {
4936                                cfs_b->period = ns_to_ktime(new);
4937                                cfs_b->quota *= 2;
4938
4939                                pr_warn_ratelimited(
4940        "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
4941                                        smp_processor_id(),
4942                                        div_u64(new, NSEC_PER_USEC),
4943                                        div_u64(cfs_b->quota, NSEC_PER_USEC));
4944                        } else {
4945                                pr_warn_ratelimited(
4946        "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
4947                                        smp_processor_id(),
4948                                        div_u64(old, NSEC_PER_USEC),
4949                                        div_u64(cfs_b->quota, NSEC_PER_USEC));
4950                        }
4951
4952                        /* reset count so we don't come right back in here */
4953                        count = 0;
4954                }
4955
4956                idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
4957        }
4958        if (idle)
4959                cfs_b->period_active = 0;
4960        raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4961
4962        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4963}
4964
4965void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4966{
4967        raw_spin_lock_init(&cfs_b->lock);
4968        cfs_b->runtime = 0;
4969        cfs_b->quota = RUNTIME_INF;
4970        cfs_b->period = ns_to_ktime(default_cfs_period());
4971
4972        INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4973        hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
4974        cfs_b->period_timer.function = sched_cfs_period_timer;
4975        hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4976        cfs_b->slack_timer.function = sched_cfs_slack_timer;
4977        cfs_b->distribute_running = 0;
4978        cfs_b->slack_started = false;
4979}
4980
4981static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4982{
4983        cfs_rq->runtime_enabled = 0;
4984        INIT_LIST_HEAD(&cfs_rq->throttled_list);
4985}
4986
4987void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4988{
4989        lockdep_assert_held(&cfs_b->lock);
4990
4991        if (cfs_b->period_active)
4992                return;
4993
4994        cfs_b->period_active = 1;
4995        hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4996        hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
4997}
4998
4999static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5000{
5001        /* init_cfs_bandwidth() was not called */
5002        if (!cfs_b->throttled_cfs_rq.next)
5003                return;
5004
5005        hrtimer_cancel(&cfs_b->period_timer);
5006        hrtimer_cancel(&cfs_b->slack_timer);
5007}
5008
5009/*
5010 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
5011 *
5012 * The race is harmless, since modifying bandwidth settings of unhooked group
5013 * bits doesn't do much.
5014 */
5015
5016/* cpu online calback */
5017static void __maybe_unused update_runtime_enabled(struct rq *rq)
5018{
5019        struct task_group *tg;
5020
5021        lockdep_assert_held(&rq->lock);
5022
5023        rcu_read_lock();
5024        list_for_each_entry_rcu(tg, &task_groups, list) {
5025                struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
5026                struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5027
5028                raw_spin_lock(&cfs_b->lock);
5029                cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5030                raw_spin_unlock(&cfs_b->lock);
5031        }
5032        rcu_read_unlock();
5033}
5034
5035/* cpu offline callback */
5036static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5037{
5038        struct task_group *tg;
5039
5040        lockdep_assert_held(&rq->lock);
5041
5042        rcu_read_lock();
5043        list_for_each_entry_rcu(tg, &task_groups, list) {
5044                struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5045
5046                if (!cfs_rq->runtime_enabled)
5047                        continue;
5048
5049                /*
5050                 * clock_task is not advancing so we just need to make sure
5051                 * there's some valid quota amount
5052                 */
5053                cfs_rq->runtime_remaining = 1;
5054                /*
5055                 * Offline rq is schedulable till CPU is completely disabled
5056                 * in take_cpu_down(), so we prevent new cfs throttling here.
5057                 */
5058                cfs_rq->runtime_enabled = 0;
5059
5060                if (cfs_rq_throttled(cfs_rq))
5061                        unthrottle_cfs_rq(cfs_rq);
5062        }
5063        rcu_read_unlock();
5064}
5065
5066#else /* CONFIG_CFS_BANDWIDTH */
5067
5068static inline bool cfs_bandwidth_used(void)
5069{
5070        return false;
5071}
5072
5073static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
5074static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
5075static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
5076static inline void sync_throttle(struct task_group *tg, int cpu) {}
5077static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5078
5079static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5080{
5081        return 0;
5082}
5083
5084static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5085{
5086        return 0;
5087}
5088
5089static inline int throttled_lb_pair(struct task_group *tg,
5090                                    int src_cpu, int dest_cpu)
5091{
5092        return 0;
5093}
5094
5095void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5096
5097#ifdef CONFIG_FAIR_GROUP_SCHED
5098static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5099#endif
5100
5101static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5102{
5103        return NULL;
5104}
5105static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5106static inline void update_runtime_enabled(struct rq *rq) {}
5107static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5108
5109#endif /* CONFIG_CFS_BANDWIDTH */
5110
5111/**************************************************
5112 * CFS operations on tasks:
5113 */
5114
5115#ifdef CONFIG_SCHED_HRTICK
5116static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5117{
5118        struct sched_entity *se = &p->se;
5119        struct cfs_rq *cfs_rq = cfs_rq_of(se);
5120
5121        SCHED_WARN_ON(task_rq(p) != rq);
5122
5123        if (rq->cfs.h_nr_running > 1) {
5124                u64 slice = sched_slice(cfs_rq, se);
5125                u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5126                s64 delta = slice - ran;
5127
5128                if (delta < 0) {
5129                        if (rq->curr == p)
5130                                resched_curr(rq);
5131                        return;
5132                }
5133                hrtick_start(rq, delta);
5134        }
5135}
5136
5137/*
5138 * called from enqueue/dequeue and updates the hrtick when the
5139 * current task is from our class and nr_running is low enough
5140 * to matter.
5141 */
5142static void hrtick_update(struct rq *rq)
5143{
5144        struct task_struct *curr = rq->curr;
5145
5146        if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
5147                return;
5148
5149        if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5150                hrtick_start_fair(rq, curr);
5151}
5152#else /* !CONFIG_SCHED_HRTICK */
5153static inline void
5154hrtick_start_fair(struct rq *rq, struct task_struct *p)
5155{
5156}
5157
5158static inline void hrtick_update(struct rq *rq)
5159{
5160}
5161#endif
5162
5163#ifdef CONFIG_SMP
5164static inline unsigned long cpu_util(int cpu);
5165
5166static inline bool cpu_overutilized(int cpu)
5167{
5168        return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
5169}
5170
5171static inline void update_overutilized_status(struct rq *rq)
5172{
5173        if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
5174                WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5175                trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
5176        }
5177}
5178#else
5179static inline void update_overutilized_status(struct rq *rq) { }
5180#endif
5181
5182/*
5183 * The enqueue_task method is called before nr_running is
5184 * increased. Here we update the fair scheduling stats and
5185 * then put the task into the rbtree:
5186 */
5187static void
5188enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5189{
5190        struct cfs_rq *cfs_rq;
5191        struct sched_entity *se = &p->se;
5192        int idle_h_nr_running = task_has_idle_policy(p);
5193
5194        /*
5195         * The code below (indirectly) updates schedutil which looks at
5196         * the cfs_rq utilization to select a frequency.
5197         * Let's add the task's estimated utilization to the cfs_rq's
5198         * estimated utilization, before we update schedutil.
5199         */
5200        util_est_enqueue(&rq->cfs, p);
5201
5202        /*
5203         * If in_iowait is set, the code below may not trigger any cpufreq
5204         * utilization updates, so do it here explicitly with the IOWAIT flag
5205         * passed.
5206         */
5207        if (p->in_iowait)
5208                cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
5209
5210        for_each_sched_entity(se) {
5211                if (se->on_rq)
5212                        break;
5213                cfs_rq = cfs_rq_of(se);
5214                enqueue_entity(cfs_rq, se, flags);
5215
5216                /*
5217                 * end evaluation on encountering a throttled cfs_rq
5218                 *
5219                 * note: in the case of encountering a throttled cfs_rq we will
5220                 * post the final h_nr_running increment below.
5221                 */
5222                if (cfs_rq_throttled(cfs_rq))
5223                        break;
5224                cfs_rq->h_nr_running++;
5225                cfs_rq->idle_h_nr_running += idle_h_nr_running;
5226
5227                flags = ENQUEUE_WAKEUP;
5228        }
5229
5230        for_each_sched_entity(se) {
5231                cfs_rq = cfs_rq_of(se);
5232                cfs_rq->h_nr_running++;
5233                cfs_rq->idle_h_nr_running += idle_h_nr_running;
5234
5235                if (cfs_rq_throttled(cfs_rq))
5236                        break;
5237
5238                update_load_avg(cfs_rq, se, UPDATE_TG);
5239                update_cfs_group(se);
5240        }
5241
5242        if (!se) {
5243                add_nr_running(rq, 1);
5244                /*
5245                 * Since new tasks are assigned an initial util_avg equal to
5246                 * half of the spare capacity of their CPU, tiny tasks have the
5247                 * ability to cross the overutilized threshold, which will
5248                 * result in the load balancer ruining all the task placement
5249                 * done by EAS. As a way to mitigate that effect, do not account
5250                 * for the first enqueue operation of new tasks during the
5251                 * overutilized flag detection.
5252                 *
5253                 * A better way of solving this problem would be to wait for
5254                 * the PELT signals of tasks to converge before taking them
5255                 * into account, but that is not straightforward to implement,
5256                 * and the following generally works well enough in practice.
5257                 */
5258                if (flags & ENQUEUE_WAKEUP)
5259                        update_overutilized_status(rq);
5260
5261        }
5262
5263        if (cfs_bandwidth_used()) {
5264                /*
5265                 * When bandwidth control is enabled; the cfs_rq_throttled()
5266                 * breaks in the above iteration can result in incomplete
5267                 * leaf list maintenance, resulting in triggering the assertion
5268                 * below.
5269                 */
5270                for_each_sched_entity(se) {
5271                        cfs_rq = cfs_rq_of(se);
5272
5273                        if (list_add_leaf_cfs_rq(cfs_rq))
5274                                break;
5275                }
5276        }
5277
5278        assert_list_leaf_cfs_rq(rq);
5279
5280        hrtick_update(rq);
5281}
5282
5283static void set_next_buddy(struct sched_entity *se);
5284
5285/*
5286 * The dequeue_task method is called before nr_running is
5287 * decreased. We remove the task from the rbtree and
5288 * update the fair scheduling stats:
5289 */
5290static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5291{
5292        struct cfs_rq *cfs_rq;
5293        struct sched_entity *se = &p->se;
5294        int task_sleep = flags & DEQUEUE_SLEEP;
5295        int idle_h_nr_running = task_has_idle_policy(p);
5296
5297        for_each_sched_entity(se) {
5298                cfs_rq = cfs_rq_of(se);
5299                dequeue_entity(cfs_rq, se, flags);
5300
5301                /*
5302                 * end evaluation on encountering a throttled cfs_rq
5303                 *
5304                 * note: in the case of encountering a throttled cfs_rq we will
5305                 * post the final h_nr_running decrement below.
5306                */
5307                if (cfs_rq_throttled(cfs_rq))
5308                        break;
5309                cfs_rq->h_nr_running--;
5310                cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5311
5312                /* Don't dequeue parent if it has other entities besides us */
5313                if (cfs_rq->load.weight) {
5314                        /* Avoid re-evaluating load for this entity: */
5315                        se = parent_entity(se);
5316                        /*
5317                         * Bias pick_next to pick a task from this cfs_rq, as
5318                         * p is sleeping when it is within its sched_slice.
5319                         */
5320                        if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5321                                set_next_buddy(se);
5322                        break;
5323                }
5324                flags |= DEQUEUE_SLEEP;
5325        }
5326
5327        for_each_sched_entity(se) {
5328                cfs_rq = cfs_rq_of(se);
5329                cfs_rq->h_nr_running--;
5330                cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5331
5332                if (cfs_rq_throttled(cfs_rq))
5333                        break;
5334
5335                update_load_avg(cfs_rq, se, UPDATE_TG);
5336                update_cfs_group(se);
5337        }
5338
5339        if (!se)
5340                sub_nr_running(rq, 1);
5341
5342        util_est_dequeue(&rq->cfs, p, task_sleep);
5343        hrtick_update(rq);
5344}
5345
5346#ifdef CONFIG_SMP
5347
5348/* Working cpumask for: load_balance, load_balance_newidle. */
5349DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5350DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5351
5352#ifdef CONFIG_NO_HZ_COMMON
5353
5354static struct {
5355        cpumask_var_t idle_cpus_mask;
5356        atomic_t nr_cpus;
5357        int has_blocked;                /* Idle CPUS has blocked load */
5358        unsigned long next_balance;     /* in jiffy units */
5359        unsigned long next_blocked;     /* Next update of blocked load in jiffies */
5360} nohz ____cacheline_aligned;
5361
5362#endif /* CONFIG_NO_HZ_COMMON */
5363
5364/* CPU only has SCHED_IDLE tasks enqueued */
5365static int sched_idle_cpu(int cpu)
5366{
5367        struct rq *rq = cpu_rq(cpu);
5368
5369        return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5370                        rq->nr_running);
5371}
5372
5373static unsigned long cpu_runnable_load(struct rq *rq)
5374{
5375        return cfs_rq_runnable_load_avg(&rq->cfs);
5376}
5377
5378static unsigned long capacity_of(int cpu)
5379{
5380        return cpu_rq(cpu)->cpu_capacity;
5381}
5382
5383static unsigned long cpu_avg_load_per_task(int cpu)
5384{
5385        struct rq *rq = cpu_rq(cpu);
5386        unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5387        unsigned long load_avg = cpu_runnable_load(rq);
5388
5389        if (nr_running)
5390                return load_avg / nr_running;
5391
5392        return 0;
5393}
5394
5395static void record_wakee(struct task_struct *p)
5396{
5397        /*
5398         * Only decay a single time; tasks that have less then 1 wakeup per
5399         * jiffy will not have built up many flips.
5400         */
5401        if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5402                current->wakee_flips >>= 1;
5403                current->wakee_flip_decay_ts = jiffies;
5404        }
5405
5406        if (current->last_wakee != p) {
5407                current->last_wakee = p;
5408                current->wakee_flips++;
5409        }
5410}
5411
5412/*
5413 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
5414 *
5415 * A waker of many should wake a different task than the one last awakened
5416 * at a frequency roughly N times higher than one of its wakees.
5417 *
5418 * In order to determine whether we should let the load spread vs consolidating
5419 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
5420 * partner, and a factor of lls_size higher frequency in the other.
5421 *
5422 * With both conditions met, we can be relatively sure that the relationship is
5423 * non-monogamous, with partner count exceeding socket size.
5424 *
5425 * Waker/wakee being client/server, worker/dispatcher, interrupt source or
5426 * whatever is irrelevant, spread criteria is apparent partner count exceeds
5427 * socket size.
5428 */
5429static int wake_wide(struct task_struct *p)
5430{
5431        unsigned int master = current->wakee_flips;
5432        unsigned int slave = p->wakee_flips;
5433        int factor = this_cpu_read(sd_llc_size);
5434
5435        if (master < slave)
5436                swap(master, slave);
5437        if (slave < factor || master < slave * factor)
5438                return 0;
5439        return 1;
5440}
5441
5442/*
5443 * The purpose of wake_affine() is to quickly determine on which CPU we can run
5444 * soonest. For the purpose of speed we only consider the waking and previous
5445 * CPU.
5446 *
5447 * wake_affine_idle() - only considers 'now', it check if the waking CPU is
5448 *                      cache-affine and is (or will be) idle.
5449 *
5450 * wake_affine_weight() - considers the weight to reflect the average
5451 *                        scheduling latency of the CPUs. This seems to work
5452 *                        for the overloaded case.
5453 */
5454static int
5455wake_affine_idle(int this_cpu, int prev_cpu, int sync)
5456{
5457        /*
5458         * If this_cpu is idle, it implies the wakeup is from interrupt
5459         * context. Only allow the move if cache is shared. Otherwise an
5460         * interrupt intensive workload could force all tasks onto one
5461         * node depending on the IO topology or IRQ affinity settings.
5462         *
5463         * If the prev_cpu is idle and cache affine then avoid a migration.
5464         * There is no guarantee that the cache hot data from an interrupt
5465         * is more important than cache hot data on the prev_cpu and from
5466         * a cpufreq perspective, it's better to have higher utilisation
5467         * on one CPU.
5468         */
5469        if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
5470                return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
5471
5472        if (sync && cpu_rq(this_cpu)->nr_running == 1)
5473                return this_cpu;
5474
5475        return nr_cpumask_bits;
5476}
5477
5478static int
5479wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5480                   int this_cpu, int prev_cpu, int sync)
5481{
5482        s64 this_eff_load, prev_eff_load;
5483        unsigned long task_load;
5484
5485        this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
5486
5487        if (sync) {
5488                unsigned long current_load = task_h_load(current);
5489
5490                if (current_load > this_eff_load)
5491                        return this_cpu;
5492
5493                this_eff_load -= current_load;
5494        }
5495
5496        task_load = task_h_load(p);
5497
5498        this_eff_load += task_load;
5499        if (sched_feat(WA_BIAS))
5500                this_eff_load *= 100;
5501        this_eff_load *= capacity_of(prev_cpu);
5502
5503        prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
5504        prev_eff_load -= task_load;
5505        if (sched_feat(WA_BIAS))
5506                prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5507        prev_eff_load *= capacity_of(this_cpu);
5508
5509        /*
5510         * If sync, adjust the weight of prev_eff_load such that if
5511         * prev_eff == this_eff that select_idle_sibling() will consider
5512         * stacking the wakee on top of the waker if no other CPU is
5513         * idle.
5514         */
5515        if (sync)
5516                prev_eff_load += 1;
5517
5518        return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
5519}
5520
5521static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5522                       int this_cpu, int prev_cpu, int sync)
5523{
5524        int target = nr_cpumask_bits;
5525
5526        if (sched_feat(WA_IDLE))
5527                target = wake_affine_idle(this_cpu, prev_cpu, sync);
5528
5529        if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
5530                target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
5531
5532        schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5533        if (target == nr_cpumask_bits)
5534                return prev_cpu;
5535
5536        schedstat_inc(sd->ttwu_move_affine);
5537        schedstat_inc(p->se.statistics.nr_wakeups_affine);
5538        return target;
5539}
5540
5541static unsigned long cpu_util_without(int cpu, struct task_struct *p);
5542
5543static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
5544{
5545        return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
5546}
5547
5548/*
5549 * find_idlest_group finds and returns the least busy CPU group within the
5550 * domain.
5551 *
5552 * Assumes p is allowed on at least one CPU in sd.
5553 */
5554static struct sched_group *
5555find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5556                  int this_cpu, int sd_flag)
5557{
5558        struct sched_group *idlest = NULL, *group = sd->groups;
5559        struct sched_group *most_spare_sg = NULL;
5560        unsigned long min_runnable_load = ULONG_MAX;
5561        unsigned long this_runnable_load = ULONG_MAX;
5562        unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
5563        unsigned long most_spare = 0, this_spare = 0;
5564        int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
5565        unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
5566                                (sd->imbalance_pct-100) / 100;
5567
5568        do {
5569                unsigned long load, avg_load, runnable_load;
5570                unsigned long spare_cap, max_spare_cap;
5571                int local_group;
5572                int i;
5573
5574                /* Skip over this group if it has no CPUs allowed */
5575                if (!cpumask_intersects(sched_group_span(group),
5576                                        p->cpus_ptr))
5577                        continue;
5578
5579                local_group = cpumask_test_cpu(this_cpu,
5580                                               sched_group_span(group));
5581
5582                /*
5583                 * Tally up the load of all CPUs in the group and find
5584                 * the group containing the CPU with most spare capacity.
5585                 */
5586                avg_load = 0;
5587                runnable_load = 0;
5588                max_spare_cap = 0;
5589
5590                for_each_cpu(i, sched_group_span(group)) {
5591                        load = cpu_runnable_load(cpu_rq(i));
5592                        runnable_load += load;
5593
5594                        avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
5595
5596                        spare_cap = capacity_spare_without(i, p);
5597
5598                        if (spare_cap > max_spare_cap)
5599                                max_spare_cap = spare_cap;
5600                }
5601
5602                /* Adjust by relative CPU capacity of the group */
5603                avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
5604                                        group->sgc->capacity;
5605                runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
5606                                        group->sgc->capacity;
5607
5608                if (local_group) {
5609                        this_runnable_load = runnable_load;
5610                        this_avg_load = avg_load;
5611                        this_spare = max_spare_cap;
5612                } else {
5613                        if (min_runnable_load > (runnable_load + imbalance)) {
5614                                /*
5615                                 * The runnable load is significantly smaller
5616                                 * so we can pick this new CPU:
5617                                 */
5618                                min_runnable_load = runnable_load;
5619                                min_avg_load = avg_load;
5620                                idlest = group;
5621                        } else if ((runnable_load < (min_runnable_load + imbalance)) &&
5622                                   (100*min_avg_load > imbalance_scale*avg_load)) {
5623                                /*
5624                                 * The runnable loads are close so take the
5625                                 * blocked load into account through avg_load:
5626                                 */
5627                                min_avg_load = avg_load;
5628                                idlest = group;
5629                        }
5630
5631                        if (most_spare < max_spare_cap) {
5632                                most_spare = max_spare_cap;
5633                                most_spare_sg = group;
5634                        }
5635                }
5636        } while (group = group->next, group != sd->groups);
5637
5638        /*
5639         * The cross-over point between using spare capacity or least load
5640         * is too conservative for high utilization tasks on partially
5641         * utilized systems if we require spare_capacity > task_util(p),
5642         * so we allow for some task stuffing by using
5643         * spare_capacity > task_util(p)/2.
5644         *
5645         * Spare capacity can't be used for fork because the utilization has
5646         * not been set yet, we must first select a rq to compute the initial
5647         * utilization.
5648         */
5649        if (sd_flag & SD_BALANCE_FORK)
5650                goto skip_spare;
5651
5652        if (this_spare > task_util(p) / 2 &&
5653            imbalance_scale*this_spare > 100*most_spare)
5654                return NULL;
5655
5656        if (most_spare > task_util(p) / 2)
5657                return most_spare_sg;
5658
5659skip_spare:
5660        if (!idlest)
5661                return NULL;
5662
5663        /*
5664         * When comparing groups across NUMA domains, it's possible for the
5665         * local domain to be very lightly loaded relative to the remote
5666         * domains but "imbalance" skews the comparison making remote CPUs
5667         * look much more favourable. When considering cross-domain, add
5668         * imbalance to the runnable load on the remote node and consider
5669         * staying local.
5670         */
5671        if ((sd->flags & SD_NUMA) &&
5672            min_runnable_load + imbalance >= this_runnable_load)
5673                return NULL;
5674
5675        if (min_runnable_load > (this_runnable_load + imbalance))
5676                return NULL;
5677
5678        if ((this_runnable_load < (min_runnable_load + imbalance)) &&
5679             (100*this_avg_load < imbalance_scale*min_avg_load))
5680                return NULL;
5681
5682        return idlest;
5683}
5684
5685/*
5686 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
5687 */
5688static int
5689find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5690{
5691        unsigned long load, min_load = ULONG_MAX;
5692        unsigned int min_exit_latency = UINT_MAX;
5693        u64 latest_idle_timestamp = 0;
5694        int least_loaded_cpu = this_cpu;
5695        int shallowest_idle_cpu = -1, si_cpu = -1;
5696        int i;
5697
5698        /* Check if we have any choice: */
5699        if (group->group_weight == 1)
5700                return cpumask_first(sched_group_span(group));
5701
5702        /* Traverse only the allowed CPUs */
5703        for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
5704                if (available_idle_cpu(i)) {
5705                        struct rq *rq = cpu_rq(i);
5706                        struct cpuidle_state *idle = idle_get_state(rq);
5707                        if (idle && idle->exit_latency < min_exit_latency) {
5708                                /*
5709                                 * We give priority to a CPU whose idle state
5710                                 * has the smallest exit latency irrespective
5711                                 * of any idle timestamp.
5712                                 */
5713                                min_exit_latency = idle->exit_latency;
5714                                latest_idle_timestamp = rq->idle_stamp;
5715                                shallowest_idle_cpu = i;
5716                        } else if ((!idle || idle->exit_latency == min_exit_latency) &&
5717                                   rq->idle_stamp > latest_idle_timestamp) {
5718                                /*
5719                                 * If equal or no active idle state, then
5720                                 * the most recently idled CPU might have
5721                                 * a warmer cache.
5722                                 */
5723                                latest_idle_timestamp = rq->idle_stamp;
5724                                shallowest_idle_cpu = i;
5725                        }
5726                } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
5727                        if (sched_idle_cpu(i)) {
5728                                si_cpu = i;
5729                                continue;
5730                        }
5731
5732                        load = cpu_runnable_load(cpu_rq(i));
5733                        if (load < min_load) {
5734                                min_load = load;
5735                                least_loaded_cpu = i;
5736                        }
5737                }
5738        }
5739
5740        if (shallowest_idle_cpu != -1)
5741                return shallowest_idle_cpu;
5742        if (si_cpu != -1)
5743                return si_cpu;
5744        return least_loaded_cpu;
5745}
5746
5747static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
5748                                  int cpu, int prev_cpu, int sd_flag)
5749{
5750        int new_cpu = cpu;
5751
5752        if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
5753                return prev_cpu;
5754
5755        /*
5756         * We need task's util for capacity_spare_without, sync it up to
5757         * prev_cpu's last_update_time.
5758         */
5759        if (!(sd_flag & SD_BALANCE_FORK))
5760                sync_entity_load_avg(&p->se);
5761
5762        while (sd) {
5763                struct sched_group *group;
5764                struct sched_domain *tmp;
5765                int weight;
5766
5767                if (!(sd->flags & sd_flag)) {
5768                        sd = sd->child;
5769                        continue;
5770                }
5771
5772                group = find_idlest_group(sd, p, cpu, sd_flag);
5773                if (!group) {
5774                        sd = sd->child;
5775                        continue;
5776                }
5777
5778                new_cpu = find_idlest_group_cpu(group, p, cpu);
5779                if (new_cpu == cpu) {
5780                        /* Now try balancing at a lower domain level of 'cpu': */
5781                        sd = sd->child;
5782                        continue;
5783                }
5784
5785                /* Now try balancing at a lower domain level of 'new_cpu': */
5786                cpu = new_cpu;
5787                weight = sd->span_weight;
5788                sd = NULL;
5789                for_each_domain(cpu, tmp) {
5790                        if (weight <= tmp->span_weight)
5791                                break;
5792                        if (tmp->flags & sd_flag)
5793                                sd = tmp;
5794                }
5795        }
5796
5797        return new_cpu;
5798}
5799
5800#ifdef CONFIG_SCHED_SMT
5801DEFINE_STATIC_KEY_FALSE(sched_smt_present);
5802EXPORT_SYMBOL_GPL(sched_smt_present);
5803
5804static inline void set_idle_cores(int cpu, int val)
5805{
5806        struct sched_domain_shared *sds;
5807
5808        sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5809        if (sds)
5810                WRITE_ONCE(sds->has_idle_cores, val);
5811}
5812
5813static inline bool test_idle_cores(int cpu, bool def)
5814{
5815        struct sched_domain_shared *sds;
5816
5817        sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5818        if (sds)
5819                return READ_ONCE(sds->has_idle_cores);
5820
5821        return def;
5822}
5823
5824/*
5825 * Scans the local SMT mask to see if the entire core is idle, and records this
5826 * information in sd_llc_shared->has_idle_cores.
5827 *
5828 * Since SMT siblings share all cache levels, inspecting this limited remote
5829 * state should be fairly cheap.
5830 */
5831void __update_idle_core(struct rq *rq)
5832{
5833        int core = cpu_of(rq);
5834        int cpu;
5835
5836        rcu_read_lock();
5837        if (test_idle_cores(core, true))
5838                goto unlock;
5839
5840        for_each_cpu(cpu, cpu_smt_mask(core)) {
5841                if (cpu == core)
5842                        continue;
5843
5844                if (!available_idle_cpu(cpu))
5845                        goto unlock;
5846        }
5847
5848        set_idle_cores(core, 1);
5849unlock:
5850        rcu_read_unlock();
5851}
5852
5853/*
5854 * Scan the entire LLC domain for idle cores; this dynamically switches off if
5855 * there are no idle cores left in the system; tracked through
5856 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
5857 */
5858static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5859{
5860        struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
5861        int core, cpu;
5862
5863        if (!static_branch_likely(&sched_smt_present))
5864                return -1;
5865
5866        if (!test_idle_cores(target, fals