linux/kernel/sched/core.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  kernel/sched/core.c
   4 *
   5 *  Core kernel scheduler code and related syscalls
   6 *
   7 *  Copyright (C) 1991-2002  Linus Torvalds
   8 */
   9#define CREATE_TRACE_POINTS
  10#include <trace/events/sched.h>
  11#undef CREATE_TRACE_POINTS
  12
  13#include "sched.h"
  14
  15#include <linux/nospec.h>
  16
  17#include <linux/kcov.h>
  18#include <linux/scs.h>
  19
  20#include <asm/switch_to.h>
  21#include <asm/tlb.h>
  22
  23#include "../workqueue_internal.h"
  24#include "../../fs/io-wq.h"
  25#include "../smpboot.h"
  26
  27#include "pelt.h"
  28#include "smp.h"
  29
  30/*
   31 * Export tracepoints that act as a bare tracehook (i.e. have no trace event
   32 * associated with them) to allow external modules to probe them.
  33 */
  34EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
  35EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
  36EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
  37EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
  38EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
  39EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
  40EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
  41EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
  42EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
  43EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
  44
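/*
 * Editor's note -- an illustrative sketch (not part of this file) of how an
 * out-of-tree module can attach to one of the bare tracepoints exported
 * above.  It assumes the pelt_se_tp prototype is (struct sched_entity *se),
 * as declared via DECLARE_TRACE() in <trace/events/sched.h>; the module and
 * probe names are made up for the example.  Wrapped in #if 0 so it has no
 * effect here.
 */
#if 0	/* separate example module */
#include <linux/module.h>
#include <linux/tracepoint.h>
#include <trace/events/sched.h>

static atomic_long_t pelt_se_updates = ATOMIC_LONG_INIT(0);

/* Probes take the tracepoint arguments preceded by a private data pointer. */
static void probe_pelt_se(void *data, struct sched_entity *se)
{
	atomic_long_inc(&pelt_se_updates);
}

static int __init pelt_probe_init(void)
{
	return register_trace_pelt_se_tp(probe_pelt_se, NULL);
}

static void __exit pelt_probe_exit(void)
{
	unregister_trace_pelt_se_tp(probe_pelt_se, NULL);
	tracepoint_synchronize_unregister();
	pr_info("pelt_se updates seen: %ld\n", atomic_long_read(&pelt_se_updates));
}

module_init(pelt_probe_init);
module_exit(pelt_probe_exit);
MODULE_LICENSE("GPL");
#endif
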
  45DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
  46
  47#ifdef CONFIG_SCHED_DEBUG
  48/*
  49 * Debugging: various feature bits
  50 *
  51 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
   52 * sysctl_sched_features, defined in sched.h, to allow constant propagation
   53 * at compile time and compiler optimization based on the features' defaults.
  54 */
  55#define SCHED_FEAT(name, enabled)       \
  56        (1UL << __SCHED_FEAT_##name) * enabled |
  57const_debug unsigned int sysctl_sched_features =
  58#include "features.h"
  59        0;
  60#undef SCHED_FEAT
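
/*
 * Editor's note -- a sketch of how the construct above expands.  With
 * features.h containing, for example:
 *
 *	SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
 *	SCHED_FEAT(START_DEBIT, true)
 *	...
 *
 * the preprocessor produces a single OR-ed initializer:
 *
 *	const_debug unsigned int sysctl_sched_features =
 *		(1UL << __SCHED_FEAT_GENTLE_FAIR_SLEEPERS) * true |
 *		(1UL << __SCHED_FEAT_START_DEBIT) * true |
 *		...
 *		0;
 *
 * Each enabled feature contributes its bit, each disabled feature contributes
 * 0, and the trailing "0;" terminates the expression.
 */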
  61
  62/*
  63 * Print a warning if need_resched is set for the given duration (if
  64 * LATENCY_WARN is enabled).
  65 *
  66 * If sysctl_resched_latency_warn_once is set, only one warning will be shown
  67 * per boot.
  68 */
  69__read_mostly int sysctl_resched_latency_warn_ms = 100;
  70__read_mostly int sysctl_resched_latency_warn_once = 1;
  71#endif /* CONFIG_SCHED_DEBUG */
  72
  73/*
  74 * Number of tasks to iterate in a single balance run.
  75 * Limited because this is done with IRQs disabled.
  76 */
  77const_debug unsigned int sysctl_sched_nr_migrate = 32;
  78
  79/*
   80 * Period over which we measure -rt task CPU usage, in microseconds (us).
   81 * Default: 1s
  82 */
  83unsigned int sysctl_sched_rt_period = 1000000;
  84
  85__read_mostly int scheduler_running;
  86
  87#ifdef CONFIG_SCHED_CORE
  88
  89DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
  90
  91/* kernel prio, less is more */
  92static inline int __task_prio(struct task_struct *p)
  93{
  94        if (p->sched_class == &stop_sched_class) /* trumps deadline */
  95                return -2;
  96
  97        if (rt_prio(p->prio)) /* includes deadline */
  98                return p->prio; /* [-1, 99] */
  99
 100        if (p->sched_class == &idle_sched_class)
 101                return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
 102
  103        return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */
 104}
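
/*
 * Editor's note -- a worked example of the mapping above, assuming
 * MAX_RT_PRIO == 100, MAX_NICE == 19 and NICE_WIDTH == 40:
 *
 *	stop task                  -> -2
 *	deadline task              -> -1   (p->prio == -1)
 *	RT task, rt_priority 99    ->  0   (p->prio == 0, highest RT)
 *	RT task, rt_priority 1     -> 98
 *	any fair (CFS) task        -> 119  (nice level squashed away)
 *	idle task                  -> 140
 *
 * prio_less() below compares the negated values, hence "less is more".
 */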
 105
 106/*
 107 * l(a,b)
 108 * le(a,b) := !l(b,a)
 109 * g(a,b)  := l(b,a)
 110 * ge(a,b) := !l(a,b)
 111 */
 112
 113/* real prio, less is less */
 114static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
 115{
 116
 117        int pa = __task_prio(a), pb = __task_prio(b);
 118
 119        if (-pa < -pb)
 120                return true;
 121
 122        if (-pb < -pa)
 123                return false;
 124
 125        if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
 126                return !dl_time_before(a->dl.deadline, b->dl.deadline);
 127
 128        if (pa == MAX_RT_PRIO + MAX_NICE)       /* fair */
 129                return cfs_prio_less(a, b, in_fi);
 130
 131        return false;
 132}
 133
 134static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
 135{
 136        if (a->core_cookie < b->core_cookie)
 137                return true;
 138
 139        if (a->core_cookie > b->core_cookie)
 140                return false;
 141
 142        /* flip prio, so high prio is leftmost */
 143        if (prio_less(b, a, task_rq(a)->core->core_forceidle))
 144                return true;
 145
 146        return false;
 147}
 148
 149#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
 150
 151static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
 152{
 153        return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
 154}
 155
 156static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
 157{
 158        const struct task_struct *p = __node_2_sc(node);
 159        unsigned long cookie = (unsigned long)key;
 160
 161        if (cookie < p->core_cookie)
 162                return -1;
 163
 164        if (cookie > p->core_cookie)
 165                return 1;
 166
 167        return 0;
 168}
 169
 170void sched_core_enqueue(struct rq *rq, struct task_struct *p)
 171{
 172        rq->core->core_task_seq++;
 173
 174        if (!p->core_cookie)
 175                return;
 176
 177        rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
 178}
 179
 180void sched_core_dequeue(struct rq *rq, struct task_struct *p)
 181{
 182        rq->core->core_task_seq++;
 183
 184        if (!sched_core_enqueued(p))
 185                return;
 186
 187        rb_erase(&p->core_node, &rq->core_tree);
 188        RB_CLEAR_NODE(&p->core_node);
 189}
 190
 191/*
 192 * Find left-most (aka, highest priority) task matching @cookie.
 193 */
 194static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
 195{
 196        struct rb_node *node;
 197
 198        node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
 199        /*
 200         * The idle task always matches any cookie!
 201         */
 202        if (!node)
 203                return idle_sched_class.pick_task(rq);
 204
 205        return __node_2_sc(node);
 206}
 207
 208static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
 209{
 210        struct rb_node *node = &p->core_node;
 211
 212        node = rb_next(node);
 213        if (!node)
 214                return NULL;
 215
 216        p = container_of(node, struct task_struct, core_node);
 217        if (p->core_cookie != cookie)
 218                return NULL;
 219
 220        return p;
 221}
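
/*
 * Editor's note -- an illustrative sketch (not kernel code) of how the two
 * helpers above combine to visit every queued task on @rq carrying a given
 * non-zero cookie.  sched_core_for_each_cookie_task() and visit() are
 * hypothetical names; the rq lock must be held.
 */
static inline void sched_core_for_each_cookie_task(struct rq *rq,
						   unsigned long cookie,
						   void (*visit)(struct task_struct *p))
{
	struct task_struct *p;

	for (p = sched_core_find(rq, cookie);
	     p && p->core_cookie == cookie;
	     p = sched_core_next(p, cookie))
		visit(p);
}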
 222
 223/*
 224 * Magic required such that:
 225 *
 226 *      raw_spin_rq_lock(rq);
 227 *      ...
 228 *      raw_spin_rq_unlock(rq);
 229 *
 230 * ends up locking and unlocking the _same_ lock, and all CPUs
  231 * always agree on which lock belongs to which rq.
 232 *
 233 * XXX entirely possible to selectively enable cores, don't bother for now.
 234 */
 235
 236static DEFINE_MUTEX(sched_core_mutex);
 237static atomic_t sched_core_count;
 238static struct cpumask sched_core_mask;
 239
 240static void sched_core_lock(int cpu, unsigned long *flags)
 241{
 242        const struct cpumask *smt_mask = cpu_smt_mask(cpu);
 243        int t, i = 0;
 244
 245        local_irq_save(*flags);
 246        for_each_cpu(t, smt_mask)
 247                raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
 248}
 249
 250static void sched_core_unlock(int cpu, unsigned long *flags)
 251{
 252        const struct cpumask *smt_mask = cpu_smt_mask(cpu);
 253        int t;
 254
 255        for_each_cpu(t, smt_mask)
 256                raw_spin_unlock(&cpu_rq(t)->__lock);
 257        local_irq_restore(*flags);
 258}
 259
 260static void __sched_core_flip(bool enabled)
 261{
 262        unsigned long flags;
 263        int cpu, t;
 264
 265        cpus_read_lock();
 266
 267        /*
 268         * Toggle the online cores, one by one.
 269         */
 270        cpumask_copy(&sched_core_mask, cpu_online_mask);
 271        for_each_cpu(cpu, &sched_core_mask) {
 272                const struct cpumask *smt_mask = cpu_smt_mask(cpu);
 273
 274                sched_core_lock(cpu, &flags);
 275
 276                for_each_cpu(t, smt_mask)
 277                        cpu_rq(t)->core_enabled = enabled;
 278
 279                sched_core_unlock(cpu, &flags);
 280
 281                cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
 282        }
 283
 284        /*
 285         * Toggle the offline CPUs.
 286         */
 287        cpumask_copy(&sched_core_mask, cpu_possible_mask);
 288        cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
 289
 290        for_each_cpu(cpu, &sched_core_mask)
 291                cpu_rq(cpu)->core_enabled = enabled;
 292
 293        cpus_read_unlock();
 294}
 295
 296static void sched_core_assert_empty(void)
 297{
 298        int cpu;
 299
 300        for_each_possible_cpu(cpu)
 301                WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
 302}
 303
 304static void __sched_core_enable(void)
 305{
 306        static_branch_enable(&__sched_core_enabled);
 307        /*
 308         * Ensure all previous instances of raw_spin_rq_*lock() have finished
 309         * and future ones will observe !sched_core_disabled().
 310         */
 311        synchronize_rcu();
 312        __sched_core_flip(true);
 313        sched_core_assert_empty();
 314}
 315
 316static void __sched_core_disable(void)
 317{
 318        sched_core_assert_empty();
 319        __sched_core_flip(false);
 320        static_branch_disable(&__sched_core_enabled);
 321}
 322
 323void sched_core_get(void)
 324{
 325        if (atomic_inc_not_zero(&sched_core_count))
 326                return;
 327
 328        mutex_lock(&sched_core_mutex);
 329        if (!atomic_read(&sched_core_count))
 330                __sched_core_enable();
 331
 332        smp_mb__before_atomic();
 333        atomic_inc(&sched_core_count);
 334        mutex_unlock(&sched_core_mutex);
 335}
 336
 337static void __sched_core_put(struct work_struct *work)
 338{
 339        if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
 340                __sched_core_disable();
 341                mutex_unlock(&sched_core_mutex);
 342        }
 343}
 344
 345void sched_core_put(void)
 346{
 347        static DECLARE_WORK(_work, __sched_core_put);
 348
 349        /*
 350         * "There can be only one"
 351         *
 352         * Either this is the last one, or we don't actually need to do any
 353         * 'work'. If it is the last *again*, we rely on
 354         * WORK_STRUCT_PENDING_BIT.
 355         */
 356        if (!atomic_add_unless(&sched_core_count, -1, 1))
 357                schedule_work(&_work);
 358}
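
/*
 * Editor's note -- an illustrative userspace sketch (separate program, not
 * part of this file) of what ultimately drives sched_core_get() and
 * sched_core_put(): creating a core-scheduling cookie with
 * prctl(PR_SCHED_CORE).  It assumes the PR_SCHED_CORE* constants from the
 * uapi <linux/prctl.h>; error handling is trimmed.  Wrapped in #if 0 so it
 * has no effect here.
 */
#if 0	/* separate userspace example */
#include <sys/prctl.h>
#include <linux/prctl.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	/* Create a new cookie for the calling task (pid 0 == self, type 0 == PIDTYPE_PID). */
	if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, 0, 0))
		perror("PR_SCHED_CORE_CREATE");

	/*
	 * Children inherit the cookie across fork(); from now on this task
	 * and its children only share an SMT core with each other.
	 */
	if (fork() == 0) {
		execlp("sleep", "sleep", "60", (char *)NULL);
		_exit(1);
	}
	return 0;
}
#endif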
 359
 360#else /* !CONFIG_SCHED_CORE */
 361
 362static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
 363static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
 364
 365#endif /* CONFIG_SCHED_CORE */
 366
 367/*
  368 * Part of the period in which we allow RT tasks to run, in microseconds (us).
  369 * Default: 0.95s
 370 */
 371int sysctl_sched_rt_runtime = 950000;
 372
 373
 374/*
 375 * Serialization rules:
 376 *
 377 * Lock order:
 378 *
 379 *   p->pi_lock
 380 *     rq->lock
 381 *       hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
 382 *
 383 *  rq1->lock
 384 *    rq2->lock  where: rq1 < rq2
 385 *
 386 * Regular state:
 387 *
 388 * Normal scheduling state is serialized by rq->lock. __schedule() takes the
  389 * local CPU's rq->lock; it optionally removes the task from the runqueue and
 390 * always looks at the local rq data structures to find the most eligible task
 391 * to run next.
 392 *
 393 * Task enqueue is also under rq->lock, possibly taken from another CPU.
 394 * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
 395 * the local CPU to avoid bouncing the runqueue state around [ see
 396 * ttwu_queue_wakelist() ]
 397 *
  398 * Task wakeups, specifically wakeups that involve migration, are horribly
  399 * complicated in order to avoid having to take two rq->locks.
 400 *
 401 * Special state:
 402 *
 403 * System-calls and anything external will use task_rq_lock() which acquires
 404 * both p->pi_lock and rq->lock. As a consequence the state they change is
 405 * stable while holding either lock:
 406 *
 407 *  - sched_setaffinity()/
 408 *    set_cpus_allowed_ptr():   p->cpus_ptr, p->nr_cpus_allowed
 409 *  - set_user_nice():          p->se.load, p->*prio
 410 *  - __sched_setscheduler():   p->sched_class, p->policy, p->*prio,
 411 *                              p->se.load, p->rt_priority,
 412 *                              p->dl.dl_{runtime, deadline, period, flags, bw, density}
 413 *  - sched_setnuma():          p->numa_preferred_nid
 414 *  - sched_move_task()/
 415 *    cpu_cgroup_fork():        p->sched_task_group
 416 *  - uclamp_update_active()    p->uclamp*
 417 *
 418 * p->state <- TASK_*:
 419 *
 420 *   is changed locklessly using set_current_state(), __set_current_state() or
 421 *   set_special_state(), see their respective comments, or by
  422 *   try_to_wake_up(). The latter uses p->pi_lock to serialize against
  423 *   concurrent wakeups of the same task.
 424 *
 425 * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
 426 *
 427 *   is set by activate_task() and cleared by deactivate_task(), under
 428 *   rq->lock. Non-zero indicates the task is runnable, the special
 429 *   ON_RQ_MIGRATING state is used for migration without holding both
 430 *   rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
 431 *
 432 * p->on_cpu <- { 0, 1 }:
 433 *
 434 *   is set by prepare_task() and cleared by finish_task() such that it will be
 435 *   set before p is scheduled-in and cleared after p is scheduled-out, both
 436 *   under rq->lock. Non-zero indicates the task is running on its CPU.
 437 *
 438 *   [ The astute reader will observe that it is possible for two tasks on one
 439 *     CPU to have ->on_cpu = 1 at the same time. ]
 440 *
 441 * task_cpu(p): is changed by set_task_cpu(), the rules are:
 442 *
 443 *  - Don't call set_task_cpu() on a blocked task:
 444 *
  445 *    We don't care what CPU we're not running on; this simplifies hotplug,
  446 *    as the CPU assignment of blocked tasks isn't required to be valid.
 447 *
 448 *  - for try_to_wake_up(), called under p->pi_lock:
 449 *
 450 *    This allows try_to_wake_up() to only take one rq->lock, see its comment.
 451 *
 452 *  - for migration called under rq->lock:
 453 *    [ see task_on_rq_migrating() in task_rq_lock() ]
 454 *
 455 *    o move_queued_task()
 456 *    o detach_task()
 457 *
 458 *  - for migration called under double_rq_lock():
 459 *
 460 *    o __migrate_swap_task()
 461 *    o push_rt_task() / pull_rt_task()
 462 *    o push_dl_task() / pull_dl_task()
 463 *    o dl_task_offline_migration()
 464 *
 465 */
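
/*
 * Editor's note -- a minimal sketch of the "special state" pattern described
 * above: external code stabilizes a task's scheduling state by taking both
 * p->pi_lock and rq->lock via task_rq_lock().  The helper name and the field
 * read here are examples only.
 */
static inline int sched_example_read_policy(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;
	int policy;

	rq = task_rq_lock(p, &rf);	/* acquires p->pi_lock and rq->lock */
	policy = p->policy;		/* stable while either lock is held */
	task_rq_unlock(rq, p, &rf);

	return policy;
}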
 466
 467void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
 468{
 469        raw_spinlock_t *lock;
 470
 471        /* Matches synchronize_rcu() in __sched_core_enable() */
 472        preempt_disable();
 473        if (sched_core_disabled()) {
 474                raw_spin_lock_nested(&rq->__lock, subclass);
 475                /* preempt_count *MUST* be > 1 */
 476                preempt_enable_no_resched();
 477                return;
 478        }
 479
 480        for (;;) {
 481                lock = __rq_lockp(rq);
 482                raw_spin_lock_nested(lock, subclass);
 483                if (likely(lock == __rq_lockp(rq))) {
 484                        /* preempt_count *MUST* be > 1 */
 485                        preempt_enable_no_resched();
 486                        return;
 487                }
 488                raw_spin_unlock(lock);
 489        }
 490}
 491
 492bool raw_spin_rq_trylock(struct rq *rq)
 493{
 494        raw_spinlock_t *lock;
 495        bool ret;
 496
 497        /* Matches synchronize_rcu() in __sched_core_enable() */
 498        preempt_disable();
 499        if (sched_core_disabled()) {
 500                ret = raw_spin_trylock(&rq->__lock);
 501                preempt_enable();
 502                return ret;
 503        }
 504
 505        for (;;) {
 506                lock = __rq_lockp(rq);
 507                ret = raw_spin_trylock(lock);
 508                if (!ret || (likely(lock == __rq_lockp(rq)))) {
 509                        preempt_enable();
 510                        return ret;
 511                }
 512                raw_spin_unlock(lock);
 513        }
 514}
 515
 516void raw_spin_rq_unlock(struct rq *rq)
 517{
 518        raw_spin_unlock(rq_lockp(rq));
 519}
 520
 521#ifdef CONFIG_SMP
 522/*
 523 * double_rq_lock - safely lock two runqueues
 524 */
 525void double_rq_lock(struct rq *rq1, struct rq *rq2)
 526{
 527        lockdep_assert_irqs_disabled();
 528
 529        if (rq_order_less(rq2, rq1))
 530                swap(rq1, rq2);
 531
 532        raw_spin_rq_lock(rq1);
 533        if (__rq_lockp(rq1) == __rq_lockp(rq2))
 534                return;
 535
 536        raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
 537}
 538#endif
 539
 540/*
 541 * __task_rq_lock - lock the rq @p resides on.
 542 */
 543struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 544        __acquires(rq->lock)
 545{
 546        struct rq *rq;
 547
 548        lockdep_assert_held(&p->pi_lock);
 549
 550        for (;;) {
 551                rq = task_rq(p);
 552                raw_spin_rq_lock(rq);
 553                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
 554                        rq_pin_lock(rq, rf);
 555                        return rq;
 556                }
 557                raw_spin_rq_unlock(rq);
 558
 559                while (unlikely(task_on_rq_migrating(p)))
 560                        cpu_relax();
 561        }
 562}
 563
 564/*
 565 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 566 */
 567struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 568        __acquires(p->pi_lock)
 569        __acquires(rq->lock)
 570{
 571        struct rq *rq;
 572
 573        for (;;) {
 574                raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
 575                rq = task_rq(p);
 576                raw_spin_rq_lock(rq);
 577                /*
 578                 *      move_queued_task()              task_rq_lock()
 579                 *
 580                 *      ACQUIRE (rq->lock)
 581                 *      [S] ->on_rq = MIGRATING         [L] rq = task_rq()
 582                 *      WMB (__set_task_cpu())          ACQUIRE (rq->lock);
 583                 *      [S] ->cpu = new_cpu             [L] task_rq()
 584                 *                                      [L] ->on_rq
 585                 *      RELEASE (rq->lock)
 586                 *
 587                 * If we observe the old CPU in task_rq_lock(), the acquire of
 588                 * the old rq->lock will fully serialize against the stores.
 589                 *
 590                 * If we observe the new CPU in task_rq_lock(), the address
 591                 * dependency headed by '[L] rq = task_rq()' and the acquire
 592                 * will pair with the WMB to ensure we then also see migrating.
 593                 */
 594                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
 595                        rq_pin_lock(rq, rf);
 596                        return rq;
 597                }
 598                raw_spin_rq_unlock(rq);
 599                raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 600
 601                while (unlikely(task_on_rq_migrating(p)))
 602                        cpu_relax();
 603        }
 604}
 605
 606/*
 607 * RQ-clock updating methods:
 608 */
 609
 610static void update_rq_clock_task(struct rq *rq, s64 delta)
 611{
 612/*
  613 * In theory, the compiler should just see 0 here and optimize out the
  614 * IRQ/steal-time accounting below. But I don't trust it...
 615 */
 616        s64 __maybe_unused steal = 0, irq_delta = 0;
 617
 618#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 619        irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 620
 621        /*
 622         * Since irq_time is only updated on {soft,}irq_exit, we might run into
 623         * this case when a previous update_rq_clock() happened inside a
 624         * {soft,}irq region.
 625         *
 626         * When this happens, we stop ->clock_task and only update the
 627         * prev_irq_time stamp to account for the part that fit, so that a next
 628         * update will consume the rest. This ensures ->clock_task is
 629         * monotonic.
 630         *
  631 * It does however cause some slight misattribution of {soft,}irq
 632         * time, a more accurate solution would be to update the irq_time using
 633         * the current rq->clock timestamp, except that would require using
 634         * atomic ops.
 635         */
 636        if (irq_delta > delta)
 637                irq_delta = delta;
 638
 639        rq->prev_irq_time += irq_delta;
 640        delta -= irq_delta;
 641#endif
 642#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 643        if (static_key_false((&paravirt_steal_rq_enabled))) {
 644                steal = paravirt_steal_clock(cpu_of(rq));
 645                steal -= rq->prev_steal_time_rq;
 646
 647                if (unlikely(steal > delta))
 648                        steal = delta;
 649
 650                rq->prev_steal_time_rq += steal;
 651                delta -= steal;
 652        }
 653#endif
 654
 655        rq->clock_task += delta;
 656
 657#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 658        if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
 659                update_irq_load_avg(rq, irq_delta + steal);
 660#endif
 661        update_rq_clock_pelt(rq, delta);
 662}
 663
 664void update_rq_clock(struct rq *rq)
 665{
 666        s64 delta;
 667
 668        lockdep_assert_rq_held(rq);
 669
 670        if (rq->clock_update_flags & RQCF_ACT_SKIP)
 671                return;
 672
 673#ifdef CONFIG_SCHED_DEBUG
 674        if (sched_feat(WARN_DOUBLE_CLOCK))
 675                SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
 676        rq->clock_update_flags |= RQCF_UPDATED;
 677#endif
 678
 679        delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
 680        if (delta < 0)
 681                return;
 682        rq->clock += delta;
 683        update_rq_clock_task(rq, delta);
 684}
 685
 686#ifdef CONFIG_SCHED_HRTICK
 687/*
 688 * Use HR-timers to deliver accurate preemption points.
 689 */
 690
 691static void hrtick_clear(struct rq *rq)
 692{
 693        if (hrtimer_active(&rq->hrtick_timer))
 694                hrtimer_cancel(&rq->hrtick_timer);
 695}
 696
 697/*
 698 * High-resolution timer tick.
 699 * Runs from hardirq context with interrupts disabled.
 700 */
 701static enum hrtimer_restart hrtick(struct hrtimer *timer)
 702{
 703        struct rq *rq = container_of(timer, struct rq, hrtick_timer);
 704        struct rq_flags rf;
 705
 706        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
 707
 708        rq_lock(rq, &rf);
 709        update_rq_clock(rq);
 710        rq->curr->sched_class->task_tick(rq, rq->curr, 1);
 711        rq_unlock(rq, &rf);
 712
 713        return HRTIMER_NORESTART;
 714}
 715
 716#ifdef CONFIG_SMP
 717
 718static void __hrtick_restart(struct rq *rq)
 719{
 720        struct hrtimer *timer = &rq->hrtick_timer;
 721        ktime_t time = rq->hrtick_time;
 722
 723        hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
 724}
 725
 726/*
 727 * called from hardirq (IPI) context
 728 */
 729static void __hrtick_start(void *arg)
 730{
 731        struct rq *rq = arg;
 732        struct rq_flags rf;
 733
 734        rq_lock(rq, &rf);
 735        __hrtick_restart(rq);
 736        rq_unlock(rq, &rf);
 737}
 738
 739/*
 740 * Called to set the hrtick timer state.
 741 *
 742 * called with rq->lock held and irqs disabled
 743 */
 744void hrtick_start(struct rq *rq, u64 delay)
 745{
 746        struct hrtimer *timer = &rq->hrtick_timer;
 747        s64 delta;
 748
 749        /*
 750         * Don't schedule slices shorter than 10000ns, that just
 751         * doesn't make sense and can cause timer DoS.
 752         */
 753        delta = max_t(s64, delay, 10000LL);
 754        rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
 755
 756        if (rq == this_rq())
 757                __hrtick_restart(rq);
 758        else
 759                smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
 760}
 761
 762#else
 763/*
 764 * Called to set the hrtick timer state.
 765 *
 766 * called with rq->lock held and irqs disabled
 767 */
 768void hrtick_start(struct rq *rq, u64 delay)
 769{
 770        /*
 771         * Don't schedule slices shorter than 10000ns, that just
 772         * doesn't make sense. Rely on vruntime for fairness.
 773         */
 774        delay = max_t(u64, delay, 10000LL);
 775        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
 776                      HRTIMER_MODE_REL_PINNED_HARD);
 777}
 778
 779#endif /* CONFIG_SMP */
 780
 781static void hrtick_rq_init(struct rq *rq)
 782{
 783#ifdef CONFIG_SMP
 784        INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
 785#endif
 786        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
 787        rq->hrtick_timer.function = hrtick;
 788}
 789#else   /* CONFIG_SCHED_HRTICK */
 790static inline void hrtick_clear(struct rq *rq)
 791{
 792}
 793
 794static inline void hrtick_rq_init(struct rq *rq)
 795{
 796}
 797#endif  /* CONFIG_SCHED_HRTICK */
 798
 799/*
  800 * cmpxchg-based fetch_or(), written as a macro so it works for different integer types
 801 */
 802#define fetch_or(ptr, mask)                                             \
 803        ({                                                              \
 804                typeof(ptr) _ptr = (ptr);                               \
 805                typeof(mask) _mask = (mask);                            \
 806                typeof(*_ptr) _old, _val = *_ptr;                       \
 807                                                                        \
 808                for (;;) {                                              \
 809                        _old = cmpxchg(_ptr, _val, _val | _mask);       \
 810                        if (_old == _val)                               \
 811                                break;                                  \
 812                        _val = _old;                                    \
 813                }                                                       \
 814        _old;                                                           \
 815})
 816
 817#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
 818/*
  819 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG;
  820 * this avoids any races with polling state changes and thereby avoids
 821 * spurious IPIs.
 822 */
 823static bool set_nr_and_not_polling(struct task_struct *p)
 824{
 825        struct thread_info *ti = task_thread_info(p);
 826        return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
 827}
 828
 829/*
 830 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 831 *
 832 * If this returns true, then the idle task promises to call
 833 * sched_ttwu_pending() and reschedule soon.
 834 */
 835static bool set_nr_if_polling(struct task_struct *p)
 836{
 837        struct thread_info *ti = task_thread_info(p);
 838        typeof(ti->flags) old, val = READ_ONCE(ti->flags);
 839
 840        for (;;) {
 841                if (!(val & _TIF_POLLING_NRFLAG))
 842                        return false;
 843                if (val & _TIF_NEED_RESCHED)
 844                        return true;
 845                old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
 846                if (old == val)
 847                        break;
 848                val = old;
 849        }
 850        return true;
 851}
 852
 853#else
 854static bool set_nr_and_not_polling(struct task_struct *p)
 855{
 856        set_tsk_need_resched(p);
 857        return true;
 858}
 859
 860#ifdef CONFIG_SMP
 861static bool set_nr_if_polling(struct task_struct *p)
 862{
 863        return false;
 864}
 865#endif
 866#endif
 867
 868static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
 869{
 870        struct wake_q_node *node = &task->wake_q;
 871
 872        /*
  873         * Atomically grab the task; if ->wake_q is already non-NULL, the task
  874         * is already queued (either by us or someone else) and will get the
 875         * wakeup due to that.
 876         *
 877         * In order to ensure that a pending wakeup will observe our pending
 878         * state, even in the failed case, an explicit smp_mb() must be used.
 879         */
 880        smp_mb__before_atomic();
 881        if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
 882                return false;
 883
 884        /*
 885         * The head is context local, there can be no concurrency.
 886         */
 887        *head->lastp = node;
 888        head->lastp = &node->next;
 889        return true;
 890}
 891
 892/**
 893 * wake_q_add() - queue a wakeup for 'later' waking.
 894 * @head: the wake_q_head to add @task to
 895 * @task: the task to queue for 'later' wakeup
 896 *
 897 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
  898 * same context, _HOWEVER_ this is not guaranteed; the wakeup can come
  899 * instantly.
  900 *
  901 * This function must be used as if it were wake_up_process(); IOW the task
 902 * must be ready to be woken at this location.
 903 */
 904void wake_q_add(struct wake_q_head *head, struct task_struct *task)
 905{
 906        if (__wake_q_add(head, task))
 907                get_task_struct(task);
 908}
 909
 910/**
 911 * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
 912 * @head: the wake_q_head to add @task to
 913 * @task: the task to queue for 'later' wakeup
 914 *
 915 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
  916 * same context, _HOWEVER_ this is not guaranteed; the wakeup can come
  917 * instantly.
  918 *
  919 * This function must be used as if it were wake_up_process(); IOW the task
  920 * must be ready to be woken at this location.
  921 *
  922 * This function is essentially a task-safe equivalent to wake_q_add(). Callers
  923 * that already hold a reference to @task can call the 'safe' version and trust
  924 * wake_q to do the right thing depending on whether or not the @task is already
 925 * queued for wakeup.
 926 */
 927void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
 928{
 929        if (!__wake_q_add(head, task))
 930                put_task_struct(task);
 931}
 932
 933void wake_up_q(struct wake_q_head *head)
 934{
 935        struct wake_q_node *node = head->first;
 936
 937        while (node != WAKE_Q_TAIL) {
 938                struct task_struct *task;
 939
 940                task = container_of(node, struct task_struct, wake_q);
 941                /* Task can safely be re-inserted now: */
 942                node = node->next;
 943                task->wake_q.next = NULL;
 944
 945                /*
 946                 * wake_up_process() executes a full barrier, which pairs with
 947                 * the queueing in wake_q_add() so as not to miss wakeups.
 948                 */
 949                wake_up_process(task);
 950                put_task_struct(task);
 951        }
 952}
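
/*
 * Editor's note -- an illustrative sketch of the intended wake_q usage
 * pattern: collect wakeups while holding a lock, then issue them after
 * dropping it.  struct example_waiter and wake_all_waiters_example() are
 * hypothetical stand-ins for a caller's own data structures.
 */
struct example_waiter {
	struct list_head	node;
	struct task_struct	*task;
};

static inline void wake_all_waiters_example(struct list_head *waiters,
					    spinlock_t *lock)
{
	DEFINE_WAKE_Q(wake_q);
	struct example_waiter *w, *tmp;

	spin_lock(lock);
	list_for_each_entry_safe(w, tmp, waiters, node) {
		list_del_init(&w->node);
		wake_q_add(&wake_q, w->task);	/* takes a task reference */
	}
	spin_unlock(lock);

	/* Issue the actual wakeups without holding the lock. */
	wake_up_q(&wake_q);
}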
 953
 954/*
 955 * resched_curr - mark rq's current task 'to be rescheduled now'.
 956 *
  957 * On UP this means the setting of the need_resched flag; on SMP it
 958 * might also involve a cross-CPU call to trigger the scheduler on
 959 * the target CPU.
 960 */
 961void resched_curr(struct rq *rq)
 962{
 963        struct task_struct *curr = rq->curr;
 964        int cpu;
 965
 966        lockdep_assert_rq_held(rq);
 967
 968        if (test_tsk_need_resched(curr))
 969                return;
 970
 971        cpu = cpu_of(rq);
 972
 973        if (cpu == smp_processor_id()) {
 974                set_tsk_need_resched(curr);
 975                set_preempt_need_resched();
 976                return;
 977        }
 978
 979        if (set_nr_and_not_polling(curr))
 980                smp_send_reschedule(cpu);
 981        else
 982                trace_sched_wake_idle_without_ipi(cpu);
 983}
 984
 985void resched_cpu(int cpu)
 986{
 987        struct rq *rq = cpu_rq(cpu);
 988        unsigned long flags;
 989
 990        raw_spin_rq_lock_irqsave(rq, flags);
 991        if (cpu_online(cpu) || cpu == smp_processor_id())
 992                resched_curr(rq);
 993        raw_spin_rq_unlock_irqrestore(rq, flags);
 994}
 995
 996#ifdef CONFIG_SMP
 997#ifdef CONFIG_NO_HZ_COMMON
 998/*
  999 * In the semi-idle case, use the nearest busy CPU for migrating timers
 1000 * from an idle CPU.  This is good for power savings.
 1001 *
 1002 * We don't do a similar optimization for a completely idle system, as
 1003 * selecting an idle CPU would add more delay to the timers than intended
 1004 * (as that CPU's timer base may not be up to date with respect to jiffies etc.).
1005 */
1006int get_nohz_timer_target(void)
1007{
1008        int i, cpu = smp_processor_id(), default_cpu = -1;
1009        struct sched_domain *sd;
1010        const struct cpumask *hk_mask;
1011
1012        if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
1013                if (!idle_cpu(cpu))
1014                        return cpu;
1015                default_cpu = cpu;
1016        }
1017
1018        hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
1019
1020        rcu_read_lock();
1021        for_each_domain(cpu, sd) {
1022                for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
1023                        if (cpu == i)
1024                                continue;
1025
1026                        if (!idle_cpu(i)) {
1027                                cpu = i;
1028                                goto unlock;
1029                        }
1030                }
1031        }
1032
1033        if (default_cpu == -1)
1034                default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
1035        cpu = default_cpu;
1036unlock:
1037        rcu_read_unlock();
1038        return cpu;
1039}
1040
1041/*
1042 * When add_timer_on() enqueues a timer into the timer wheel of an
1043 * idle CPU then this timer might expire before the next timer event
1044 * which is scheduled to wake up that CPU. In case of a completely
 1045 * idle system the next event might even be infinitely far into the
1046 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1047 * leaves the inner idle loop so the newly added timer is taken into
1048 * account when the CPU goes back to idle and evaluates the timer
1049 * wheel for the next timer event.
1050 */
1051static void wake_up_idle_cpu(int cpu)
1052{
1053        struct rq *rq = cpu_rq(cpu);
1054
1055        if (cpu == smp_processor_id())
1056                return;
1057
1058        if (set_nr_and_not_polling(rq->idle))
1059                smp_send_reschedule(cpu);
1060        else
1061                trace_sched_wake_idle_without_ipi(cpu);
1062}
1063
1064static bool wake_up_full_nohz_cpu(int cpu)
1065{
1066        /*
1067         * We just need the target to call irq_exit() and re-evaluate
1068         * the next tick. The nohz full kick at least implies that.
1069         * If needed we can still optimize that later with an
1070         * empty IRQ.
1071         */
1072        if (cpu_is_offline(cpu))
1073                return true;  /* Don't try to wake offline CPUs. */
1074        if (tick_nohz_full_cpu(cpu)) {
1075                if (cpu != smp_processor_id() ||
1076                    tick_nohz_tick_stopped())
1077                        tick_nohz_full_kick_cpu(cpu);
1078                return true;
1079        }
1080
1081        return false;
1082}
1083
1084/*
1085 * Wake up the specified CPU.  If the CPU is going offline, it is the
1086 * caller's responsibility to deal with the lost wakeup, for example,
1087 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
1088 */
1089void wake_up_nohz_cpu(int cpu)
1090{
1091        if (!wake_up_full_nohz_cpu(cpu))
1092                wake_up_idle_cpu(cpu);
1093}
1094
1095static void nohz_csd_func(void *info)
1096{
1097        struct rq *rq = info;
1098        int cpu = cpu_of(rq);
1099        unsigned int flags;
1100
1101        /*
1102         * Release the rq::nohz_csd.
1103         */
1104        flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
1105        WARN_ON(!(flags & NOHZ_KICK_MASK));
1106
1107        rq->idle_balance = idle_cpu(cpu);
1108        if (rq->idle_balance && !need_resched()) {
1109                rq->nohz_idle_balance = flags;
1110                raise_softirq_irqoff(SCHED_SOFTIRQ);
1111        }
1112}
1113
1114#endif /* CONFIG_NO_HZ_COMMON */
1115
1116#ifdef CONFIG_NO_HZ_FULL
1117bool sched_can_stop_tick(struct rq *rq)
1118{
1119        int fifo_nr_running;
1120
1121        /* Deadline tasks, even if single, need the tick */
1122        if (rq->dl.dl_nr_running)
1123                return false;
1124
1125        /*
 1126         * If there is more than one RR task, we need the tick to enforce the
1127         * actual RR behaviour.
1128         */
1129        if (rq->rt.rr_nr_running) {
1130                if (rq->rt.rr_nr_running == 1)
1131                        return true;
1132                else
1133                        return false;
1134        }
1135
1136        /*
 1137         * If there are no RR tasks but there are FIFO tasks, we can skip the
 1138         * tick: there is no forced preemption between FIFO tasks.
1139         */
1140        fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
1141        if (fifo_nr_running)
1142                return true;
1143
1144        /*
 1145         * If there are no DL, RR or FIFO tasks, there must only be CFS tasks left;
1146         * if there's more than one we need the tick for involuntary
1147         * preemption.
1148         */
1149        if (rq->nr_running > 1)
1150                return false;
1151
1152        return true;
1153}
1154#endif /* CONFIG_NO_HZ_FULL */
1155#endif /* CONFIG_SMP */
1156
1157#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1158                        (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1159/*
1160 * Iterate task_group tree rooted at *from, calling @down when first entering a
1161 * node and @up when leaving it for the final time.
1162 *
1163 * Caller must hold rcu_lock or sufficient equivalent.
1164 */
1165int walk_tg_tree_from(struct task_group *from,
1166                             tg_visitor down, tg_visitor up, void *data)
1167{
1168        struct task_group *parent, *child;
1169        int ret;
1170
1171        parent = from;
1172
1173down:
1174        ret = (*down)(parent, data);
1175        if (ret)
1176                goto out;
1177        list_for_each_entry_rcu(child, &parent->children, siblings) {
1178                parent = child;
1179                goto down;
1180
1181up:
1182                continue;
1183        }
1184        ret = (*up)(parent, data);
1185        if (ret || parent == from)
1186                goto out;
1187
1188        child = parent;
1189        parent = parent->parent;
1190        if (parent)
1191                goto up;
1192out:
1193        return ret;
1194}
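
/*
 * Editor's note -- an illustrative sketch of a walk_tg_tree_from() caller:
 * a hypothetical "down" visitor that counts task groups, paired with tg_nop()
 * (below) for the "up" direction.  Must run under rcu_read_lock().
 */
static inline int tg_count_one(struct task_group *tg, void *data)
{
	(*(int *)data)++;
	return 0;	/* a non-zero return would abort the walk */
}

static inline int count_task_groups_example(void)
{
	int count = 0;

	rcu_read_lock();
	walk_tg_tree_from(&root_task_group, tg_count_one, tg_nop, &count);
	rcu_read_unlock();

	return count;
}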
1195
1196int tg_nop(struct task_group *tg, void *data)
1197{
1198        return 0;
1199}
1200#endif
1201
1202static void set_load_weight(struct task_struct *p, bool update_load)
1203{
1204        int prio = p->static_prio - MAX_RT_PRIO;
1205        struct load_weight *load = &p->se.load;
1206
1207        /*
1208         * SCHED_IDLE tasks get minimal weight:
1209         */
1210        if (task_has_idle_policy(p)) {
1211                load->weight = scale_load(WEIGHT_IDLEPRIO);
1212                load->inv_weight = WMULT_IDLEPRIO;
1213                return;
1214        }
1215
1216        /*
1217         * SCHED_OTHER tasks have to update their load when changing their
1218         * weight
1219         */
1220        if (update_load && p->sched_class == &fair_sched_class) {
1221                reweight_task(p, prio);
1222        } else {
1223                load->weight = scale_load(sched_prio_to_weight[prio]);
1224                load->inv_weight = sched_prio_to_wmult[prio];
1225        }
1226}
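
/*
 * Editor's note -- a worked example of the mapping above, using the
 * sched_prio_to_weight[]/sched_prio_to_wmult[] tables (nice 0 corresponds to
 * index 20, i.e. static_prio 120 - MAX_RT_PRIO):
 *
 *	nice -20 -> weight 88761, inv_weight     48388
 *	nice   0 -> weight  1024, inv_weight   4194304
 *	nice +19 -> weight    15, inv_weight 286331153
 *
 * Each nice level scales the weight by roughly 1.25x, and SCHED_IDLE tasks
 * get the even smaller WEIGHT_IDLEPRIO (3) regardless of their nice value.
 */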
1227
1228#ifdef CONFIG_UCLAMP_TASK
1229/*
1230 * Serializes updates of utilization clamp values
1231 *
 1232 * User-space (the slow path) triggers utilization clamp value updates which
 1233 * can require updates to the scheduler's (fast path) data structures used to
1234 * support enqueue/dequeue operations.
1235 * While the per-CPU rq lock protects fast-path update operations, user-space
1236 * requests are serialized using a mutex to reduce the risk of conflicting
1237 * updates or API abuses.
1238 */
1239static DEFINE_MUTEX(uclamp_mutex);
1240
1241/* Max allowed minimum utilization */
1242unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
1243
1244/* Max allowed maximum utilization */
1245unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
1246
1247/*
1248 * By default RT tasks run at the maximum performance point/capacity of the
1249 * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
1250 * SCHED_CAPACITY_SCALE.
1251 *
1252 * This knob allows admins to change the default behavior when uclamp is being
 1253 * used. In battery-powered devices, particularly, running at the maximum
1254 * capacity and frequency will increase energy consumption and shorten the
1255 * battery life.
1256 *
 1257 * This knob only affects RT tasks whose uclamp_se->user_defined == false.
1258 *
1259 * This knob will not override the system default sched_util_clamp_min defined
1260 * above.
1261 */
1262unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
1263
 1264/* All clamps are required to be less than or equal to these values */
1265static struct uclamp_se uclamp_default[UCLAMP_CNT];
1266
1267/*
1268 * This static key is used to reduce the uclamp overhead in the fast path. It
1269 * primarily disables the call to uclamp_rq_{inc, dec}() in
1270 * enqueue/dequeue_task().
1271 *
1272 * This allows users to continue to enable uclamp in their kernel config with
1273 * minimum uclamp overhead in the fast path.
1274 *
1275 * As soon as userspace modifies any of the uclamp knobs, the static key is
 1276 * enabled, since we have actual users that make use of uclamp
1277 * functionality.
1278 *
1279 * The knobs that would enable this static key are:
1280 *
1281 *   * A task modifying its uclamp value with sched_setattr().
1282 *   * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
1283 *   * An admin modifying the cgroup cpu.uclamp.{min, max}
1284 */
1285DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
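
/*
 * Editor's note -- an illustrative userspace sketch (separate program, not
 * part of this file) of one of the knobs listed above: a task setting its
 * own clamps with sched_setattr(), which is what flips sched_uclamp_used on
 * first use.  set_uclamp() is a made-up helper; it assumes the sched_attr
 * layout and SCHED_FLAG_* constants from the uapi headers, and uses the raw
 * syscall since glibc provides no wrapper.  Wrapped in #if 0 so it has no
 * effect here.
 */
#if 0	/* separate userspace example */
#include <linux/sched.h>
#include <linux/sched/types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static int set_uclamp(unsigned int util_min, unsigned int util_max)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size		= sizeof(attr);
	attr.sched_flags	= SCHED_FLAG_KEEP_ALL |
				  SCHED_FLAG_UTIL_CLAMP_MIN |
				  SCHED_FLAG_UTIL_CLAMP_MAX;
	attr.sched_util_min	= util_min;	/* 0..SCHED_CAPACITY_SCALE (1024) */
	attr.sched_util_max	= util_max;	/* 0..SCHED_CAPACITY_SCALE (1024) */

	return syscall(SYS_sched_setattr, 0 /* self */, &attr, 0);
}
#endif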
1286
1287/* Integer rounded range for each bucket */
1288#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
1289
1290#define for_each_clamp_id(clamp_id) \
1291        for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
1292
1293static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
1294{
1295        return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
1296}
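
/*
 * Editor's note -- a worked example of the bucket arithmetic above, assuming
 * the default CONFIG_UCLAMP_BUCKETS_COUNT == 5 and SCHED_CAPACITY_SCALE ==
 * 1024, so UCLAMP_BUCKET_DELTA == DIV_ROUND_CLOSEST(1024, 5) == 205:
 *
 *	clamp_value    0 -> bucket 0   (covers   0..204)
 *	clamp_value  300 -> bucket 1   (covers 205..409)
 *	clamp_value  820 -> bucket 4   (covers 820..1024, the top bucket)
 *	clamp_value 1024 -> bucket 4   (capped by UCLAMP_BUCKETS - 1)
 *
 * The 1024-wide clamp range is thus split into UCLAMP_BUCKETS roughly equal
 * buckets, with the top bucket absorbing the rounding remainder.
 */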
1297
1298static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
1299{
1300        if (clamp_id == UCLAMP_MIN)
1301                return 0;
1302        return SCHED_CAPACITY_SCALE;
1303}
1304
1305static inline void uclamp_se_set(struct uclamp_se *uc_se,
1306                                 unsigned int value, bool user_defined)
1307{
1308        uc_se->value = value;
1309        uc_se->bucket_id = uclamp_bucket_id(value);
1310        uc_se->user_defined = user_defined;
1311}
1312
1313static inline unsigned int
1314uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
1315                  unsigned int clamp_value)
1316{
1317        /*
1318         * Avoid blocked utilization pushing up the frequency when we go
1319         * idle (which drops the max-clamp) by retaining the last known
1320         * max-clamp.
1321         */
1322        if (clamp_id == UCLAMP_MAX) {
1323                rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
1324                return clamp_value;
1325        }
1326
1327        return uclamp_none(UCLAMP_MIN);
1328}
1329
1330static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
1331                                     unsigned int clamp_value)
1332{
1333        /* Reset max-clamp retention only on idle exit */
1334        if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1335                return;
1336
1337        WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
1338}
1339
1340static inline
1341unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
1342                                   unsigned int clamp_value)
1343{
1344        struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
1345        int bucket_id = UCLAMP_BUCKETS - 1;
1346
1347        /*
1348         * Since both min and max clamps are max aggregated, find the
 1349         * topmost bucket with tasks in it.
1350         */
1351        for ( ; bucket_id >= 0; bucket_id--) {
1352                if (!bucket[bucket_id].tasks)
1353                        continue;
1354                return bucket[bucket_id].value;
1355        }
1356
1357        /* No tasks -- default clamp values */
1358        return uclamp_idle_value(rq, clamp_id, clamp_value);
1359}
1360
1361static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1362{
1363        unsigned int default_util_min;
1364        struct uclamp_se *uc_se;
1365
1366        lockdep_assert_held(&p->pi_lock);
1367
1368        uc_se = &p->uclamp_req[UCLAMP_MIN];
1369
1370        /* Only sync if user didn't override the default */
1371        if (uc_se->user_defined)
1372                return;
1373
1374        default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1375        uclamp_se_set(uc_se, default_util_min, false);
1376}
1377
1378static void uclamp_update_util_min_rt_default(struct task_struct *p)
1379{
1380        struct rq_flags rf;
1381        struct rq *rq;
1382
1383        if (!rt_task(p))
1384                return;
1385
1386        /* Protect updates to p->uclamp_* */
1387        rq = task_rq_lock(p, &rf);
1388        __uclamp_update_util_min_rt_default(p);
1389        task_rq_unlock(rq, p, &rf);
1390}
1391
1392static void uclamp_sync_util_min_rt_default(void)
1393{
1394        struct task_struct *g, *p;
1395
1396        /*
1397         * copy_process()                       sysctl_uclamp
1398         *                                        uclamp_min_rt = X;
1399         *   write_lock(&tasklist_lock)           read_lock(&tasklist_lock)
1400         *   // link thread                       smp_mb__after_spinlock()
1401         *   write_unlock(&tasklist_lock)         read_unlock(&tasklist_lock);
1402         *   sched_post_fork()                    for_each_process_thread()
1403         *     __uclamp_sync_rt()                   __uclamp_sync_rt()
1404         *
1405         * Ensures that either sched_post_fork() will observe the new
1406         * uclamp_min_rt or for_each_process_thread() will observe the new
1407         * task.
1408         */
1409        read_lock(&tasklist_lock);
1410        smp_mb__after_spinlock();
1411        read_unlock(&tasklist_lock);
1412
1413        rcu_read_lock();
1414        for_each_process_thread(g, p)
1415                uclamp_update_util_min_rt_default(p);
1416        rcu_read_unlock();
1417}
1418
1419static inline struct uclamp_se
1420uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
1421{
1422        /* Copy by value as we could modify it */
1423        struct uclamp_se uc_req = p->uclamp_req[clamp_id];
1424#ifdef CONFIG_UCLAMP_TASK_GROUP
1425        unsigned int tg_min, tg_max, value;
1426
1427        /*
 1428         * Tasks in autogroups or in the root task group will be
1429         * restricted by system defaults.
1430         */
1431        if (task_group_is_autogroup(task_group(p)))
1432                return uc_req;
1433        if (task_group(p) == &root_task_group)
1434                return uc_req;
1435
1436        tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
1437        tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
1438        value = uc_req.value;
1439        value = clamp(value, tg_min, tg_max);
1440        uclamp_se_set(&uc_req, value, false);
1441#endif
1442
1443        return uc_req;
1444}
1445
1446/*
1447 * The effective clamp bucket index of a task depends on, by increasing
1448 * priority:
1449 * - the task specific clamp value, when explicitly requested from userspace
 1450 * - the task group effective clamp value, for tasks neither in the root
 1451 *   group nor in an autogroup
1452 * - the system default clamp value, defined by the sysadmin
1453 */
1454static inline struct uclamp_se
1455uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
1456{
1457        struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
1458        struct uclamp_se uc_max = uclamp_default[clamp_id];
1459
1460        /* System default restrictions always apply */
1461        if (unlikely(uc_req.value > uc_max.value))
1462                return uc_max;
1463
1464        return uc_req;
1465}
1466
1467unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
1468{
1469        struct uclamp_se uc_eff;
1470
1471        /* Task currently refcounted: use back-annotated (effective) value */
1472        if (p->uclamp[clamp_id].active)
1473                return (unsigned long)p->uclamp[clamp_id].value;
1474
1475        uc_eff = uclamp_eff_get(p, clamp_id);
1476
1477        return (unsigned long)uc_eff.value;
1478}
1479
1480/*
1481 * When a task is enqueued on a rq, the clamp bucket currently defined by the
1482 * task's uclamp::bucket_id is refcounted on that rq. This also immediately
1483 * updates the rq's clamp value if required.
1484 *
 1485 * Tasks can have a task-specific value requested from user-space; within
 1486 * each bucket we track the maximum value of the tasks refcounted in it.
1487 * This "local max aggregation" allows to track the exact "requested" value
1488 * for each bucket when all its RUNNABLE tasks require the same clamp.
1489 */
1490static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
1491                                    enum uclamp_id clamp_id)
1492{
1493        struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1494        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1495        struct uclamp_bucket *bucket;
1496
1497        lockdep_assert_rq_held(rq);
1498
1499        /* Update task effective clamp */
1500        p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
1501
1502        bucket = &uc_rq->bucket[uc_se->bucket_id];
1503        bucket->tasks++;
1504        uc_se->active = true;
1505
1506        uclamp_idle_reset(rq, clamp_id, uc_se->value);
1507
1508        /*
1509         * Local max aggregation: rq buckets always track the max
1510         * "requested" clamp value of its RUNNABLE tasks.
1511         */
1512        if (bucket->tasks == 1 || uc_se->value > bucket->value)
1513                bucket->value = uc_se->value;
1514
1515        if (uc_se->value > READ_ONCE(uc_rq->value))
1516                WRITE_ONCE(uc_rq->value, uc_se->value);
1517}
1518
1519/*
1520 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
1521 * is released. If this is the last task reference counting the rq's max
1522 * active clamp value, then the rq's clamp value is updated.
1523 *
1524 * Both refcounted tasks and rq's cached clamp values are expected to be
1525 * always valid. If it's detected they are not, as defensive programming,
1526 * enforce the expected state and warn.
1527 */
1528static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
1529                                    enum uclamp_id clamp_id)
1530{
1531        struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1532        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1533        struct uclamp_bucket *bucket;
1534        unsigned int bkt_clamp;
1535        unsigned int rq_clamp;
1536
1537        lockdep_assert_rq_held(rq);
1538
1539        /*
1540         * If sched_uclamp_used was enabled after task @p was enqueued,
 1541         * we could end up with an unbalanced call to uclamp_rq_dec_id().
1542         *
1543         * In this case the uc_se->active flag should be false since no uclamp
1544         * accounting was performed at enqueue time and we can just return
1545         * here.
1546         *
 1547         * We also need to be careful with the following enqueue/dequeue
 1548         * ordering problem:
1549         *
1550         *      enqueue(taskA)
1551         *      // sched_uclamp_used gets enabled
1552         *      enqueue(taskB)
1553         *      dequeue(taskA)
1554         *      // Must not decrement bucket->tasks here
1555         *      dequeue(taskB)
1556         *
1557         * where we could end up with stale data in uc_se and
1558         * bucket[uc_se->bucket_id].
1559         *
 1560         * The following check eliminates the possibility of such a race.
1561         */
1562        if (unlikely(!uc_se->active))
1563                return;
1564
1565        bucket = &uc_rq->bucket[uc_se->bucket_id];
1566
1567        SCHED_WARN_ON(!bucket->tasks);
1568        if (likely(bucket->tasks))
1569                bucket->tasks--;
1570
1571        uc_se->active = false;
1572
1573        /*
1574         * Keep "local max aggregation" simple and accept to (possibly)
1575         * overboost some RUNNABLE tasks in the same bucket.
1576         * The rq clamp bucket value is reset to its base value whenever
1577         * there are no more RUNNABLE tasks refcounting it.
1578         */
1579        if (likely(bucket->tasks))
1580                return;
1581
1582        rq_clamp = READ_ONCE(uc_rq->value);
1583        /*
1584         * Defensive programming: this should never happen. If it happens,
1585         * e.g. due to future modification, warn and fixup the expected value.
1586         */
1587        SCHED_WARN_ON(bucket->value > rq_clamp);
1588        if (bucket->value >= rq_clamp) {
1589                bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
1590                WRITE_ONCE(uc_rq->value, bkt_clamp);
1591        }
1592}
1593
1594static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
1595{
1596        enum uclamp_id clamp_id;
1597
1598        /*
 1599         * Avoid any overhead until uclamp is actually used by userspace.
1600         *
1601         * The condition is constructed such that a NOP is generated when
1602         * sched_uclamp_used is disabled.
1603         */
1604        if (!static_branch_unlikely(&sched_uclamp_used))
1605                return;
1606
1607        if (unlikely(!p->sched_class->uclamp_enabled))
1608                return;
1609
1610        for_each_clamp_id(clamp_id)
1611                uclamp_rq_inc_id(rq, p, clamp_id);
1612
1613        /* Reset clamp idle holding when there is one RUNNABLE task */
1614        if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
1615                rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1616}
1617
1618static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
1619{
1620        enum uclamp_id clamp_id;
1621
1622        /*
 1623         * Avoid any overhead until uclamp is actually used by userspace.
1624         *
1625         * The condition is constructed such that a NOP is generated when
1626         * sched_uclamp_used is disabled.
1627         */
1628        if (!static_branch_unlikely(&sched_uclamp_used))
1629                return;
1630
1631        if (unlikely(!p->sched_class->uclamp_enabled))
1632                return;
1633
1634        for_each_clamp_id(clamp_id)
1635                uclamp_rq_dec_id(rq, p, clamp_id);
1636}
1637
1638static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
1639                                      enum uclamp_id clamp_id)
1640{
1641        if (!p->uclamp[clamp_id].active)
1642                return;
1643
1644        uclamp_rq_dec_id(rq, p, clamp_id);
1645        uclamp_rq_inc_id(rq, p, clamp_id);
1646
1647        /*
1648         * Make sure to clear the idle flag if we've transiently reached 0
1649         * active tasks on rq.
1650         */
1651        if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1652                rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1653}
1654
1655static inline void
1656uclamp_update_active(struct task_struct *p)
1657{
1658        enum uclamp_id clamp_id;
1659        struct rq_flags rf;
1660        struct rq *rq;
1661
1662        /*
1663         * Lock the task and the rq where the task is (or was) queued.
1664         *
1665         * We might lock the (previous) rq of a !RUNNABLE task, but that's the
1666         * price to pay to safely serialize util_{min,max} updates with
1667         * enqueues, dequeues and migration operations.
1668         * This is the same locking schema used by __set_cpus_allowed_ptr().
1669         */
1670        rq = task_rq_lock(p, &rf);
1671
1672        /*
1673         * Setting the clamp bucket is serialized by task_rq_lock().
1674         * If the task is not yet RUNNABLE and its task_struct is not
1675         * affecting a valid clamp bucket, the next time it's enqueued,
1676         * it will already see the updated clamp bucket value.
1677         */
1678        for_each_clamp_id(clamp_id)
1679                uclamp_rq_reinc_id(rq, p, clamp_id);
1680
1681        task_rq_unlock(rq, p, &rf);
1682}
1683
1684#ifdef CONFIG_UCLAMP_TASK_GROUP
1685static inline void
1686uclamp_update_active_tasks(struct cgroup_subsys_state *css)
1687{
1688        struct css_task_iter it;
1689        struct task_struct *p;
1690
1691        css_task_iter_start(css, 0, &it);
1692        while ((p = css_task_iter_next(&it)))
1693                uclamp_update_active(p);
1694        css_task_iter_end(&it);
1695}
1696
1697static void cpu_util_update_eff(struct cgroup_subsys_state *css);
1698static void uclamp_update_root_tg(void)
1699{
1700        struct task_group *tg = &root_task_group;
1701
1702        uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
1703                      sysctl_sched_uclamp_util_min, false);
1704        uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
1705                      sysctl_sched_uclamp_util_max, false);
1706
1707        rcu_read_lock();
1708        cpu_util_update_eff(&root_task_group.css);
1709        rcu_read_unlock();
1710}
1711#else
1712static void uclamp_update_root_tg(void) { }
1713#endif
1714
1715int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1716                                void *buffer, size_t *lenp, loff_t *ppos)
1717{
1718        bool update_root_tg = false;
1719        int old_min, old_max, old_min_rt;
1720        int result;
1721
1722        mutex_lock(&uclamp_mutex);
1723        old_min = sysctl_sched_uclamp_util_min;
1724        old_max = sysctl_sched_uclamp_util_max;
1725        old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
1726
1727        result = proc_dointvec(table, write, buffer, lenp, ppos);
1728        if (result)
1729                goto undo;
1730        if (!write)
1731                goto done;
1732
1733        if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1734            sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1735            sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1736
1737                result = -EINVAL;
1738                goto undo;
1739        }
1740
1741        if (old_min != sysctl_sched_uclamp_util_min) {
1742                uclamp_se_set(&uclamp_default[UCLAMP_MIN],
1743                              sysctl_sched_uclamp_util_min, false);
1744                update_root_tg = true;
1745        }
1746        if (old_max != sysctl_sched_uclamp_util_max) {
1747                uclamp_se_set(&uclamp_default[UCLAMP_MAX],
1748                              sysctl_sched_uclamp_util_max, false);
1749                update_root_tg = true;
1750        }
1751
1752        if (update_root_tg) {
1753                static_branch_enable(&sched_uclamp_used);
1754                uclamp_update_root_tg();
1755        }
1756
1757        if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1758                static_branch_enable(&sched_uclamp_used);
1759                uclamp_sync_util_min_rt_default();
1760        }
1761
1762        /*
1763         * We update all RUNNABLE tasks only when task groups are in use.
1764         * Otherwise, keep it simple and just do a lazy update at each
1765         * task's next enqueue.
1766         */
1767
1768        goto done;
1769
1770undo:
1771        sysctl_sched_uclamp_util_min = old_min;
1772        sysctl_sched_uclamp_util_max = old_max;
1773        sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
1774done:
1775        mutex_unlock(&uclamp_mutex);
1776
1777        return result;
1778}
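
/*
 * The handler above backs the util clamp sysctls; as an illustration, the
 * system-wide defaults can be tuned from userspace (values are in the
 * [0, SCHED_CAPACITY_SCALE] = [0, 1024] range):
 *
 *      echo 1024 > /proc/sys/kernel/sched_util_clamp_min
 *      echo  512 > /proc/sys/kernel/sched_util_clamp_max
 *      echo  128 > /proc/sys/kernel/sched_util_clamp_min_rt_default
 *
 * Changing any of these values also enables the sched_uclamp_used static
 * key, as per the code above.
 */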
1779
1780static int uclamp_validate(struct task_struct *p,
1781                           const struct sched_attr *attr)
1782{
1783        int util_min = p->uclamp_req[UCLAMP_MIN].value;
1784        int util_max = p->uclamp_req[UCLAMP_MAX].value;
1785
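        /*
         * attr->sched_util_{min,max} use -1 to mean "reset to default".
         * The "+ 1" in the range checks below lets -1 through (it becomes
         * 0) while still rejecting anything above SCHED_CAPACITY_SCALE.
         */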
1786        if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1787                util_min = attr->sched_util_min;
1788
1789                if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
1790                        return -EINVAL;
1791        }
1792
1793        if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1794                util_max = attr->sched_util_max;
1795
1796                if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
1797                        return -EINVAL;
1798        }
1799
1800        if (util_min != -1 && util_max != -1 && util_min > util_max)
1801                return -EINVAL;
1802
1803        /*
1804         * We have valid uclamp attributes; make sure uclamp is enabled.
1805         *
1806         * We need to do that here, because enabling static branches is a
1807         * blocking operation which obviously cannot be done while holding
1808         * scheduler locks.
1809         */
1810        static_branch_enable(&sched_uclamp_used);
1811
1812        return 0;
1813}
1814
1815static bool uclamp_reset(const struct sched_attr *attr,
1816                         enum uclamp_id clamp_id,
1817                         struct uclamp_se *uc_se)
1818{
1819        /* Reset on sched class change for a non user-defined clamp value. */
1820        if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
1821            !uc_se->user_defined)
1822                return true;
1823
1824        /* Reset on sched_util_{min,max} == -1. */
1825        if (clamp_id == UCLAMP_MIN &&
1826            attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1827            attr->sched_util_min == -1) {
1828                return true;
1829        }
1830
1831        if (clamp_id == UCLAMP_MAX &&
1832            attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1833            attr->sched_util_max == -1) {
1834                return true;
1835        }
1836
1837        return false;
1838}
1839
1840static void __setscheduler_uclamp(struct task_struct *p,
1841                                  const struct sched_attr *attr)
1842{
1843        enum uclamp_id clamp_id;
1844
1845        for_each_clamp_id(clamp_id) {
1846                struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1847                unsigned int value;
1848
1849                if (!uclamp_reset(attr, clamp_id, uc_se))
1850                        continue;
1851
1852                /*
1853                 * RT tasks by default have a 100% boost value that can be
1854                 * modified at runtime.
1855                 */
1856                if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1857                        value = sysctl_sched_uclamp_util_min_rt_default;
1858                else
1859                        value = uclamp_none(clamp_id);
1860
1861                uclamp_se_set(uc_se, value, false);
1862
1863        }
1864
1865        if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
1866                return;
1867
1868        if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1869            attr->sched_util_min != -1) {
1870                uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
1871                              attr->sched_util_min, true);
1872        }
1873
1874        if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1875            attr->sched_util_max != -1) {
1876                uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
1877                              attr->sched_util_max, true);
1878        }
1879}
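
/*
 * Per-task clamps arrive here via sched_setattr(2). A rough userspace
 * sketch (illustrative only; error handling omitted, raw syscall used
 * since C libraries typically do not wrap sched_setattr, struct sched_attr
 * as per <linux/sched/types.h>):
 *
 *      struct sched_attr attr = {
 *              .size           = sizeof(attr),
 *              .sched_flags    = SCHED_FLAG_KEEP_ALL |
 *                                SCHED_FLAG_UTIL_CLAMP_MIN |
 *                                SCHED_FLAG_UTIL_CLAMP_MAX,
 *              .sched_util_min = 256,
 *              .sched_util_max = 768,
 *      };
 *
 *      syscall(SYS_sched_setattr, pid, &attr, 0);
 */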
1880
1881static void uclamp_fork(struct task_struct *p)
1882{
1883        enum uclamp_id clamp_id;
1884
1885        /*
1886         * We don't need to hold task_rq_lock() when updating p->uclamp_* here
1887         * as the task is still at its early fork stages.
1888         */
1889        for_each_clamp_id(clamp_id)
1890                p->uclamp[clamp_id].active = false;
1891
1892        if (likely(!p->sched_reset_on_fork))
1893                return;
1894
1895        for_each_clamp_id(clamp_id) {
1896                uclamp_se_set(&p->uclamp_req[clamp_id],
1897                              uclamp_none(clamp_id), false);
1898        }
1899}
1900
1901static void uclamp_post_fork(struct task_struct *p)
1902{
1903        uclamp_update_util_min_rt_default(p);
1904}
1905
1906static void __init init_uclamp_rq(struct rq *rq)
1907{
1908        enum uclamp_id clamp_id;
1909        struct uclamp_rq *uc_rq = rq->uclamp;
1910
1911        for_each_clamp_id(clamp_id) {
1912                uc_rq[clamp_id] = (struct uclamp_rq) {
1913                        .value = uclamp_none(clamp_id)
1914                };
1915        }
1916
1917        rq->uclamp_flags = 0;
1918}
1919
1920static void __init init_uclamp(void)
1921{
1922        struct uclamp_se uc_max = {};
1923        enum uclamp_id clamp_id;
1924        int cpu;
1925
1926        for_each_possible_cpu(cpu)
1927                init_uclamp_rq(cpu_rq(cpu));
1928
1929        for_each_clamp_id(clamp_id) {
1930                uclamp_se_set(&init_task.uclamp_req[clamp_id],
1931                              uclamp_none(clamp_id), false);
1932        }
1933
1934        /* System defaults allow max clamp values for both indexes */
1935        uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
1936        for_each_clamp_id(clamp_id) {
1937                uclamp_default[clamp_id] = uc_max;
1938#ifdef CONFIG_UCLAMP_TASK_GROUP
1939                root_task_group.uclamp_req[clamp_id] = uc_max;
1940                root_task_group.uclamp[clamp_id] = uc_max;
1941#endif
1942        }
1943}
1944
1945#else /* CONFIG_UCLAMP_TASK */
1946static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
1947static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
1948static inline int uclamp_validate(struct task_struct *p,
1949                                  const struct sched_attr *attr)
1950{
1951        return -EOPNOTSUPP;
1952}
1953static void __setscheduler_uclamp(struct task_struct *p,
1954                                  const struct sched_attr *attr) { }
1955static inline void uclamp_fork(struct task_struct *p) { }
1956static inline void uclamp_post_fork(struct task_struct *p) { }
1957static inline void init_uclamp(void) { }
1958#endif /* CONFIG_UCLAMP_TASK */
1959
1960bool sched_task_on_rq(struct task_struct *p)
1961{
1962        return task_on_rq_queued(p);
1963}
1964
1965static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1966{
1967        if (!(flags & ENQUEUE_NOCLOCK))
1968                update_rq_clock(rq);
1969
1970        if (!(flags & ENQUEUE_RESTORE)) {
1971                sched_info_enqueue(rq, p);
1972                psi_enqueue(p, flags & ENQUEUE_WAKEUP);
1973        }
1974
1975        uclamp_rq_inc(rq, p);
1976        p->sched_class->enqueue_task(rq, p, flags);
1977
1978        if (sched_core_enabled(rq))
1979                sched_core_enqueue(rq, p);
1980}
1981
1982static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1983{
1984        if (sched_core_enabled(rq))
1985                sched_core_dequeue(rq, p);
1986
1987        if (!(flags & DEQUEUE_NOCLOCK))
1988                update_rq_clock(rq);
1989
1990        if (!(flags & DEQUEUE_SAVE)) {
1991                sched_info_dequeue(rq, p);
1992                psi_dequeue(p, flags & DEQUEUE_SLEEP);
1993        }
1994
1995        uclamp_rq_dec(rq, p);
1996        p->sched_class->dequeue_task(rq, p, flags);
1997}
1998
1999void activate_task(struct rq *rq, struct task_struct *p, int flags)
2000{
2001        enqueue_task(rq, p, flags);
2002
2003        p->on_rq = TASK_ON_RQ_QUEUED;
2004}
2005
2006void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
2007{
2008        p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
2009
2010        dequeue_task(rq, p, flags);
2011}
2012
2013static inline int __normal_prio(int policy, int rt_prio, int nice)
2014{
2015        int prio;
2016
2017        if (dl_policy(policy))
2018                prio = MAX_DL_PRIO - 1;
2019        else if (rt_policy(policy))
2020                prio = MAX_RT_PRIO - 1 - rt_prio;
2021        else
2022                prio = NICE_TO_PRIO(nice);
2023
2024        return prio;
2025}
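
/*
 * Worked examples for __normal_prio(), using MAX_RT_PRIO == 100,
 * MAX_DL_PRIO == 0 and NICE_TO_PRIO(n) == 120 + n (lower value means
 * higher priority):
 *
 *      SCHED_DEADLINE:                         prio = 0 - 1        = -1
 *      SCHED_FIFO/SCHED_RR, rt_priority 50:    prio = 100 - 1 - 50 = 49
 *      SCHED_NORMAL, nice 0:                   prio = 120 + 0      = 120
 *      SCHED_NORMAL, nice -20 / +19:           prio = 100 / 139
 */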
2026
2027/*
2028 * Calculate the expected normal priority: i.e. priority
2029 * without taking RT-inheritance into account. Might be
2030 * boosted by interactivity modifiers. Changes upon fork,
2031 * setprio syscalls, and whenever the interactivity
2032 * estimator recalculates.
2033 */
2034static inline int normal_prio(struct task_struct *p)
2035{
2036        return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
2037}
2038
2039/*
2040 * Calculate the current priority, i.e. the priority
2041 * taken into account by the scheduler. This value might
2042 * be boosted by RT tasks, or might be boosted by
2043 * interactivity modifiers. Will be RT if the task got
2044 * RT-boosted. If not then it returns p->normal_prio.
2045 */
2046static int effective_prio(struct task_struct *p)
2047{
2048        p->normal_prio = normal_prio(p);
2049        /*
2050         * If the task is RT or was boosted to RT priority,
2051         * keep the priority unchanged. Otherwise, update the priority
2052         * to the normal priority:
2053         */
2054        if (!rt_prio(p->prio))
2055                return p->normal_prio;
2056        return p->prio;
2057}
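
/*
 * For example, a SCHED_NORMAL nice-0 task has normal_prio == 120; if it is
 * PI-boosted into the RT range (say p->prio == 10 via rt_mutex_setprio()),
 * effective_prio() keeps returning the boosted 10 until the boost is
 * released, after which it falls back to normal_prio.
 */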
2058
2059/**
2060 * task_curr - is this task currently executing on a CPU?
2061 * @p: the task in question.
2062 *
2063 * Return: 1 if the task is currently executing. 0 otherwise.
2064 */
2065inline int task_curr(const struct task_struct *p)
2066{
2067        return cpu_curr(task_cpu(p)) == p;
2068}
2069
2070/*
2071 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
2072 * use the balance_callback list if you want balancing.
2073 *
2074 * This means any call to check_class_changed() must be followed by a call to
2075 * balance_callback().
2076 */
2077static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2078                                       const struct sched_class *prev_class,
2079                                       int oldprio)
2080{
2081        if (prev_class != p->sched_class) {
2082                if (prev_class->switched_from)
2083                        prev_class->switched_from(rq, p);
2084
2085                p->sched_class->switched_to(rq, p);
2086        } else if (oldprio != p->prio || dl_task(p))
2087                p->sched_class->prio_changed(rq, p, oldprio);
2088}
2089
2090void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2091{
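        /*
         * The sched_class descriptors are laid out by the linker in
         * ascending priority order, so the raw pointer comparison below is
         * effectively "does @p belong to a higher-priority class than
         * rq->curr?".
         */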
2092        if (p->sched_class == rq->curr->sched_class)
2093                rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2094        else if (p->sched_class > rq->curr->sched_class)
2095                resched_curr(rq);
2096
2097        /*
2098         * A queue event has occurred, and we're going to schedule.  In
2099         * this case, we can save a useless back to back clock update.
2100         */
2101        if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
2102                rq_clock_skip_update(rq);
2103}
2104
2105#ifdef CONFIG_SMP
2106
2107static void
2108__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
2109
2110static int __set_cpus_allowed_ptr(struct task_struct *p,
2111                                  const struct cpumask *new_mask,
2112                                  u32 flags);
2113
2114static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
2115{
2116        if (likely(!p->migration_disabled))
2117                return;
2118
2119        if (p->cpus_ptr != &p->cpus_mask)
2120                return;
2121
2122        /*
2123         * Violates locking rules! see comment in __do_set_cpus_allowed().
2124         */
2125        __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
2126}
2127
2128void migrate_disable(void)
2129{
2130        struct task_struct *p = current;
2131
2132        if (p->migration_disabled) {
2133                p->migration_disabled++;
2134                return;
2135        }
2136
2137        preempt_disable();
2138        this_rq()->nr_pinned++;
2139        p->migration_disabled = 1;
2140        preempt_enable();
2141}
2142EXPORT_SYMBOL_GPL(migrate_disable);
2143
2144void migrate_enable(void)
2145{
2146        struct task_struct *p = current;
2147
2148        if (p->migration_disabled > 1) {
2149                p->migration_disabled--;
2150                return;
2151        }
2152
2153        /*
2154         * Ensure stop_task runs either before or after this, and that
2155         * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
2156         */
2157        preempt_disable();
2158        if (p->cpus_ptr != &p->cpus_mask)
2159                __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
2160        /*
2161         * Mustn't clear migration_disabled() until cpus_ptr points back at the
2162         * regular cpus_mask, otherwise things that race (eg.
2163         * select_fallback_rq) get confused.
2164         */
2165        barrier();
2166        p->migration_disabled = 0;
2167        this_rq()->nr_pinned--;
2168        preempt_enable();
2169}
2170EXPORT_SYMBOL_GPL(migrate_enable);
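
/*
 * A typical use of the pair above is keeping a task on its current CPU
 * while it works on per-CPU data, without disabling preemption. A minimal
 * sketch (my_data being a hypothetical per-CPU variable):
 *
 *      migrate_disable();
 *      p = this_cpu_ptr(&my_data);
 *      ...                             // may be preempted, but not migrated
 *      migrate_enable();
 *
 * Unlike a preempt_disable() section, such a region may schedule, which is
 * why migrate_disable() is relied upon so heavily by PREEMPT_RT.
 */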
2171
2172static inline bool rq_has_pinned_tasks(struct rq *rq)
2173{
2174        return rq->nr_pinned;
2175}
2176
2177/*
2178 * Per-CPU kthreads are allowed to run on !active && online CPUs, see
2179 * __set_cpus_allowed_ptr() and select_fallback_rq().
2180 */
2181static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
2182{
2183        /* When not in the task's cpumask, no point in looking further. */
2184        if (!cpumask_test_cpu(cpu, p->cpus_ptr))
2185                return false;
2186
2187        /* migrate_disabled() must be allowed to finish. */
2188        if (is_migration_disabled(p))
2189                return cpu_online(cpu);
2190
2191        /* Non-kernel threads are not allowed while the CPU is going online or offline. */
2192        if (!(p->flags & PF_KTHREAD))
2193                return cpu_active(cpu) && task_cpu_possible(cpu, p);
2194
2195        /* KTHREAD_IS_PER_CPU is always allowed. */
2196        if (kthread_is_per_cpu(p))
2197                return cpu_online(cpu);
2198
2199        /* Regular kernel threads don't get to stay during offline. */
2200        if (cpu_dying(cpu))
2201                return false;
2202
2203        /* But are allowed during online. */
2204        return cpu_online(cpu);
2205}
2206
2207/*
2208 * This is how migration works:
2209 *
2210 * 1) we invoke migration_cpu_stop() on the target CPU using
2211 *    stop_one_cpu().
2212 * 2) stopper starts to run (implicitly forcing the migrated thread
2213 *    off the CPU)
2214 * 3) it checks whether the migrated task is still in the wrong runqueue.
2215 * 4) if it's in the wrong runqueue then the migration thread removes
2216 *    it and puts it into the right queue.
2217 * 5) stopper completes and stop_one_cpu() returns and the migration
2218 *    is done.
2219 */
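
/*
 * A minimal sketch of step 1, as done for instance on the exec balancing
 * path further down in this file (the affinity code below instead uses the
 * _nowait variant together with struct set_affinity_pending):
 *
 *      struct migration_arg arg = { .task = p, .dest_cpu = dest_cpu };
 *
 *      stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
 */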
2220
2221/*
2222 * move_queued_task - move a queued task to new rq.
2223 *
2224 * Returns (locked) new rq. Old rq's lock is released.
2225 */
2226static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
2227                                   struct task_struct *p, int new_cpu)
2228{
2229        lockdep_assert_rq_held(rq);
2230
2231        deactivate_task(rq, p, DEQUEUE_NOCLOCK);
2232        set_task_cpu(p, new_cpu);
2233        rq_unlock(rq, rf);
2234
2235        rq = cpu_rq(new_cpu);
2236
2237        rq_lock(rq, rf);
2238        BUG_ON(task_cpu(p) != new_cpu);
2239        activate_task(rq, p, 0);
2240        check_preempt_curr(rq, p, 0);
2241
2242        return rq;
2243}
2244
2245struct migration_arg {
2246        struct task_struct              *task;
2247        int                             dest_cpu;
2248        struct set_affinity_pending     *pending;
2249};
2250
2251/*
2252 * @refs: number of wait_for_completion()
2253 * @stop_pending: is @stop_work in use
2254 */
2255struct set_affinity_pending {
2256        refcount_t              refs;
2257        unsigned int            stop_pending;
2258        struct completion       done;
2259        struct cpu_stop_work    stop_work;
2260        struct migration_arg    arg;
2261};
2262
2263/*
2264 * Move a (non-current) task off this CPU, onto the destination CPU. We're
2265 * doing this because either it can no longer run here (set_cpus_allowed()
2266 * moved it away from this CPU, or the CPU is going down), or because we're
2267 * attempting to rebalance this task on exec (sched_exec).
2268 *
2269 * So we race with normal scheduler movements, but that's OK, as long
2270 * as the task is no longer on this CPU.
2271 */
2272static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
2273                                 struct task_struct *p, int dest_cpu)
2274{
2275        /* Affinity changed (again). */
2276        if (!is_cpu_allowed(p, dest_cpu))
2277                return rq;
2278
2279        update_rq_clock(rq);
2280        rq = move_queued_task(rq, rf, p, dest_cpu);
2281
2282        return rq;
2283}
2284
2285/*
2286 * migration_cpu_stop - this will be executed by a highprio stopper thread
2287 * and performs thread migration by bumping thread off CPU then
2288 * 'pushing' onto another runqueue.
2289 */
2290static int migration_cpu_stop(void *data)
2291{
2292        struct migration_arg *arg = data;
2293        struct set_affinity_pending *pending = arg->pending;
2294        struct task_struct *p = arg->task;
2295        struct rq *rq = this_rq();
2296        bool complete = false;
2297        struct rq_flags rf;
2298
2299        /*
2300         * The original target CPU might have gone down and we might
2301         * be on another CPU but it doesn't matter.
2302         */
2303        local_irq_save(rf.flags);
2304        /*
2305         * We need to explicitly wake pending tasks before running
2306         * __migrate_task() such that we will not miss enforcing cpus_ptr
2307         * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
2308         */
2309        flush_smp_call_function_from_idle();
2310
2311        raw_spin_lock(&p->pi_lock);
2312        rq_lock(rq, &rf);
2313
2314        /*
2315         * If we were passed a pending, then ->stop_pending was set, thus
2316         * p->migration_pending must have remained stable.
2317         */
2318        WARN_ON_ONCE(pending && pending != p->migration_pending);
2319
2320        /*
2321         * If task_rq(p) != rq, it cannot be migrated here, because we're
2322         * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
2323         * we're holding p->pi_lock.
2324         */
2325        if (task_rq(p) == rq) {
2326                if (is_migration_disabled(p))
2327                        goto out;
2328
2329                if (pending) {
2330                        p->migration_pending = NULL;
2331                        complete = true;
2332
2333                        if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
2334                                goto out;
2335                }
2336
2337                if (task_on_rq_queued(p))
2338                        rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
2339                else
2340                        p->wake_cpu = arg->dest_cpu;
2341
2342                /*
2343                 * XXX __migrate_task() can fail, at which point we might end
2344                 * up running on a dodgy CPU, AFAICT this can only happen
2345                 * during CPU hotplug, at which point we'll get pushed out
2346                 * anyway, so it's probably not a big deal.
2347                 */
2348
2349        } else if (pending) {
2350                /*
2351                 * This happens when we get migrated between migrate_enable()'s
2352                 * preempt_enable() and scheduling the stopper task. At that
2353                 * point we're a regular task again and not current anymore.
2354                 *
2355                 * A !PREEMPT kernel has a giant hole here, which makes it far
2356                 * more likely.
2357                 */
2358
2359                /*
2360                 * The task moved before the stopper got to run. We're holding
2361                 * ->pi_lock, so the allowed mask is stable - if it got
2362                 * somewhere allowed, we're done.
2363                 */
2364                if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
2365                        p->migration_pending = NULL;
2366                        complete = true;
2367                        goto out;
2368                }
2369
2370                /*
2371                 * When migrate_enable() hits a rq mis-match we can't reliably
2372                 * determine is_migration_disabled() and so have to chase after
2373                 * it.
2374                 */
2375                WARN_ON_ONCE(!pending->stop_pending);
2376                task_rq_unlock(rq, p, &rf);
2377                stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
2378                                    &pending->arg, &pending->stop_work);
2379                return 0;
2380        }
2381out:
2382        if (pending)
2383                pending->stop_pending = false;
2384        task_rq_unlock(rq, p, &rf);
2385
2386        if (complete)
2387                complete_all(&pending->done);
2388
2389        return 0;
2390}
2391
2392int push_cpu_stop(void *arg)
2393{
2394        struct rq *lowest_rq = NULL, *rq = this_rq();
2395        struct task_struct *p = arg;
2396
2397        raw_spin_lock_irq(&p->pi_lock);
2398        raw_spin_rq_lock(rq);
2399
2400        if (task_rq(p) != rq)
2401                goto out_unlock;
2402
2403        if (is_migration_disabled(p)) {
2404                p->migration_flags |= MDF_PUSH;
2405                goto out_unlock;
2406        }
2407
2408        p->migration_flags &= ~MDF_PUSH;
2409
2410        if (p->sched_class->find_lock_rq)
2411                lowest_rq = p->sched_class->find_lock_rq(p, rq);
2412
2413        if (!lowest_rq)
2414                goto out_unlock;
2415
2416        // XXX validate p is still the highest prio task
2417        if (task_rq(p) == rq) {
2418                deactivate_task(rq, p, 0);
2419                set_task_cpu(p, lowest_rq->cpu);
2420                activate_task(lowest_rq, p, 0);
2421                resched_curr(lowest_rq);
2422        }
2423
2424        double_unlock_balance(rq, lowest_rq);
2425
2426out_unlock:
2427        rq->push_busy = false;
2428        raw_spin_rq_unlock(rq);
2429        raw_spin_unlock_irq(&p->pi_lock);
2430
2431        put_task_struct(p);
2432        return 0;
2433}
2434
2435/*
2436 * sched_class::set_cpus_allowed must do the below, but is not required to
2437 * actually call this function.
2438 */
2439void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
2440{
2441        if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
2442                p->cpus_ptr = new_mask;
2443                return;
2444        }
2445
2446        cpumask_copy(&p->cpus_mask, new_mask);
2447        p->nr_cpus_allowed = cpumask_weight(new_mask);
2448}
2449
2450static void
2451__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
2452{
2453        struct rq *rq = task_rq(p);
2454        bool queued, running;
2455
2456        /*
2457         * This here violates the locking rules for affinity, since we're only
2458         * supposed to change these variables while holding both rq->lock and
2459         * p->pi_lock.
2460         *
2461         * HOWEVER, it magically works, because ttwu() is the only code that
2462         * accesses these variables under p->pi_lock and only does so after
2463         * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
2464         * before finish_task().
2465         *
2466         * XXX do further audits, this smells like something putrid.
2467         */
2468        if (flags & SCA_MIGRATE_DISABLE)
2469                SCHED_WARN_ON(!p->on_cpu);
2470        else
2471                lockdep_assert_held(&p->pi_lock);
2472
2473        queued = task_on_rq_queued(p);
2474        running = task_current(rq, p);
2475
2476        if (queued) {
2477                /*
2478                 * Because __kthread_bind() calls this on blocked tasks without
2479                 * holding rq->lock.
2480                 */
2481                lockdep_assert_rq_held(rq);
2482                dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
2483        }
2484        if (running)
2485                put_prev_task(rq, p);
2486
2487        p->sched_class->set_cpus_allowed(p, new_mask, flags);
2488
2489        if (queued)
2490                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
2491        if (running)
2492                set_next_task(rq, p);
2493}
2494
2495void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2496{
2497        __do_set_cpus_allowed(p, new_mask, 0);
2498}
2499
2500int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
2501                      int node)
2502{
2503        if (!src->user_cpus_ptr)
2504                return 0;
2505
2506        dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
2507        if (!dst->user_cpus_ptr)
2508                return -ENOMEM;
2509
2510        cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
2511        return 0;
2512}
2513
2514static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
2515{
2516        struct cpumask *user_mask = NULL;
2517
2518        swap(p->user_cpus_ptr, user_mask);
2519
2520        return user_mask;
2521}
2522
2523void release_user_cpus_ptr(struct task_struct *p)
2524{
2525        kfree(clear_user_cpus_ptr(p));
2526}
2527
2528/*
2529 * This function is wildly self-concurrent; here be dragons.
2530 *
2531 *
2532 * When given a valid mask, __set_cpus_allowed_ptr() must block until the
2533 * designated task is enqueued on an allowed CPU. If that task is currently
2534 * running, we have to kick it out using the CPU stopper.
2535 *
2536 * Migrate-Disable comes along and tramples all over our nice sandcastle.
2537 * Consider:
2538 *
2539 *     Initial conditions: P0->cpus_mask = [0, 1]
2540 *
2541 *     P0@CPU0                  P1
2542 *
2543 *     migrate_disable();
2544 *     <preempted>
2545 *                              set_cpus_allowed_ptr(P0, [1]);
2546 *
2547 * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
2548 * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
2549 * This means we need the following scheme:
2550 *
2551 *     P0@CPU0                  P1
2552 *
2553 *     migrate_disable();
2554 *     <preempted>
2555 *                              set_cpus_allowed_ptr(P0, [1]);
2556 *                                <blocks>
2557 *     <resumes>
2558 *     migrate_enable();
2559 *       __set_cpus_allowed_ptr();
2560 *       <wakes local stopper>
2561 *                         `--> <woken on migration completion>
2562 *
2563 * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
2564 * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
2565 * task p are serialized by p->pi_lock, which we can leverage: the one that
2566 * should come into effect at the end of the Migrate-Disable region is the last
2567 * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
2568 * but we still need to properly signal those waiting tasks at the appropriate
2569 * moment.
2570 *
2571 * This is implemented using struct set_affinity_pending. The first
2572 * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
2573 * setup an instance of that struct and install it on the targeted task_struct.
2574 * Any and all further callers will reuse that instance. Those then wait for
2575 * a completion signaled at the tail of the CPU stopper callback (1), triggered
2576 * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
2577 *
2578 *
2579 * (1) In the cases covered above. There is one more where the completion is
2580 * signaled within affine_move_task() itself: when a subsequent affinity request
2581 * occurs after the stopper bailed out due to the targeted task still being
2582 * Migrate-Disable. Consider:
2583 *
2584 *     Initial conditions: P0->cpus_mask = [0, 1]
2585 *
2586 *     CPU0               P1                            P2
2587 *     <P0>
2588 *       migrate_disable();
2589 *       <preempted>
2590 *                        set_cpus_allowed_ptr(P0, [1]);
2591 *                          <blocks>
2592 *     <migration/0>
2593 *       migration_cpu_stop()
2594 *         is_migration_disabled()
2595 *           <bails>
2596 *                                                       set_cpus_allowed_ptr(P0, [0, 1]);
2597 *                                                         <signal completion>
2598 *                          <awakes>
2599 *
2600 * Note that the above is safe vs a concurrent migrate_enable(), as any
2601 * pending affinity completion is preceded by an uninstallation of
2602 * p->migration_pending done with p->pi_lock held.
2603 */
2604static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
2605                            int dest_cpu, unsigned int flags)
2606{
2607        struct set_affinity_pending my_pending = { }, *pending = NULL;
2608        bool stop_pending, complete = false;
2609
2610        /* Can the task run on the task's current CPU? If so, we're done */
2611        if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
2612                struct task_struct *push_task = NULL;
2613
2614                if ((flags & SCA_MIGRATE_ENABLE) &&
2615                    (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
2616                        rq->push_busy = true;
2617                        push_task = get_task_struct(p);
2618                }
2619
2620                /*
2621                 * If there are pending waiters, but no pending stop_work,
2622                 * then complete now.
2623                 */
2624                pending = p->migration_pending;
2625                if (pending && !pending->stop_pending) {
2626                        p->migration_pending = NULL;
2627                        complete = true;
2628                }
2629
2630                task_rq_unlock(rq, p, rf);
2631
2632                if (push_task) {
2633                        stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
2634                                            p, &rq->push_work);
2635                }
2636
2637                if (complete)
2638                        complete_all(&pending->done);
2639
2640                return 0;
2641        }
2642
2643        if (!(flags & SCA_MIGRATE_ENABLE)) {
2644                /* serialized by p->pi_lock */
2645                if (!p->migration_pending) {
2646                        /* Install the request */
2647                        refcount_set(&my_pending.refs, 1);
2648                        init_completion(&my_pending.done);
2649                        my_pending.arg = (struct migration_arg) {
2650                                .task = p,
2651                                .dest_cpu = dest_cpu,
2652                                .pending = &my_pending,
2653                        };
2654
2655                        p->migration_pending = &my_pending;
2656                } else {
2657                        pending = p->migration_pending;
2658                        refcount_inc(&pending->refs);
2659                        /*
2660                         * Affinity has changed, but we've already installed a
2661                         * pending. migration_cpu_stop() *must* see this, else
2662                         * we risk a completion of the pending despite having a
2663                         * task on a disallowed CPU.
2664                         *
2665                         * Serialized by p->pi_lock, so this is safe.
2666                         */
2667                        pending->arg.dest_cpu = dest_cpu;
2668                }
2669        }
2670        pending = p->migration_pending;
2671        /*
2672         * - !MIGRATE_ENABLE:
2673         *   we'll have installed a pending if there wasn't one already.
2674         *
2675         * - MIGRATE_ENABLE:
2676         *   we're here because the current CPU isn't matching anymore,
2677         *   the only way that can happen is because of a concurrent
2678         *   set_cpus_allowed_ptr() call, which should then still be
2679         *   pending completion.
2680         *
2681         * Either way, we really should have a @pending here.
2682         */
2683        if (WARN_ON_ONCE(!pending)) {
2684                task_rq_unlock(rq, p, rf);
2685                return -EINVAL;
2686        }
2687
2688        if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
2689                /*
2690                 * MIGRATE_ENABLE gets here because 'p == current', but for
2691                 * anything else we cannot reliably evaluate is_migration_disabled(),
2692                 * so punt and have the stopper function handle it all race-free.
2693                 */
2694                stop_pending = pending->stop_pending;
2695                if (!stop_pending)
2696                        pending->stop_pending = true;
2697
2698                if (flags & SCA_MIGRATE_ENABLE)
2699                        p->migration_flags &= ~MDF_PUSH;
2700
2701                task_rq_unlock(rq, p, rf);
2702
2703                if (!stop_pending) {
2704                        stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
2705                                            &pending->arg, &pending->stop_work);
2706                }
2707
2708                if (flags & SCA_MIGRATE_ENABLE)
2709                        return 0;
2710        } else {
2711
2712                if (!is_migration_disabled(p)) {
2713                        if (task_on_rq_queued(p))
2714                                rq = move_queued_task(rq, rf, p, dest_cpu);
2715
2716                        if (!pending->stop_pending) {
2717                                p->migration_pending = NULL;
2718                                complete = true;
2719                        }
2720                }
2721                task_rq_unlock(rq, p, rf);
2722
2723                if (complete)
2724                        complete_all(&pending->done);
2725        }
2726
2727        wait_for_completion(&pending->done);
2728
2729        if (refcount_dec_and_test(&pending->refs))
2730                wake_up_var(&pending->refs); /* No UaF, just an address */
2731
2732        /*
2733         * Block the original owner of &pending until all subsequent callers
2734         * have seen the completion and decremented the refcount.
2735         */
2736        wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
2737
2738        /* ARGH */
2739        WARN_ON_ONCE(my_pending.stop_pending);
2740
2741        return 0;
2742}
2743
2744/*
2745 * Called with both p->pi_lock and rq->lock held; drops both before returning.
2746 */
2747static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
2748                                         const struct cpumask *new_mask,
2749                                         u32 flags,
2750                                         struct rq *rq,
2751                                         struct rq_flags *rf)
2752        __releases(rq->lock)
2753        __releases(p->pi_lock)
2754{
2755        const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
2756        const struct cpumask *cpu_valid_mask = cpu_active_mask;
2757        bool kthread = p->flags & PF_KTHREAD;
2758        struct cpumask *user_mask = NULL;
2759        unsigned int dest_cpu;
2760        int ret = 0;
2761
2762        update_rq_clock(rq);
2763
2764        if (kthread || is_migration_disabled(p)) {
2765                /*
2766                 * Kernel threads are allowed on online && !active CPUs,
2767                 * however, during cpu-hot-unplug, even these might get pushed
2768                 * away if not KTHREAD_IS_PER_CPU.
2769                 *
2770                 * Specifically, migration_disabled() tasks must not fail the
2771                 * cpumask_any_and_distribute() pick below, esp. so on
2772                 * SCA_MIGRATE_ENABLE, otherwise we'll not call
2773                 * set_cpus_allowed_common() and actually reset p->cpus_ptr.
2774                 */
2775                cpu_valid_mask = cpu_online_mask;
2776        }
2777
2778        if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
2779                ret = -EINVAL;
2780                goto out;
2781        }
2782
2783        /*
2784         * Must re-check here, to close a race against __kthread_bind(),
2785         * sched_setaffinity() is not guaranteed to observe the flag.
2786         */
2787        if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
2788                ret = -EINVAL;
2789                goto out;
2790        }
2791
2792        if (!(flags & SCA_MIGRATE_ENABLE)) {
2793                if (cpumask_equal(&p->cpus_mask, new_mask))
2794                        goto out;
2795
2796                if (WARN_ON_ONCE(p == current &&
2797                                 is_migration_disabled(p) &&
2798                                 !cpumask_test_cpu(task_cpu(p), new_mask))) {
2799                        ret = -EBUSY;
2800                        goto out;
2801                }
2802        }
2803
2804        /*
2805         * Picking a ~random cpu helps in cases where we are changing affinity
2806         * for groups of tasks (i.e. cpuset), so that load balancing is not
2807         * immediately required to distribute the tasks within their new mask.
2808         */
2809        dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
2810        if (dest_cpu >= nr_cpu_ids) {
2811                ret = -EINVAL;
2812                goto out;
2813        }
2814
2815        __do_set_cpus_allowed(p, new_mask, flags);
2816
2817        if (flags & SCA_USER)
2818                user_mask = clear_user_cpus_ptr(p);
2819
2820        ret = affine_move_task(rq, p, rf, dest_cpu, flags);
2821
2822        kfree(user_mask);
2823
2824        return ret;
2825
2826out:
2827        task_rq_unlock(rq, p, rf);
2828
2829        return ret;
2830}
2831
2832/*
2833 * Change a given task's CPU affinity. Migrate the thread to a
2834 * proper CPU and schedule it away if the CPU it's executing on
2835 * is removed from the allowed bitmask.
2836 *
2837 * NOTE: the caller must have a valid reference to the task, the
2838 * task must not exit() & deallocate itself prematurely. The
2839 * call is not atomic; no spinlocks may be held.
2840 */
2841static int __set_cpus_allowed_ptr(struct task_struct *p,
2842                                  const struct cpumask *new_mask, u32 flags)
2843{
2844        struct rq_flags rf;
2845        struct rq *rq;
2846
2847        rq = task_rq_lock(p, &rf);
2848        return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
2849}
2850
2851int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
2852{
2853        return __set_cpus_allowed_ptr(p, new_mask, 0);
2854}
2855EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
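
/*
 * Illustrative example: a driver pinning one of its kthreads to CPU 2
 * could do
 *
 *      err = set_cpus_allowed_ptr(my_kthread, cpumask_of(2));
 *      if (err)
 *              pr_warn("could not pin worker: %d\n", err);
 *
 * (kthread_bind() is the preferred helper for a kthread that has not run
 * yet.)
 */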
2856
2857/*
2858 * Change a given task's CPU affinity to the intersection of its current
2859 * affinity mask and @subset_mask, writing the resulting mask to @new_mask
2860 * and pointing @p->user_cpus_ptr to a copy of the old mask.
2861 * If the resulting mask is empty, leave the affinity unchanged and return
2862 * -EINVAL.
2863 */
2864static int restrict_cpus_allowed_ptr(struct task_struct *p,
2865                                     struct cpumask *new_mask,
2866                                     const struct cpumask *subset_mask)
2867{
2868        struct cpumask *user_mask = NULL;
2869        struct rq_flags rf;
2870        struct rq *rq;
2871        int err;
2872
2873        if (!p->user_cpus_ptr) {
2874                user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
2875                if (!user_mask)
2876                        return -ENOMEM;
2877        }
2878
2879        rq = task_rq_lock(p, &rf);
2880
2881        /*
2882         * Forcefully restricting the affinity of a deadline task is
2883         * likely to cause problems, so fail and noisily override the
2884         * mask entirely.
2885         */
2886        if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
2887                err = -EPERM;
2888                goto err_unlock;
2889        }
2890
2891        if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
2892                err = -EINVAL;
2893                goto err_unlock;
2894        }
2895
2896        /*
2897         * We're about to butcher the task affinity, so keep track of what
2898         * the user asked for in case we're able to restore it later on.
2899         */
2900        if (user_mask) {
2901                cpumask_copy(user_mask, p->cpus_ptr);
2902                p->user_cpus_ptr = user_mask;
2903        }
2904
2905        return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
2906
2907err_unlock:
2908        task_rq_unlock(rq, p, &rf);
2909        kfree(user_mask);
2910        return err;
2911}
2912
2913/*
2914 * Restrict the CPU affinity of task @p so that it is a subset of
2915 * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
2916 * old affinity mask. If the resulting mask is empty, we warn and walk
2917 * up the cpuset hierarchy until we find a suitable mask.
2918 */
2919void force_compatible_cpus_allowed_ptr(struct task_struct *p)
2920{
2921        cpumask_var_t new_mask;
2922        const struct cpumask *override_mask = task_cpu_possible_mask(p);
2923
2924        alloc_cpumask_var(&new_mask, GFP_KERNEL);
2925
2926        /*
2927         * __migrate_task() can fail silently in the face of concurrent
2928         * offlining of the chosen destination CPU, so take the hotplug
2929         * lock to ensure that the migration succeeds.
2930         */
2931        cpus_read_lock();
2932        if (!cpumask_available(new_mask))
2933                goto out_set_mask;
2934
2935        if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
2936                goto out_free_mask;
2937
2938        /*
2939         * We failed to find a valid subset of the affinity mask for the
2940         * task, so override it based on its cpuset hierarchy.
2941         */
2942        cpuset_cpus_allowed(p, new_mask);
2943        override_mask = new_mask;
2944
2945out_set_mask:
2946        if (printk_ratelimit()) {
2947                printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
2948                                task_pid_nr(p), p->comm,
2949                                cpumask_pr_args(override_mask));
2950        }
2951
2952        WARN_ON(set_cpus_allowed_ptr(p, override_mask));
2953out_free_mask:
2954        cpus_read_unlock();
2955        free_cpumask_var(new_mask);
2956}
2957
2958static int
2959__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
2960
2961/*
2962 * Restore the affinity of a task @p which was previously restricted by a
2963 * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
2964 * @p->user_cpus_ptr.
2965 *
2966 * It is the caller's responsibility to serialise this with any calls to
2967 * force_compatible_cpus_allowed_ptr(@p).
2968 */
2969void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
2970{
2971        struct cpumask *user_mask = p->user_cpus_ptr;
2972        unsigned long flags;
2973
2974        /*
2975         * Try to restore the old affinity mask. If this fails, then
2976         * we free the mask explicitly to avoid it being inherited across
2977         * a subsequent fork().
2978         */
2979        if (!user_mask || !__sched_setaffinity(p, user_mask))
2980                return;
2981
2982        raw_spin_lock_irqsave(&p->pi_lock, flags);
2983        user_mask = clear_user_cpus_ptr(p);
2984        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2985
2986        kfree(user_mask);
2987}
2988
2989void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2990{
2991#ifdef CONFIG_SCHED_DEBUG
2992        unsigned int state = READ_ONCE(p->__state);
2993
2994        /*
2995         * We should never call set_task_cpu() on a blocked task,
2996         * ttwu() will sort out the placement.
2997         */
2998        WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
2999
3000        /*
3001         * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
3002         * because schedstat_wait_{start,end} rebase migrating task's wait_start
3003         * time relying on p->on_rq.
3004         */
3005        WARN_ON_ONCE(state == TASK_RUNNING &&
3006                     p->sched_class == &fair_sched_class &&
3007                     (p->on_rq && !task_on_rq_migrating(p)));
3008
3009#ifdef CONFIG_LOCKDEP
3010        /*
3011         * The caller should hold either p->pi_lock or rq->lock, when changing
3012         * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
3013         *
3014         * sched_move_task() holds both and thus holding either pins the cgroup,
3015         * see task_group().
3016         *
3017         * Furthermore, all task_rq users should acquire both locks, see
3018         * task_rq_lock().
3019         */
3020        WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
3021                                      lockdep_is_held(__rq_lockp(task_rq(p)))));
3022#endif
3023        /*
3024         * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
3025         */
3026        WARN_ON_ONCE(!cpu_online(new_cpu));
3027
3028        WARN_ON_ONCE(is_migration_disabled(p));
3029#endif
3030
3031        trace_sched_migrate_task(p, new_cpu);
3032
3033        if (task_cpu(p) != new_cpu) {
3034                if (p->sched_class->migrate_task_rq)
3035                        p->sched_class->migrate_task_rq(p, new_cpu);
3036                p->se.nr_migrations++;
3037                rseq_migrate(p);
3038                perf_event_task_migrate(p);
3039        }
3040
3041        __set_task_cpu(p, new_cpu);
3042}
3043
3044#ifdef CONFIG_NUMA_BALANCING
3045static void __migrate_swap_task(struct task_struct *p, int cpu)
3046{
3047        if (task_on_rq_queued(p)) {
3048                struct rq *src_rq, *dst_rq;
3049                struct rq_flags srf, drf;
3050
3051                src_rq = task_rq(p);
3052                dst_rq = cpu_rq(cpu);
3053
3054                rq_pin_lock(src_rq, &srf);
3055                rq_pin_lock(dst_rq, &drf);
3056
3057                deactivate_task(src_rq, p, 0);
3058                set_task_cpu(p, cpu);
3059                activate_task(dst_rq, p, 0);
3060                check_preempt_curr(dst_rq, p, 0);
3061
3062                rq_unpin_lock(dst_rq, &drf);
3063                rq_unpin_lock(src_rq, &srf);
3064
3065        } else {
3066                /*
3067                 * Task isn't running anymore; make it appear like we migrated
3068                 * it before it went to sleep. This means on wakeup we make the
3069                 * previous CPU our target instead of where it really is.
3070                 */
3071                p->wake_cpu = cpu;
3072        }
3073}
3074
3075struct migration_swap_arg {
3076        struct task_struct *src_task, *dst_task;
3077        int src_cpu, dst_cpu;
3078};
3079
3080static int migrate_swap_stop(void *data)
3081{
3082        struct migration_swap_arg *arg = data;
3083        struct rq *src_rq, *dst_rq;
3084        int ret = -EAGAIN;
3085
3086        if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
3087                return -EAGAIN;
3088
3089        src_rq = cpu_rq(arg->src_cpu);
3090        dst_rq = cpu_rq(arg->dst_cpu);
3091
3092        double_raw_lock(&arg->src_task->pi_lock,
3093                        &arg->dst_task->pi_lock);
3094        double_rq_lock(src_rq, dst_rq);
3095
3096        if (task_cpu(arg->dst_task) != arg->dst_cpu)
3097                goto unlock;
3098
3099        if (task_cpu(arg->src_task) != arg->src_cpu)
3100                goto unlock;
3101
3102        if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
3103                goto unlock;
3104
3105        if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
3106                goto unlock;
3107
3108        __migrate_swap_task(arg->src_task, arg->dst_cpu);
3109        __migrate_swap_task(arg->dst_task, arg->src_cpu);
3110
3111        ret = 0;
3112
3113unlock:
3114        double_rq_unlock(src_rq, dst_rq);
3115        raw_spin_unlock(&arg->dst_task->pi_lock);
3116        raw_spin_unlock(&arg->src_task->pi_lock);
3117
3118        return ret;
3119}
3120
3121/*
3122 * Cross migrate two tasks
3123 */
3124int migrate_swap(struct task_struct *cur, struct task_struct *p,
3125                int target_cpu, int curr_cpu)
3126{
3127        struct migration_swap_arg arg;
3128        int ret = -EINVAL;
3129
3130        arg = (struct migration_swap_arg){
3131                .src_task = cur,
3132                .src_cpu = curr_cpu,
3133                .dst_task = p,
3134                .dst_cpu = target_cpu,
3135        };
3136
3137        if (arg.src_cpu == arg.dst_cpu)
3138                goto out;
3139
3140        /*
3141         * These three tests are all lockless; this is OK since all of them
3142         * will be re-checked with proper locks held further down the line.
3143         */
3144        if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
3145                goto out;
3146
3147        if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
3148                goto out;
3149
3150        if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
3151                goto out;
3152
3153        trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
3154        ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
3155
3156out:
3157        return ret;
3158}
3159#endif /* CONFIG_NUMA_BALANCING */
3160
3161/*
3162 * wait_task_inactive - wait for a thread to unschedule.
3163 *
3164 * If @match_state is nonzero, it's the @p->state value just checked and
3165 * not expected to change.  If it changes, i.e. @p might have woken up,
3166 * then return zero.  When we succeed in waiting for @p to be off its CPU,
3167 * we return a positive number (its total switch count).  If a second call
3168 * a short while later returns the same number, the caller can be sure that
3169 * @p has remained unscheduled the whole time.
3170 *
3171 * The caller must ensure that the task *will* unschedule sometime soon,
3172 * else this function might spin for a *long* time. This function can't
3173 * be called with interrupts off, or it may introduce deadlock with
3174 * smp_call_function() if an IPI is sent by the same process we are
3175 * waiting to become inactive.
3176 */
3177unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
3178{
3179        int running, queued;
3180        struct rq_flags rf;
3181        unsigned long ncsw;
3182        struct rq *rq;
3183
3184        for (;;) {
3185                /*
3186                 * We do the initial early heuristics without holding
3187                 * any task-queue locks at all. We'll only try to get
3188                 * the runqueue lock when things look like they will
3189                 * work out!
3190                 */
3191                rq = task_rq(p);
3192
3193                /*
3194                 * If the task is actively running on another CPU
3195                 * still, just relax and busy-wait without holding
3196                 * any locks.
3197                 *
3198                 * NOTE! Since we don't hold any locks, it's not
3199                 * even certain that "rq" stays the right runqueue!
3200                 * But we don't care, since "task_running()" will
3201                 * return false if the runqueue has changed and p
3202                 * is actually now running somewhere else!
3203                 */
3204                while (task_running(rq, p)) {
3205                        if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
3206                                return 0;
3207                        cpu_relax();
3208                }
3209
3210                /*
3211                 * Ok, time to look more closely! We need the rq
3212                 * lock now, to be *sure*. If we're wrong, we'll
3213                 * just go back and repeat.
3214                 */
3215                rq = task_rq_lock(p, &rf);
3216                trace_sched_wait_task(p);
3217                running = task_running(rq, p);
3218                queued = task_on_rq_queued(p);
3219                ncsw = 0;
3220                if (!match_state || READ_ONCE(p->__state) == match_state)
3221                        ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
3222                task_rq_unlock(rq, p, &rf);
3223
3224                /*
3225                 * If it changed from the expected state, bail out now.
3226                 */
3227                if (unlikely(!ncsw))
3228                        break;
3229
3230                /*
3231                 * Was it really running after all now that we
3232                 * checked with the proper locks actually held?
3233                 *
3234                 * Oops. Go back and try again..
3235                 */
3236                if (unlikely(running)) {
3237                        cpu_relax();
3238                        continue;
3239                }
3240
3241                /*
3242                 * It's not enough that it's not actively running,
3243                 * it must be off the runqueue _entirely_, and not
3244                 * preempted!
3245                 *
3246                 * So if it was still runnable (but just not actively
3247                 * running right now), it's preempted, and we should
3248                 * yield - it could be a while.
3249                 */
3250                if (unlikely(queued)) {
3251                        ktime_t to = NSEC_PER_SEC / HZ;
3252
3253                        set_current_state(TASK_UNINTERRUPTIBLE);
3254                        schedule_hrtimeout(&to, HRTIMER_MODE_REL);
3255                        continue;
3256                }
3257
3258                /*
3259                 * Ahh, all good. It wasn't running, and it wasn't
3260                 * runnable, which means that it will never become
3261                 * running in the future either. We're all done!
3262                 */
3263                break;
3264        }
3265
3266        return ncsw;
3267}
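/*
 * An illustrative sketch of the two-call protocol described above (the
 * caller logic is invented for illustration; only wait_task_inactive() and
 * its return-value convention are real):
 *
 *   unsigned long ncsw;
 *
 *   ncsw = wait_task_inactive(p, TASK_UNINTERRUPTIBLE);
 *   if (!ncsw)
 *      return -EAGAIN;
 *
 *   ... operate on @p while it is off its CPU ...
 *
 *   if (wait_task_inactive(p, TASK_UNINTERRUPTIBLE) != ncsw)
 *      return -EAGAIN;
 *
 * A zero return means @p left the matched state; a changed switch count
 * means @p ran in between and the operation must be retried.
 */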
3268
3269/***
3270 * kick_process - kick a running thread to enter/exit the kernel
3271 * @p: the to-be-kicked thread
3272 *
3273 * Cause a process which is running on another CPU to enter
3274 * kernel-mode, without any delay. (to get signals handled.)
3275 *
3276 * NOTE: this function doesn't have to take the runqueue lock,
3277 * because all it wants to ensure is that the remote task enters
3278 * the kernel. If the IPI races and the task has been migrated
3279 * to another CPU then no harm is done and the purpose has been
3280 * achieved as well.
3281 */
3282void kick_process(struct task_struct *p)
3283{
3284        int cpu;
3285
3286        preempt_disable();
3287        cpu = task_cpu(p);
3288        if ((cpu != smp_processor_id()) && task_curr(p))
3289                smp_send_reschedule(cpu);
3290        preempt_enable();
3291}
3292EXPORT_SYMBOL_GPL(kick_process);
3293
3294/*
3295 * ->cpus_ptr is protected by both rq->lock and p->pi_lock
3296 *
3297 * A few notes on cpu_active vs cpu_online:
3298 *
3299 *  - cpu_active must be a subset of cpu_online
3300 *
3301 *  - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
3302 *    see __set_cpus_allowed_ptr(). At this point the newly online
3303 *    CPU isn't yet part of the sched domains, and balancing will not
3304 *    see it.
3305 *
3306 *  - on CPU-down we clear cpu_active() to mask the sched domains and
3307 *    prevent the load balancer from placing new tasks on the to-be-removed
3308 *    CPU. Existing tasks will remain running there and will be taken
3309 *    off.
3310 *
3311 * This means that fallback selection must not select !active CPUs, and
3312 * can assume that any active CPU is online. Conversely,
3313 * select_task_rq() below may allow selection of !active CPUs in order
3314 * to satisfy the above rules.
3315 */
3316static int select_fallback_rq(int cpu, struct task_struct *p)
3317{
3318        int nid = cpu_to_node(cpu);
3319        const struct cpumask *nodemask = NULL;
3320        enum { cpuset, possible, fail } state = cpuset;
3321        int dest_cpu;
3322
3323        /*
3324         * If the node that the CPU is on has been offlined, cpu_to_node()
3325         * will return -1. There is no CPU on the node, so we should
3326         * select a CPU on another node.
3327         */
3328        if (nid != -1) {
3329                nodemask = cpumask_of_node(nid);
3330
3331                /* Look for allowed, online CPU in same node. */
3332                for_each_cpu(dest_cpu, nodemask) {
3333                        if (is_cpu_allowed(p, dest_cpu))
3334                                return dest_cpu;
3335                }
3336        }
3337
3338        for (;;) {
3339                /* Any allowed, online CPU? */
3340                for_each_cpu(dest_cpu, p->cpus_ptr) {
3341                        if (!is_cpu_allowed(p, dest_cpu))
3342                                continue;
3343
3344                        goto out;
3345                }
3346
3347                /* No more Mr. Nice Guy. */
3348                switch (state) {
3349                case cpuset:
3350                        if (cpuset_cpus_allowed_fallback(p)) {
3351                                state = possible;
3352                                break;
3353                        }
3354                        fallthrough;
3355                case possible:
3356                        /*
3357                         * XXX When called from select_task_rq() we only
3358                         * hold p->pi_lock and again violate locking order.
3359                         *
3360                         * More yuck to audit.
3361                         */
3362                        do_set_cpus_allowed(p, task_cpu_possible_mask(p));
3363                        state = fail;
3364                        break;
3365                case fail:
3366                        BUG();
3367                        break;
3368                }
3369        }
3370
3371out:
3372        if (state != cpuset) {
3373                /*
3374                 * Don't tell them about moving exiting tasks or
3375                 * kernel threads (both mm NULL), since they never
3376                 * leave the kernel.
3377                 */
3378                if (p->mm && printk_ratelimit()) {
3379                        printk_deferred("process %d (%s) no longer affine to cpu%d\n",
3380                                        task_pid_nr(p), p->comm, cpu);
3381                }
3382        }
3383
3384        return dest_cpu;
3385}
3386
3387/*
3388 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
3389 */
3390static inline
3391int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
3392{
3393        lockdep_assert_held(&p->pi_lock);
3394
3395        if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
3396                cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
3397        else
3398                cpu = cpumask_any(p->cpus_ptr);
3399
3400        /*
3401         * In order not to call set_task_cpu() on a blocking task we need
3402         * to rely on ttwu() to place the task on a valid ->cpus_ptr
3403         * CPU.
3404         *
3405         * Since this is common to all placement strategies, this lives here.
3406         *
3407         * [ this allows ->select_task_rq() to simply return task_cpu(p) and
3408         *   not worry about this generic constraint ]
3409         */
3410        if (unlikely(!is_cpu_allowed(p, cpu)))
3411                cpu = select_fallback_rq(task_cpu(p), p);
3412
3413        return cpu;
3414}
3415
3416void sched_set_stop_task(int cpu, struct task_struct *stop)
3417{
3418        static struct lock_class_key stop_pi_lock;
3419        struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
3420        struct task_struct *old_stop = cpu_rq(cpu)->stop;
3421
3422        if (stop) {
3423                /*
3424                 * Make it appear like a SCHED_FIFO task; it's something
3425                 * userspace knows about and won't get confused by.
3426                 *
3427                 * Also, it will make PI more or less work without too
3428                 * much confusion -- but then, stop work should not
3429                 * rely on PI working anyway.
3430                 */
3431                sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
3432
3433                stop->sched_class = &stop_sched_class;
3434
3435                /*
3436                 * The PI code calls rt_mutex_setprio() with ->pi_lock held to
3437                 * adjust the effective priority of a task. As a result,
3438                 * rt_mutex_setprio() can trigger (RT) balancing operations,
3439                 * which can then trigger wakeups of the stop thread to push
3440                 * around the current task.
3441                 *
3442                 * The stop task itself will never be part of the PI-chain, it
3443                 * never blocks, therefore that ->pi_lock recursion is safe.
3444                 * Tell lockdep about this by placing the stop->pi_lock in its
3445                 * own class.
3446                 */
3447                lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
3448        }
3449
3450        cpu_rq(cpu)->stop = stop;
3451
3452        if (old_stop) {
3453                /*
3454                 * Reset it back to a normal scheduling class so that
3455                 * it can die in pieces.
3456                 */
3457                old_stop->sched_class = &rt_sched_class;
3458        }
3459}
3460
3461#else /* CONFIG_SMP */
3462
3463static inline int __set_cpus_allowed_ptr(struct task_struct *p,
3464                                         const struct cpumask *new_mask,
3465                                         u32 flags)
3466{
3467        return set_cpus_allowed_ptr(p, new_mask);
3468}
3469
3470static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
3471
3472static inline bool rq_has_pinned_tasks(struct rq *rq)
3473{
3474        return false;
3475}
3476
3477#endif /* !CONFIG_SMP */
3478
3479static void
3480ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
3481{
3482        struct rq *rq;
3483
3484        if (!schedstat_enabled())
3485                return;
3486
3487        rq = this_rq();
3488
3489#ifdef CONFIG_SMP
3490        if (cpu == rq->cpu) {
3491                __schedstat_inc(rq->ttwu_local);
3492                __schedstat_inc(p->se.statistics.nr_wakeups_local);
3493        } else {
3494                struct sched_domain *sd;
3495
3496                __schedstat_inc(p->se.statistics.nr_wakeups_remote);
3497                rcu_read_lock();
3498                for_each_domain(rq->cpu, sd) {
3499                        if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
3500                                __schedstat_inc(sd->ttwu_wake_remote);
3501                                break;
3502                        }
3503                }
3504                rcu_read_unlock();
3505        }
3506
3507        if (wake_flags & WF_MIGRATED)
3508                __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
3509#endif /* CONFIG_SMP */
3510
3511        __schedstat_inc(rq->ttwu_count);
3512        __schedstat_inc(p->se.statistics.nr_wakeups);
3513
3514        if (wake_flags & WF_SYNC)
3515                __schedstat_inc(p->se.statistics.nr_wakeups_sync);
3516}
3517
3518/*
3519 * Mark the task runnable and perform wakeup-preemption.
3520 */
3521static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
3522                           struct rq_flags *rf)
3523{
3524        check_preempt_curr(rq, p, wake_flags);
3525        WRITE_ONCE(p->__state, TASK_RUNNING);
3526        trace_sched_wakeup(p);
3527
3528#ifdef CONFIG_SMP
3529        if (p->sched_class->task_woken) {
3530                /*
3531                 * Our task @p is fully woken up and running, so it's safe to
3532                 * drop the rq->lock; hereafter rq is only used for statistics.
3533                 */
3534                rq_unpin_lock(rq, rf);
3535                p->sched_class->task_woken(rq, p);
3536                rq_repin_lock(rq, rf);
3537        }
3538
3539        if (rq->idle_stamp) {
3540                u64 delta = rq_clock(rq) - rq->idle_stamp;
3541                u64 max = 2*rq->max_idle_balance_cost;
3542
3543                update_avg(&rq->avg_idle, delta);
3544
3545                if (rq->avg_idle > max)
3546                        rq->avg_idle = max;
3547
3548                rq->wake_stamp = jiffies;
3549                rq->wake_avg_idle = rq->avg_idle / 2;
3550
3551                rq->idle_stamp = 0;
3552        }
3553#endif
3554}
3555
3556static void
3557ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
3558                 struct rq_flags *rf)
3559{
3560        int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
3561
3562        lockdep_assert_rq_held(rq);
3563
3564        if (p->sched_contributes_to_load)
3565                rq->nr_uninterruptible--;
3566
3567#ifdef CONFIG_SMP
3568        if (wake_flags & WF_MIGRATED)
3569                en_flags |= ENQUEUE_MIGRATED;
3570        else
3571#endif
3572        if (p->in_iowait) {
3573                delayacct_blkio_end(p);
3574                atomic_dec(&task_rq(p)->nr_iowait);
3575        }
3576
3577        activate_task(rq, p, en_flags);
3578        ttwu_do_wakeup(rq, p, wake_flags, rf);
3579}
3580
3581/*
3582 * Consider @p being inside a wait loop:
3583 *
3584 *   for (;;) {
3585 *      set_current_state(TASK_UNINTERRUPTIBLE);
3586 *
3587 *      if (CONDITION)
3588 *         break;
3589 *
3590 *      schedule();
3591 *   }
3592 *   __set_current_state(TASK_RUNNING);
3593 *
3594 * between set_current_state() and schedule(). In this case @p is still
3595 * runnable, so all that needs doing is to change p->state back to
3596 * TASK_RUNNING in an atomic manner.
3597 *
3598 * By taking task_rq(p)->lock we serialize against schedule(); if @p->on_rq,
3599 * then schedule() must still happen and p->state can be changed to
3600 * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
3601 * need to do a full wakeup with enqueue.
3602 *
3603 * Returns: %true when the wakeup is done,
3604 *          %false otherwise.
3605 */
3606static int ttwu_runnable(struct task_struct *p, int wake_flags)
3607{
3608        struct rq_flags rf;
3609        struct rq *rq;
3610        int ret = 0;
3611
3612        rq = __task_rq_lock(p, &rf);
3613        if (task_on_rq_queued(p)) {
3614                /* check_preempt_curr() may use rq clock */
3615                update_rq_clock(rq);
3616                ttwu_do_wakeup(rq, p, wake_flags, &rf);
3617                ret = 1;
3618        }
3619        __task_rq_unlock(rq, &rf);
3620
3621        return ret;
3622}
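/*
 * For illustration, the waker side that pairs with the wait loop shown in
 * the comment above; CONDITION is a placeholder for whatever the waiter
 * checks, only wake_up_process() is real:
 *
 *   CONDITION = 1;
 *   wake_up_process(p);
 *
 * The ordering requirement (the CONDITION store must not be reordered past
 * the task-state access) is provided by try_to_wake_up(), see the wakeup
 * ordering comments further down.
 */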
3623
3624#ifdef CONFIG_SMP
3625void sched_ttwu_pending(void *arg)
3626{
3627        struct llist_node *llist = arg;
3628        struct rq *rq = this_rq();
3629        struct task_struct *p, *t;
3630        struct rq_flags rf;
3631
3632        if (!llist)
3633                return;
3634
3635        /*
3636         * rq::ttwu_pending is a racy indication of outstanding wakeups.
3637         * The race is benign: false negatives are possible, since they
3638         * are shorter lived than false positives would be.
3639         */
3640        WRITE_ONCE(rq->ttwu_pending, 0);
3641
3642        rq_lock_irqsave(rq, &rf);
3643        update_rq_clock(rq);
3644
3645        llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
3646                if (WARN_ON_ONCE(p->on_cpu))
3647                        smp_cond_load_acquire(&p->on_cpu, !VAL);
3648
3649                if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
3650                        set_task_cpu(p, cpu_of(rq));
3651
3652                ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
3653        }
3654
3655        rq_unlock_irqrestore(rq, &rf);
3656}
3657
3658void send_call_function_single_ipi(int cpu)
3659{
3660        struct rq *rq = cpu_rq(cpu);
3661
3662        if (!set_nr_if_polling(rq->idle))
3663                arch_send_call_function_single_ipi(cpu);
3664        else
3665                trace_sched_wake_idle_without_ipi(cpu);
3666}
3667
3668/*
3669 * Queue a task on the target CPU's wake_list and wake the CPU via IPI if
3670 * necessary. The wakee CPU on receipt of the IPI will queue the task
3671 * via sched_ttwu_pending() for activation so the wakee incurs the cost
3672 * of the wakeup instead of the waker.
3673 */
3674static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3675{
3676        struct rq *rq = cpu_rq(cpu);
3677
3678        p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
3679
3680        WRITE_ONCE(rq->ttwu_pending, 1);
3681        __smp_call_single_queue(cpu, &p->wake_entry.llist);
3682}
3683
3684void wake_up_if_idle(int cpu)
3685{
3686        struct rq *rq = cpu_rq(cpu);
3687        struct rq_flags rf;
3688
3689        rcu_read_lock();
3690
3691        if (!is_idle_task(rcu_dereference(rq->curr)))
3692                goto out;
3693
3694        if (set_nr_if_polling(rq->idle)) {
3695                trace_sched_wake_idle_without_ipi(cpu);
3696        } else {
3697                rq_lock_irqsave(rq, &rf);
3698                if (is_idle_task(rq->curr))
3699                        smp_send_reschedule(cpu);
3700                /* Else CPU is not idle, do nothing here: */
3701                rq_unlock_irqrestore(rq, &rf);
3702        }
3703
3704out:
3705        rcu_read_unlock();
3706}
3707
3708bool cpus_share_cache(int this_cpu, int that_cpu)
3709{
3710        return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
3711}
3712
3713static inline bool ttwu_queue_cond(int cpu, int wake_flags)
3714{
3715        /*
3716         * Do not complicate things with the async wake_list while the CPU is
3717         * in hotplug state.
3718         */
3719        if (!cpu_active(cpu))
3720                return false;
3721
3722        /*
3723         * If the CPU does not share cache, then queue the task on the
3724         * remote rq's wakelist to avoid accessing remote data.
3725         */
3726        if (!cpus_share_cache(smp_processor_id(), cpu))
3727                return true;
3728
3729        /*
3730         * If the task is descheduling and is the only running task on the
3731         * CPU, then use the wakelist to offload the task activation to
3732         * the soon-to-be-idle CPU as the current CPU is likely busy.
3733         * nr_running is checked to avoid unnecessary task stacking.
3734         */
3735        if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
3736                return true;
3737
3738        return false;
3739}
3740
3741static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3742{
3743        if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
3744                if (WARN_ON_ONCE(cpu == smp_processor_id()))
3745                        return false;
3746
3747                sched_clock_cpu(cpu); /* Sync clocks across CPUs */
3748                __ttwu_queue_wakelist(p, cpu, wake_flags);
3749                return true;
3750        }
3751
3752        return false;
3753}
3754
3755#else /* !CONFIG_SMP */
3756
3757static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3758{
3759        return false;
3760}
3761
3762#endif /* CONFIG_SMP */
3763
3764static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
3765{
3766        struct rq *rq = cpu_rq(cpu);
3767        struct rq_flags rf;
3768
3769        if (ttwu_queue_wakelist(p, cpu, wake_flags))
3770                return;
3771
3772        rq_lock(rq, &rf);
3773        update_rq_clock(rq);
3774        ttwu_do_activate(rq, p, wake_flags, &rf);
3775        rq_unlock(rq, &rf);
3776}
3777
3778/*
3779 * Invoked from try_to_wake_up() to check whether the task can be woken up.
3780 *
3781 * The caller holds p::pi_lock if p != current or has preemption
3782 * disabled when p == current.
3783 *
3784 * The rules of PREEMPT_RT saved_state:
3785 *
3786 *   The related locking code always holds p::pi_lock when updating
3787 *   p::saved_state, which means the code is fully serialized in both cases.
3788 *
3789 *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
3790 *   bits are set, which allows distinguishing all wakeup scenarios.
3791 */
3792static __always_inline
3793bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
3794{
3795        if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
3796                WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
3797                             state != TASK_RTLOCK_WAIT);
3798        }
3799
3800        if (READ_ONCE(p->__state) & state) {
3801                *success = 1;
3802                return true;
3803        }
3804
3805#ifdef CONFIG_PREEMPT_RT
3806        /*
3807         * Saved state preserves the task state across blocking on
3808         * an RT lock.  If the state matches, set p::saved_state to
3809         * TASK_RUNNING, but do not wake the task because it waits
3810         * for a lock wakeup. Also indicate success because from
3811         * the regular waker's point of view this has succeeded.
3812         *
3813         * After acquiring the lock the task will restore p::__state
3814         * from p::saved_state which ensures that the regular
3815         * wakeup is not lost. The restore will also set
3816         * p::saved_state to TASK_RUNNING so any further tests will
3817         * not result in false positives vs. @success
3818         */
3819        if (p->saved_state & state) {
3820                p->saved_state = TASK_RUNNING;
3821                *success = 1;
3822        }
3823#endif
3824        return false;
3825}
3826
3827/*
3828 * Notes on Program-Order guarantees on SMP systems.
3829 *
3830 *  MIGRATION
3831 *
3832 * The basic program-order guarantee on SMP systems is that when a task [t]
3833 * migrates, all its activity on its old CPU [c0] happens-before any subsequent
3834 * execution on its new CPU [c1].
3835 *
3836 * For migration (of runnable tasks) this is provided by the following means:
3837 *
3838 *  A) UNLOCK of the rq(c0)->lock scheduling out task t
3839 *  B) migration for t is required to synchronize *both* rq(c0)->lock and
3840 *     rq(c1)->lock (if not at the same time, then in that order).
3841 *  C) LOCK of the rq(c1)->lock scheduling in task
3842 *
3843 * Release/acquire chaining guarantees that B happens after A and C after B.
3844 * Note: the CPU doing B need not be c0 or c1
3845 *
3846 * Example:
3847 *
3848 *   CPU0            CPU1            CPU2
3849 *
3850 *   LOCK rq(0)->lock
3851 *   sched-out X
3852 *   sched-in Y
3853 *   UNLOCK rq(0)->lock
3854 *
3855 *                                   LOCK rq(0)->lock // orders against CPU0
3856 *                                   dequeue X
3857 *                                   UNLOCK rq(0)->lock
3858 *
3859 *                                   LOCK rq(1)->lock
3860 *                                   enqueue X
3861 *                                   UNLOCK rq(1)->lock
3862 *
3863 *                   LOCK rq(1)->lock // orders against CPU2
3864 *                   sched-out Z
3865 *                   sched-in X
3866 *                   UNLOCK rq(1)->lock
3867 *
3868 *
3869 *  BLOCKING -- aka. SLEEP + WAKEUP
3870 *
3871 * For blocking we (obviously) need to provide the same guarantee as for
3872 * migration. However the means are completely different as there is no lock
3873 * chain to provide order. Instead we do:
3874 *
3875 *   1) smp_store_release(X->on_cpu, 0)   -- finish_task()
3876 *   2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
3877 *
3878 * Example:
3879 *
3880 *   CPU0 (schedule)  CPU1 (try_to_wake_up) CPU2 (schedule)
3881 *
3882 *   LOCK rq(0)->lock LOCK X->pi_lock
3883 *   dequeue X
3884 *   sched-out X
3885 *   smp_store_release(X->on_cpu, 0);
3886 *
3887 *                    smp_cond_load_acquire(&X->on_cpu, !VAL);
3888 *                    X->state = WAKING
3889 *                    set_task_cpu(X,2)
3890 *
3891 *                    LOCK rq(2)->lock
3892 *                    enqueue X
3893 *                    X->state = RUNNING
3894 *                    UNLOCK rq(2)->lock
3895 *
3896 *                                          LOCK rq(2)->lock // orders against CPU1
3897 *                                          sched-out Z
3898 *                                          sched-in X
3899 *                                          UNLOCK rq(2)->lock
3900 *
3901 *                    UNLOCK X->pi_lock
3902 *   UNLOCK rq(0)->lock
3903 *
3904 *
3905 * However, for wakeups there is a second guarantee we must provide, namely we
3906 * must ensure that CONDITION=1 done by the caller can not be reordered with
3907 * accesses to the task state; see try_to_wake_up() and set_current_state().
3908 */
3909
3910/**
3911 * try_to_wake_up - wake up a thread
3912 * @p: the thread to be awakened
3913 * @state: the mask of task states that can be woken
3914 * @wake_flags: wake modifier flags (WF_*)
3915 *
3916 * Conceptually does:
3917 *
3918 *   If (@state & @p->state) @p->state = TASK_RUNNING.
3919 *
3920 * If the task was not queued/runnable, also place it back on a runqueue.
3921 *
3922 * This function is atomic against schedule() which would dequeue the task.
3923 *
3924 * It issues a full memory barrier before accessing @p->state, see the comment
3925 * with set_current_state().
3926 *
3927 * Uses p->pi_lock to serialize against concurrent wake-ups.
3928 *
3929 * Relies on p->pi_lock stabilizing:
3930 *  - p->sched_class
3931 *  - p->cpus_ptr
3932 *  - p->sched_task_group
3933 * in order to do migration, see its use of select_task_rq()/set_task_cpu().
3934 *
3935 * Tries really hard to only take one task_rq(p)->lock for performance.
3936 * Takes rq->lock in:
3937 *  - ttwu_runnable()    -- old rq, unavoidable, see comment there;
3938 *  - ttwu_queue()       -- new rq, for enqueue of the task;
3939 *  - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
3940 *
3941 * As a consequence we race really badly with just about everything. See the
3942 * many memory barriers and their comments for details.
3943 *
3944 * Return: %true if @p->state changes (an actual wakeup was done),
3945 *         %false otherwise.
3946 */
3947static int
3948try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
3949{
3950        unsigned long flags;
3951        int cpu, success = 0;
3952
3953        preempt_disable();
3954        if (p == current) {
3955                /*
3956                 * We're waking current; this means 'p->on_rq' and 'task_cpu(p)
3957                 * == smp_processor_id()'. Together this means we can special
3958                 * case the whole 'p->on_rq && ttwu_runnable()' case below
3959                 * without taking any locks.
3960                 *
3961                 * In particular:
3962                 *  - we rely on Program-Order guarantees for all the ordering,
3963                 *  - we're serialized against set_special_state() by virtue of
3964                 *    it disabling IRQs (this allows not taking ->pi_lock).
3965                 */
3966                if (!ttwu_state_match(p, state, &success))
3967                        goto out;
3968
3969                trace_sched_waking(p);
3970                WRITE_ONCE(p->__state, TASK_RUNNING);
3971                trace_sched_wakeup(p);
3972                goto out;
3973        }
3974
3975        /*
3976         * If we are going to wake up a thread waiting for CONDITION we
3977         * need to ensure that CONDITION=1 done by the caller can not be
3978         * reordered with p->state check below. This pairs with smp_store_mb()
3979         * in set_current_state() that the waiting thread does.
3980         */
3981        raw_spin_lock_irqsave(&p->pi_lock, flags);
3982        smp_mb__after_spinlock();
3983        if (!ttwu_state_match(p, state, &success))
3984                goto unlock;
3985
3986        trace_sched_waking(p);
3987
3988        /*
3989         * Ensure we load p->on_rq _after_ p->state, otherwise it would
3990         * be possible to, falsely, observe p->on_rq == 0 and get stuck
3991         * in smp_cond_load_acquire() below.
3992         *
3993         * sched_ttwu_pending()                 try_to_wake_up()
3994         *   STORE p->on_rq = 1                   LOAD p->state
3995         *   UNLOCK rq->lock
3996         *
3997         * __schedule() (switch to task 'p')
3998         *   LOCK rq->lock                        smp_rmb();
3999         *   smp_mb__after_spinlock();
4000         *   UNLOCK rq->lock
4001         *
4002         * [task p]
4003         *   STORE p->state = UNINTERRUPTIBLE     LOAD p->on_rq
4004         *
4005         * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
4006         * __schedule().  See the comment for smp_mb__after_spinlock().
4007         *
4008         * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
4009         */
4010        smp_rmb();
4011        if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
4012                goto unlock;
4013
4014#ifdef CONFIG_SMP
4015        /*
4016         * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
4017         * possible to, falsely, observe p->on_cpu == 0.
4018         *
4019         * One must be running (->on_cpu == 1) in order to remove oneself
4020         * from the runqueue.
4021         *
4022         * __schedule() (switch to task 'p')    try_to_wake_up()
4023         *   STORE p->on_cpu = 1                  LOAD p->on_rq
4024         *   UNLOCK rq->lock
4025         *
4026         * __schedule() (put 'p' to sleep)
4027         *   LOCK rq->lock                        smp_rmb();
4028         *   smp_mb__after_spinlock();
4029         *   STORE p->on_rq = 0                   LOAD p->on_cpu
4030         *
4031         * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
4032         * __schedule().  See the comment for smp_mb__after_spinlock().
4033         *
4034         * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
4035         * schedule()'s deactivate_task() has 'happened' and p will no longer
4036         * care about its own p->state. See the comment in __schedule().
4037         */
4038        smp_acquire__after_ctrl_dep();
4039
4040        /*
4041         * We're doing the wakeup (@success == 1) and they did a dequeue (p->on_rq
4042         * == 0), which means we need to do an enqueue. Change p->state to
4043         * TASK_WAKING so that we can unlock p->pi_lock before doing the
4044         * enqueue, e.g. via ttwu_queue_wakelist().
4045         */
4046        WRITE_ONCE(p->__state, TASK_WAKING);
4047
4048        /*
4049         * If the owning (remote) CPU is still in the middle of schedule() with
4050         * this task as prev, consider queueing p on the remote CPU's wake_list,
4051         * which potentially sends an IPI instead of spinning on p->on_cpu, to
4052         * let the waker make forward progress. This is safe because IRQs are
4053         * disabled and the IPI will deliver after on_cpu is cleared.
4054         *
4055         * Ensure we load task_cpu(p) after p->on_cpu:
4056         *
4057         * set_task_cpu(p, cpu);
4058         *   STORE p->cpu = @cpu
4059         * __schedule() (switch to task 'p')
4060         *   LOCK rq->lock
4061         *   smp_mb__after_spin_lock()          smp_cond_load_acquire(&p->on_cpu)
4062         *   STORE p->on_cpu = 1                LOAD p->cpu
4063         *
4064         * to ensure we observe the correct CPU on which the task is currently
4065         * scheduling.
4066         */
4067        if (smp_load_acquire(&p->on_cpu) &&
4068            ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
4069                goto unlock;
4070
4071        /*
4072         * If the owning (remote) CPU is still in the middle of schedule() with
4073         * this task as prev, wait until it's done referencing the task.
4074         *
4075         * Pairs with the smp_store_release() in finish_task().
4076         *
4077         * This ensures that tasks getting woken will be fully ordered against
4078         * their previous state and preserve Program Order.
4079         */
4080        smp_cond_load_acquire(&p->on_cpu, !VAL);
4081
4082        cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
4083        if (task_cpu(p) != cpu) {
4084                if (p->in_iowait) {
4085                        delayacct_blkio_end(p);
4086                        atomic_dec(&task_rq(p)->nr_iowait);
4087                }
4088
4089                wake_flags |= WF_MIGRATED;
4090                psi_ttwu_dequeue(p);
4091                set_task_cpu(p, cpu);
4092        }
4093#else
4094        cpu = task_cpu(p);
4095#endif /* CONFIG_SMP */
4096
4097        ttwu_queue(p, cpu, wake_flags);
4098unlock:
4099        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4100out:
4101        if (success)
4102                ttwu_stat(p, task_cpu(p), wake_flags);
4103        preempt_enable();
4104
4105        return success;
4106}
4107
4108/**
4109 * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
4110 * @p: Process for which the function is to be invoked, can be @current.
4111 * @func: Function to invoke.
4112 * @arg: Argument to function.
4113 *
4114 * If the specified task can be quickly locked into a definite state
4115 * (either sleeping or on a given runqueue), arrange to keep it in that
4116 * state while invoking @func(@arg).  This function can use ->on_rq and
4117 * task_curr() to work out what the state is, if required.  Given that
4118 * @func can be invoked with a runqueue lock held, it had better be quite
4119 * lightweight.
4120 *
4121 * Returns:
4122 *      @false if the task slipped out from under the locks.
4123 *      @true if the task was locked onto a runqueue or is sleeping.
4124 *              However, @func can override this by returning @false.
4125 */
4126bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
4127{
4128        struct rq_flags rf;
4129        bool ret = false;
4130        struct rq *rq;
4131
4132        raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
4133        if (p->on_rq) {
4134                rq = __task_rq_lock(p, &rf);
4135                if (task_rq(p) == rq)
4136                        ret = func(p, arg);
4137                rq_unlock(rq, &rf);
4138        } else {
4139                switch (READ_ONCE(p->__state)) {
4140                case TASK_RUNNING:
4141                case TASK_WAKING:
4142                        break;
4143                default:
4144                        smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
4145                        if (!p->on_rq)
4146                                ret = func(p, arg);
4147                }
4148        }
4149        raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
4150        return ret;
4151}
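/*
 * A minimal usage sketch; the callback and argument type below (my_inspect(),
 * struct my_args) are hypothetical, only try_invoke_on_locked_down_task()
 * itself is real:
 *
 *   static bool my_inspect(struct task_struct *t, void *arg)
 *   {
 *      struct my_args *a = arg;
 *
 *      a->queued = !!t->on_rq;
 *      return true;
 *   }
 *
 *   locked = try_invoke_on_locked_down_task(p, my_inspect, &args);
 *
 * The task's state is pinned (sleeping or on its runqueue) for the duration
 * of the callback.
 */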
4152
4153/**
4154 * wake_up_process - Wake up a specific process
4155 * @p: The process to be woken up.
4156 *
4157 * Attempt to wake up the nominated process and move it to the set of runnable
4158 * processes.
4159 *
4160 * Return: 1 if the process was woken up, 0 if it was already running.
4161 *
4162 * This function executes a full memory barrier before accessing the task state.
4163 */
4164int wake_up_process(struct task_struct *p)
4165{
4166        return try_to_wake_up(p, TASK_NORMAL, 0);
4167}
4168EXPORT_SYMBOL(wake_up_process);
4169
4170int wake_up_state(struct task_struct *p, unsigned int state)
4171{
4172        return try_to_wake_up(p, state, 0);
4173}
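/*
 * Sketch of a typical wake_up_state() pairing, modelled on signal delivery
 * (see signal_wake_up_state() in kernel/signal.c): wake the task only if it
 * sleeps in one of the given states, and fall back to kick_process() above
 * so an already-running task notices the pending event:
 *
 *   if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
 *      kick_process(t);
 */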
4174
4175/*
4176 * Perform scheduler related setup for a newly forked process p.
4177 * p is forked by current.
4178 *
4179 * __sched_fork() is basic setup used by init_idle() too:
4180 */
4181static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
4182{
4183        p->on_rq                        = 0;
4184
4185        p->se.on_rq                     = 0;
4186        p->se.exec_start                = 0;
4187        p->se.sum_exec_runtime          = 0;
4188        p->se.prev_sum_exec_runtime     = 0;
4189        p->se.nr_migrations             = 0;
4190        p->se.vruntime                  = 0;
4191        INIT_LIST_HEAD(&p->se.group_node);
4192
4193#ifdef CONFIG_FAIR_GROUP_SCHED
4194        p->se.cfs_rq                    = NULL;
4195#endif
4196
4197#ifdef CONFIG_SCHEDSTATS
4198        /* Even if schedstat is disabled, there should not be garbage */
4199        memset(&p->se.statistics, 0, sizeof(p->se.statistics));
4200#endif
4201
4202        RB_CLEAR_NODE(&p->dl.rb_node);
4203        init_dl_task_timer(&p->dl);
4204        init_dl_inactive_task_timer(&p->dl);
4205        __dl_clear_params(p);
4206
4207        INIT_LIST_HEAD(&p->rt.run_list);
4208        p->rt.timeout           = 0;
4209        p->rt.time_slice        = sched_rr_timeslice;
4210        p->rt.on_rq             = 0;
4211        p->rt.on_list           = 0;
4212
4213#ifdef CONFIG_PREEMPT_NOTIFIERS
4214        INIT_HLIST_HEAD(&p->preempt_notifiers);
4215#endif
4216
4217#ifdef CONFIG_COMPACTION
4218        p->capture_control = NULL;
4219#endif
4220        init_numa_balancing(clone_flags, p);
4221#ifdef CONFIG_SMP
4222        p->wake_entry.u_flags = CSD_TYPE_TTWU;
4223        p->migration_pending = NULL;
4224#endif
4225}
4226
4227DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
4228
4229#ifdef CONFIG_NUMA_BALANCING
4230
4231void set_numabalancing_state(bool enabled)
4232{
4233        if (enabled)
4234                static_branch_enable(&sched_numa_balancing);
4235        else
4236                static_branch_disable(&sched_numa_balancing);
4237}
4238
4239#ifdef CONFIG_PROC_SYSCTL
4240int sysctl_numa_balancing(struct ctl_table *table, int write,
4241                          void *buffer, size_t *lenp, loff_t *ppos)
4242{
4243        struct ctl_table t;
4244        int err;
4245        int state = static_branch_likely(&sched_numa_balancing);
4246
4247        if (write && !capable(CAP_SYS_ADMIN))
4248                return -EPERM;
4249
4250        t = *table;
4251        t.data = &state;
4252        err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
4253        if (err < 0)
4254                return err;
4255        if (write)
4256                set_numabalancing_state(state);
4257        return err;
4258}
4259#endif
4260#endif
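/*
 * Runtime control, for reference: the proc handler above backs
 * /proc/sys/kernel/numa_balancing, so NUMA balancing can be toggled with,
 * e.g.:
 *
 *   echo 0 > /proc/sys/kernel/numa_balancing
 *
 * which ends up in set_numabalancing_state().
 */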
4261
4262#ifdef CONFIG_SCHEDSTATS
4263
4264DEFINE_STATIC_KEY_FALSE(sched_schedstats);
4265
4266static void set_schedstats(bool enabled)
4267{
4268        if (enabled)
4269                static_branch_enable(&sched_schedstats);
4270        else
4271                static_branch_disable(&sched_schedstats);
4272}
4273
4274void force_schedstat_enabled(void)
4275{
4276        if (!schedstat_enabled()) {
4277                pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
4278                static_branch_enable(&sched_schedstats);
4279        }
4280}
4281
4282static int __init setup_schedstats(char *str)
4283{
4284        int ret = 0;
4285        if (!str)
4286                goto out;
4287
4288        if (!strcmp(str, "enable")) {
4289                set_schedstats(true);
4290                ret = 1;
4291        } else if (!strcmp(str, "disable")) {
4292                set_schedstats(false);
4293                ret = 1;
4294        }
4295out:
4296        if (!ret)
4297                pr_warn("Unable to parse schedstats=\n");
4298
4299        return ret;
4300}
4301__setup("schedstats=", setup_schedstats);
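/*
 * Usage note: schedstats can be enabled at boot with "schedstats=enable" on
 * the kernel command line (parsed above), or toggled at runtime via the
 * kernel.sched_schedstats sysctl handled below, e.g.:
 *
 *   echo 1 > /proc/sys/kernel/sched_schedstats
 */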
4302
4303#ifdef CONFIG_PROC_SYSCTL
4304int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
4305                size_t *lenp, loff_t *ppos)
4306{
4307        struct ctl_table t;
4308        int err;
4309        int state = static_branch_likely(&sched_schedstats);
4310
4311        if (write && !capable(CAP_SYS_ADMIN))
4312                return -EPERM;
4313
4314        t = *table;
4315        t.data = &state;
4316        err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
4317        if (err < 0)
4318                return err;
4319        if (write)
4320                set_schedstats(state);
4321        return err;
4322}
4323#endif /* CONFIG_PROC_SYSCTL */
4324#endif /* CONFIG_SCHEDSTATS */
4325
4326/*
4327 * fork()/clone()-time setup:
4328 */
4329int sched_fork(unsigned long clone_flags, struct task_struct *p)
4330{
4331        unsigned long flags;
4332
4333        __sched_fork(clone_flags, p);
4334        /*
4335         * We mark the process as NEW here. This guarantees that
4336         * nobody will actually run it, and a signal or other external
4337         * event cannot wake it up and insert it on the runqueue either.
4338         */
4339        p->__state = TASK_NEW;
4340
4341        /*
4342         * Make sure we do not leak PI boosting priority to the child.
4343         */
4344        p->prio = current->normal_prio;
4345
4346        uclamp_fork(p);
4347
4348        /*
4349         * Revert to default priority/policy on fork if requested.
4350         */
4351        if (unlikely(p->sched_reset_on_fork)) {
4352                if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
4353                        p->policy = SCHED_NORMAL;
4354                        p->static_prio = NICE_TO_PRIO(0);
4355                        p->rt_priority = 0;
4356                } else if (PRIO_TO_NICE(p->static_prio) < 0)
4357                        p->static_prio = NICE_TO_PRIO(0);
4358
4359                p->prio = p->normal_prio = p->static_prio;
4360                set_load_weight(p, false);
4361
4362                /*
4363                 * We don't need the reset flag anymore after the fork. It has
4364                 * fulfilled its duty:
4365                 */
4366                p->sched_reset_on_fork = 0;
4367        }
4368
4369        if (dl_prio(p->prio))
4370                return -EAGAIN;
4371        else if (rt_prio(p->prio))
4372                p->sched_class = &rt_sched_class;
4373        else
4374                p->sched_class = &fair_sched_class;
4375
4376        init_entity_runnable_average(&p->se);
4377
4378        /*
4379         * The child is not yet in the pid-hash so no cgroup attach races,
4380         * and the cgroup is pinned to this child because cgroup_fork()
4381         * is run before sched_fork().
4382         *
4383         * Silence PROVE_RCU.
4384         */
4385        raw_spin_lock_irqsave(&p->pi_lock, flags);
4386        rseq_migrate(p);
4387        /*
4388         * We're setting the CPU for the first time, we don't migrate,
4389         * so use __set_task_cpu().
4390         */
4391        __set_task_cpu(p, smp_processor_id());
4392        if (p->sched_class->task_fork)
4393                p->sched_class->task_fork(p);
4394        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4395
4396#ifdef CONFIG_SCHED_INFO
4397        if (likely(sched_info_on()))
4398                memset(&p->sched_info, 0, sizeof(p->sched_info));
4399#endif
4400#if defined(CONFIG_SMP)
4401        p->on_cpu = 0;
4402#endif
4403        init_task_preempt_count(p);
4404#ifdef CONFIG_SMP
4405        plist_node_init(&p->pushable_tasks, MAX_PRIO);
4406        RB_CLEAR_NODE(&p->pushable_dl_tasks);
4407#endif
4408        return 0;
4409}
4410
4411void sched_post_fork(struct task_struct *p)
4412{
4413        uclamp_post_fork(p);
4414}
4415
4416unsigned long to_ratio(u64 period, u64 runtime)
4417{
4418        if (runtime == RUNTIME_INF)
4419                return BW_UNIT;
4420
4421        /*
4422         * Doing this here saves a lot of checks in all
4423         * the calling paths, and returning zero seems
4424         * safe for them anyway.
4425         */
4426        if (period == 0)
4427                return 0;
4428
4429        return div64_u64(runtime << BW_SHIFT, period);
4430}
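/*
 * Worked example (illustrative): with BW_SHIFT = 20, i.e. BW_UNIT = 1 << 20,
 * a runtime of 25 ms out of a 100 ms period gives
 *
 *   to_ratio(100 * NSEC_PER_MSEC, 25 * NSEC_PER_MSEC)
 *      = (25000000 << 20) / 100000000
 *      = BW_UNIT / 4
 *
 * i.e. one quarter of the bandwidth, in fixed-point form.
 */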
4431
4432/*
4433 * wake_up_new_task - wake up a newly created task for the first time.
4434 *
4435 * This function will do some initial scheduler statistics housekeeping
4436 * that must be done for every newly created context, then puts the task
4437 * on the runqueue and wakes it.
4438 */
4439void wake_up_new_task(struct task_struct *p)
4440{
4441        struct rq_flags rf;
4442        struct rq *rq;
4443
4444        raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
4445        WRITE_ONCE(p->__state, TASK_RUNNING);
4446#ifdef CONFIG_SMP
4447        /*
4448         * Fork balancing, do it here and not earlier because:
4449         *  - cpus_ptr can change in the fork path
4450         *  - any previously selected CPU might disappear through hotplug
4451         *
4452         * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
4453         * as we're not fully set-up yet.
4454         */
4455        p->recent_used_cpu = task_cpu(p);
4456        rseq_migrate(p);
4457        __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
4458#endif
4459        rq = __task_rq_lock(p, &rf);
4460        update_rq_clock(rq);
4461        post_init_entity_util_avg(p);
4462
4463        activate_task(rq, p, ENQUEUE_NOCLOCK);
4464        trace_sched_wakeup_new(p);
4465        check_preempt_curr(rq, p, WF_FORK);
4466#ifdef CONFIG_SMP
4467        if (p->sched_class->task_woken) {
4468                /*
4469                 * Nothing relies on rq->lock after this, so it's fine to
4470                 * drop it.
4471                 */
4472                rq_unpin_lock(rq, &rf);
4473                p->sched_class->task_woken(rq, p);
4474                rq_repin_lock(rq, &rf);
4475        }
4476#endif
4477        task_rq_unlock(rq, p, &rf);
4478}
4479
4480#ifdef CONFIG_PREEMPT_NOTIFIERS
4481
4482static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
4483
4484void preempt_notifier_inc(void)
4485{
4486        static_branch_inc(&preempt_notifier_key);
4487}
4488EXPORT_SYMBOL_GPL(preempt_notifier_inc);
4489
4490void preempt_notifier_dec(void)
4491{
4492        static_branch_dec(&preempt_notifier_key);
4493}
4494EXPORT_SYMBOL_GPL(preempt_notifier_dec);
4495
4496/**
4497 * preempt_notifier_register - tell me when current is being preempted & rescheduled
4498 * @notifier: notifier struct to register
4499 */
4500void preempt_notifier_register(struct preempt_notifier *notifier)
4501{
4502        if (!static_branch_unlikely(&preempt_notifier_key))
4503                WARN(1, "registering preempt_notifier while notifiers disabled\n");
4504
4505        hlist_add_head(&notifier->link, &current->preempt_notifiers);
4506}
4507EXPORT_SYMBOL_GPL(preempt_notifier_register);
4508
4509/**
4510 * preempt_notifier_unregister - no longer interested in preemption notifications
4511 * @notifier: notifier struct to unregister
4512 *
4513 * This is *not* safe to call from within a preemption notifier.
4514 */
4515void preempt_notifier_unregister(struct preempt_notifier *notifier)
4516{
4517        hlist_del(&notifier->link);
4518}
4519EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
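/*
 * Registration sketch; my_ops, my_notifier and the callbacks are invented
 * for illustration, while the API calls and struct preempt_notifier_ops
 * (see <linux/preempt.h>) are real:
 *
 *   static struct preempt_notifier_ops my_ops = {
 *      .sched_in  = my_sched_in,
 *      .sched_out = my_sched_out,
 *   };
 *   static struct preempt_notifier my_notifier;
 *
 *   preempt_notifier_inc();
 *   preempt_notifier_init(&my_notifier, &my_ops);
 *   preempt_notifier_register(&my_notifier);
 *
 * sched_in(notifier, cpu) fires when current is scheduled back in,
 * sched_out(notifier, next) when it is preempted; registration affects
 * current only.
 */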
4520
4521static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
4522{
4523        struct preempt_notifier *notifier;
4524
4525        hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4526                notifier->ops->sched_in(notifier, raw_smp_processor_id());
4527}
4528
4529static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4530{
4531        if (static_branch_unlikely(&preempt_notifier_key))
4532                __fire_sched_in_preempt_notifiers(curr);
4533}
4534
4535static void
4536__fire_sched_out_preempt_notifiers(struct task_struct *curr,
4537                                   struct task_struct *next)
4538{
4539        struct preempt_notifier *notifier;
4540
4541        hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4542                notifier->ops->sched_out(notifier, next);
4543}
4544
4545static __always_inline void
4546fire_sched_out_preempt_notifiers(struct task_struct *curr,
4547                                 struct task_struct *next)
4548{
4549        if (static_branch_unlikely(&preempt_notifier_key))
4550                __fire_sched_out_preempt_notifiers(curr, next);
4551}
4552
4553#else /* !CONFIG_PREEMPT_NOTIFIERS */
4554
4555static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4556{
4557}
4558
4559static inline void
4560fire_sched_out_preempt_notifiers(struct task_struct *curr,
4561                                 struct task_struct *next)
4562{
4563}
4564
4565#endif /* CONFIG_PREEMPT_NOTIFIERS */
4566
4567static inline void prepare_task(struct task_struct *next)
4568{
4569#ifdef CONFIG_SMP
4570        /*
4571         * Claim the task as running; we do this before switching to it
4572         * so that any running task will have this set.
4573         *
4574         * See the ttwu() WF_ON_CPU case and its ordering comment.
4575         */
4576        WRITE_ONCE(next->on_cpu, 1);
4577#endif
4578}
4579
4580static inline void finish_task(struct task_struct *prev)
4581{
4582#ifdef CONFIG_SMP
4583        /*
4584         * This must be the very last reference to @prev from this CPU. After
4585         * p->on_cpu is cleared, the task can be moved to a different CPU. We
4586         * must ensure this doesn't happen until the switch is completely
4587         * finished.
4588         *
4589         * In particular, the load of prev->state in finish_task_switch() must
4590         * happen before this.
4591         *
4592         * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
4593         */
4594        smp_store_release(&prev->on_cpu, 0);
4595#endif
4596}
4597
4598#ifdef CONFIG_SMP
4599
4600static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
4601{
4602        void (*func)(struct rq *rq);
4603        struct callback_head *next;
4604
4605        lockdep_assert_rq_held(rq);
4606
4607        while (head) {
4608                func = (void (*)(struct rq *))head->func;
4609                next = head->next;
4610                head->next = NULL;
4611                head = next;
4612
4613                func(rq);
4614        }
4615}
4616
4617static void balance_push(struct rq *rq);
4618
4619struct callback_head balance_push_callback = {
4620        .next = NULL,
4621        .func = (void (*)(struct callback_head *))balance_push,
4622};
4623
4624static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4625{
4626        struct callback_head *head = rq->balance_callback;
4627
4628        lockdep_assert_rq_held(rq);
4629        if (head)
4630                rq->balance_callback = NULL;
4631
4632        return head;
4633}
4634
4635static void __balance_callbacks(struct rq *rq)
4636{
4637        do_balance_callbacks(rq, splice_balance_callbacks(rq));
4638}
4639
4640static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4641{
4642        unsigned long flags;
4643
4644        if (unlikely(head)) {
4645                raw_spin_rq_lock_irqsave(rq, flags);
4646                do_balance_callbacks(rq, head);
4647                raw_spin_rq_unlock_irqrestore(rq, flags);
4648        }
4649}
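/*
 * A sketch of how a sched class queues work for do_balance_callbacks();
 * this assumes the queue_balance_callback() helper from sched.h, and the
 * per-CPU head and callback names are invented:
 *
 *   queue_balance_callback(rq, &per_cpu(my_push_head, rq->cpu),
 *                          my_push_callback);
 *
 * The queued function then runs with rq->lock held once the callbacks are
 * spliced and executed, e.g. from finish_lock_switch() below.
 */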
4650
4651#else
4652
4653static inline void __balance_callbacks(struct rq *rq)
4654{
4655}
4656
4657static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4658{
4659        return NULL;
4660}
4661
4662static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4663{
4664}
4665
4666#endif
4667
4668static inline void
4669prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
4670{
4671        /*
4672         * The runqueue lock will be released by the next
4673         * task (which is an invalid locking op, but in the case
4674         * of the scheduler it's an obvious special case), so we
4675         * do an early lockdep release here:
4676         */
4677        rq_unpin_lock(rq, rf);
4678        spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
4679#ifdef CONFIG_DEBUG_SPINLOCK
4680        /* this is a valid case when another task releases the spinlock */
4681        rq_lockp(rq)->owner = next;
4682#endif
4683}
4684
4685static inline void finish_lock_switch(struct rq *rq)
4686{
4687        /*
4688         * If we are tracking spinlock dependencies then we have to
4689         * fix up the runqueue lock - which gets 'carried over' from
4690         * prev into current:
4691         */
4692        spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
4693        __balance_callbacks(rq);
4694        raw_spin_rq_unlock_irq(rq);
4695}
4696
4697/*
4698 * NOP if the arch has not defined these:
4699 */
4700
4701#ifndef prepare_arch_switch
4702# define prepare_arch_switch(next)      do { } while (0)
4703#endif
4704
4705#ifndef finish_arch_post_lock_switch
4706# define finish_arch_post_lock_switch() do { } while (0)
4707#endif
4708
4709static inline void kmap_local_sched_out(void)
4710{
4711#ifdef CONFIG_KMAP_LOCAL
4712        if (unlikely(current->kmap_ctrl.idx))
4713                __kmap_local_sched_out();
4714#endif
4715}
4716
4717static inline void kmap_local_sched_in(void)
4718{
4719#ifdef CONFIG_KMAP_LOCAL
4720        if (unlikely(current->kmap_ctrl.idx))
4721                __kmap_local_sched_in();
4722#endif
4723}
4724
4725/**
4726 * prepare_task_switch - prepare to switch tasks
4727 * @rq: the runqueue preparing to switch
4728 * @prev: the current task that is being switched out
4729 * @next: the task we are going to switch to.
4730 *
4731 * This is called with the rq lock held and interrupts off. It must
4732 * be paired with a subsequent finish_task_switch after the context
4733 * switch.
4734 *
4735 * prepare_task_switch sets up locking and calls architecture specific
4736 * hooks.
4737 */
4738static inline void
4739prepare_task_switch(struct rq *rq, struct task_struct *prev,
4740                    struct task_struct *next)
4741{
4742        kcov_prepare_switch(prev);
4743        sched_info_switch(rq, prev, next);
4744        perf_event_task_sched_out(prev, next);
4745        rseq_preempt(prev);
4746        fire_sched_out_preempt_notifiers(prev, next);
4747        kmap_local_sched_out();
4748        prepare_task(next);
4749        prepare_arch_switch(next);
4750}
4751
4752/**
4753 * finish_task_switch - clean up after a task-switch
4754 * @prev: the thread we just switched away from.
4755 *
4756 * finish_task_switch must be called after the context switch, paired
4757 * with a prepare_task_switch call before the context switch.
4758 * finish_task_switch will reconcile locking set up by prepare_task_switch,
4759 * and do any other architecture-specific cleanup actions.
4760 *
4761 * Note that we may have delayed dropping an mm in context_switch(). If
4762 * so, we finish that here outside of the runqueue lock. (Doing it
4763 * with the lock held can cause deadlocks; see schedule() for
4764 * details.)
4765 *
4766 * The context switch has flipped the stack from under us and restored the
4767 * local variables which were saved when this task called schedule() in the
4768 * past. prev == current is still correct but we need to recalculate this_rq
4769 * because prev may have moved to another CPU.
4770 */
4771static struct rq *finish_task_switch(struct task_struct *prev)
4772        __releases(rq->lock)
4773{
4774        struct rq *rq = this_rq();
4775        struct mm_struct *mm = rq->prev_mm;
4776        long prev_state;
4777
4778        /*
4779         * The previous task will have left us with a preempt_count of 2
4780         * because it left us after:
4781         *
4782         *      schedule()
4783         *        preempt_disable();                    // 1
4784         *        __schedule()
4785         *          raw_spin_lock_irq(&rq->lock)        // 2
4786         *
4787         * Also, see FORK_PREEMPT_COUNT.
4788         */
4789        if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
4790                      "corrupted preempt_count: %s/%d/0x%x\n",
4791                      current->comm, current->pid, preempt_count()))
4792                preempt_count_set(FORK_PREEMPT_COUNT);
4793
4794        rq->prev_mm = NULL;
4795
4796        /*
4797         * A task struct has one reference for the use as "current".
4798         * If a task dies, then it sets TASK_DEAD in tsk->state and calls
4799         * schedule one last time. The schedule call will never return, and
4800         * the scheduled task must drop that reference.
4801         *
4802         * We must observe prev->state before clearing prev->on_cpu (in
4803         * finish_task), otherwise a concurrent wakeup can get prev
4804         * running on another CPU and we could race with its RUNNING -> DEAD
4805         * transition, resulting in a double drop.
4806         */
4807        prev_state = READ_ONCE(prev->__state);
4808        vtime_task_switch(prev);
4809        perf_event_task_sched_in(prev, current);
4810        finish_task(prev);
4811        tick_nohz_task_switch();
4812        finish_lock_switch(rq);
4813        finish_arch_post_lock_switch();
4814        kcov_finish_switch(current);
4815        /*
4816         * kmap_local_sched_out() is invoked with rq::lock held and
4817         * interrupts disabled. There is no requirement for that, but the
4818         * sched out code does not have an interrupt enabled section.
4819         * Restoring the maps on sched in does not require interrupts being
4820         * disabled either.
4821         */
4822        kmap_local_sched_in();
4823
4824        fire_sched_in_preempt_notifiers(current);
4825        /*
4826         * When switching through a kernel thread, the loop in
4827         * membarrier_{private,global}_expedited() may have observed that
4828         * kernel thread and not issued an IPI. It is therefore possible to
4829         * schedule between user->kernel->user threads without passing through
4830         * switch_mm(). Membarrier requires a barrier after storing to
4831         * rq->curr, before returning to userspace, so provide them here:
4832         *
4833         * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
4834         *   provided by mmdrop(),
4835         * - a sync_core for SYNC_CORE.
4836         */
4837        if (mm) {
4838                membarrier_mm_sync_core_before_usermode(mm);
4839                mmdrop(mm);
4840        }
4841        if (unlikely(prev_state == TASK_DEAD)) {
4842                if (prev->sched_class->task_dead)
4843                        prev->sched_class->task_dead(prev);
4844
4845                /*
4846                 * Remove function-return probe instances associated with this
4847                 * task and put them back on the free list.
4848                 */
4849                kprobe_flush_task(prev);
4850
4851                /* Task is done with its stack. */
4852                put_task_stack(prev);
4853
4854                put_task_struct_rcu_user(prev);
4855        }
4856
4857        return rq;
4858}
4859
4860/**
4861 * schedule_tail - first thing a freshly forked thread must call.
4862 * @prev: the thread we just switched away from.
4863 */
4864asmlinkage __visible void schedule_tail(struct task_struct *prev)
4865        __releases(rq->lock)
4866{
4867        /*
4868         * New tasks start with FORK_PREEMPT_COUNT, see there and
4869         * finish_task_switch() for details.
4870         *
4871         * finish_task_switch() will drop rq->lock() and lower preempt_count
4872         * and the preempt_enable() will end up enabling preemption (on
4873         * PREEMPT_COUNT kernels).
4874         */
4875
4876        finish_task_switch(prev);
4877        preempt_enable();
4878
4879        if (current->set_child_tid)
4880                put_user(task_pid_vnr(current), current->set_child_tid);
4881
4882        calculate_sigpending();
4883}
4884
4885/*
4886 * context_switch - switch to the new MM and the new thread's register state.
4887 */
4888static __always_inline struct rq *
4889context_switch(struct rq *rq, struct task_struct *prev,
4890               struct task_struct *next, struct rq_flags *rf)
4891{
4892        prepare_task_switch(rq, prev, next);
4893
4894        /*
4895         * For paravirt, this is coupled with an exit in switch_to to
4896         * combine the page table reload and the switch backend into
4897         * one hypercall.
4898         */
4899        arch_start_context_switch(prev);
4900
4901        /*
4902         * kernel -> kernel   lazy + transfer active
4903         *   user -> kernel   lazy + mmgrab() active
4904         *
4905         * kernel ->   user   switch + mmdrop() active
4906         *   user ->   user   switch
4907         */
4908        if (!next->mm) {                                // to kernel
4909                enter_lazy_tlb(prev->active_mm, next);
4910
4911                next->active_mm = prev->active_mm;
4912                if (prev->mm)                           // from user
4913                        mmgrab(prev->active_mm);
4914                else
4915                        prev->active_mm = NULL;
4916        } else {                                        // to user
4917                membarrier_switch_mm(rq, prev->active_mm, next->mm);
4918                /*
4919                 * sys_membarrier() requires an smp_mb() between setting
4920                 * rq->curr / membarrier_switch_mm() and returning to userspace.
4921                 *
4922                 * The below provides this either through switch_mm(), or in
4923                 * case 'prev->active_mm == next->mm' through
4924                 * finish_task_switch()'s mmdrop().
4925                 */
4926                switch_mm_irqs_off(prev->active_mm, next->mm, next);
4927
4928                if (!prev->mm) {                        // from kernel
4929                        /* will mmdrop() in finish_task_switch(). */
4930                        rq->prev_mm = prev->active_mm;
4931                        prev->active_mm = NULL;
4932                }
4933        }
4934
4935        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
4936
4937        prepare_lock_switch(rq, next, rf);
4938
4939        /* Here we just switch the register state and the stack. */
4940        switch_to(prev, next, prev);
4941        barrier();
4942
4943        return finish_task_switch(prev);
4944}
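
/*
 * Illustrative sketch (not kernel code): the active_mm handling above is
 * essentially reference counting.  A kernel thread has no mm of its own, so
 * it borrows the previous task's active_mm and pins it (mmgrab()); the pin
 * is dropped (mmdrop()) in finish_task_switch() once a task with its own mm
 * is switched in again.  The userspace model below uses made-up names and
 * only standard C; it sketches the four cases, it is not the real code.
 */
#if 0
#include <assert.h>
#include <stddef.h>

struct model_mm   { int refcount; };
struct model_task { struct model_mm *mm; struct model_mm *active_mm; };

static struct model_mm *deferred_drop;          /* models rq->prev_mm */

static void model_mmgrab(struct model_mm *mm) { mm->refcount++; }
static void model_mmdrop(struct model_mm *mm) { assert(--mm->refcount >= 0); }

static void model_switch(struct model_task *prev, struct model_task *next)
{
        if (!next->mm) {                        /* to kernel thread */
                next->active_mm = prev->active_mm;
                if (prev->mm)                   /* from user: pin the borrowed mm */
                        model_mmgrab(prev->active_mm);
                else
                        prev->active_mm = NULL;
        } else {                                /* to user task */
                next->active_mm = next->mm;
                if (!prev->mm) {                /* from kernel: defer the unpin */
                        deferred_drop = prev->active_mm;
                        prev->active_mm = NULL;
                }
        }

        /* finish_task_switch() counterpart: drop the deferred reference. */
        if (deferred_drop) {
                model_mmdrop(deferred_drop);
                deferred_drop = NULL;
        }
}
#endif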
4945
4946/*
4947 * nr_running and nr_context_switches:
4948 *
4949 * externally visible scheduler statistics: current number of runnable
4950 * threads, total number of context switches performed since bootup.
4951 */
4952unsigned int nr_running(void)
4953{
4954        unsigned int i, sum = 0;
4955
4956        for_each_online_cpu(i)
4957                sum += cpu_rq(i)->nr_running;
4958
4959        return sum;
4960}
4961
4962/*
4963 * Check if only the current task is running on the CPU.
4964 *
4965 * Caution: this function does not check that the caller has disabled
4966 * preemption, thus the result might have a time-of-check-to-time-of-use
4967 * race.  The caller is responsible to use it correctly, for example:
4968 *
4969 * - from a non-preemptible section (of course)
4970 *
4971 * - from a thread that is bound to a single CPU
4972 *
4973 * - in a loop with very short iterations (e.g. a polling loop)
4974 */
4975bool single_task_running(void)
4976{
4977        return raw_rq()->nr_running == 1;
4978}
4979EXPORT_SYMBOL(single_task_running);
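
/*
 * Hedged usage sketch for the "polling loop" case listed above (not taken
 * from the kernel tree): poll a completion condition only while this CPU has
 * nothing else runnable.  poll_complete() and the timeout are hypothetical.
 */
#if 0
static bool poll_for_event(u64 timeout_ns)
{
        u64 start = ktime_get_ns();

        do {
                if (poll_complete())            /* hypothetical condition */
                        return true;
                cpu_relax();
        } while (single_task_running() &&
                 ktime_get_ns() - start < timeout_ns);

        return false;
}
#endif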
4980
4981unsigned long long nr_context_switches(void)
4982{
4983        int i;
4984        unsigned long long sum = 0;
4985
4986        for_each_possible_cpu(i)
4987                sum += cpu_rq(i)->nr_switches;
4988
4989        return sum;
4990}
4991
4992/*
4993 * Consumers of these two interfaces, such as the cpuidle menu governor, are
4994 * using nonsensical data: they prefer a shallow idle state for a CPU that has
4995 * IO-wait pending, even though that CPU might not even end up running the task
4996 * when it does become runnable.
4997 */
4998
4999unsigned int nr_iowait_cpu(int cpu)
5000{
5001        return atomic_read(&cpu_rq(cpu)->nr_iowait);
5002}
5003
5004/*
5005 * IO-wait accounting, and how it's mostly bollocks (on SMP).
5006 *
5007 * The idea behind IO-wait accounting is to account the idle time that we could
5008 * have spent running if it were not for IO. That is, if we were to improve the
5009 * storage performance, we'd have a proportional reduction in IO-wait time.
5010 *
5011 * This all works nicely on UP, where, when a task blocks on IO, we account
5012 * idle time as IO-wait, because if the storage were faster, it could've been
5013 * running and we'd not be idle.
5014 *
5015 * This has been extended to SMP, by doing the same for each CPU. This however
5016 * is broken.
5017 *
5018 * Imagine for instance the case where two tasks block on one CPU, only the one
5019 * CPU will have IO-wait accounted, while the other has regular idle. Even
5020 * though, if the storage were faster, both could've run at the same time,
5021 * utilising both CPUs.
5022 *
5023 * This means, that when looking globally, the current IO-wait accounting on
5024 * SMP is a lower bound, due to under-accounting.
5025 *
5026 * Worse, since the numbers are provided per CPU, they are sometimes
5027 * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
5028 * associated with any one particular CPU; it can wake up on a different CPU than
5029 * the one it blocked on. This means the per-CPU IO-wait number is meaningless.
5030 *
5031 * Task CPU affinities can make all that even more 'interesting'.
5032 */
5033
5034unsigned int nr_iowait(void)
5035{
5036        unsigned int i, sum = 0;
5037
5038        for_each_possible_cpu(i)
5039                sum += nr_iowait_cpu(i);
5040
5041        return sum;
5042}
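
/*
 * Worked example of the under-accounting described above: two tasks block on
 * IO while last running on CPU0 and CPU1 stays idle.  nr_iowait_cpu(0) == 2
 * and nr_iowait_cpu(1) == 0, so CPU0's idle time is classified as IO-wait
 * while all of CPU1's idle time is reported as plain idle, even though with
 * faster storage one of the two blocked tasks could have been running there.
 * The summed IO-wait therefore understates the time "lost" to IO, which is
 * why the global number is only a lower bound.
 */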
5043
5044#ifdef CONFIG_SMP
5045
5046/*
5047 * sched_exec - execve() is a valuable balancing opportunity, because at
5048 * this point the task has the smallest effective memory and cache footprint.
5049 */
5050void sched_exec(void)
5051{
5052        struct task_struct *p = current;
5053        unsigned long flags;
5054        int dest_cpu;
5055
5056        raw_spin_lock_irqsave(&p->pi_lock, flags);
5057        dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
5058        if (dest_cpu == smp_processor_id())
5059                goto unlock;
5060
5061        if (likely(cpu_active(dest_cpu))) {
5062                struct migration_arg arg = { p, dest_cpu };
5063
5064                raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5065                stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
5066                return;
5067        }
5068unlock:
5069        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5070}
5071
5072#endif
5073
5074DEFINE_PER_CPU(struct kernel_stat, kstat);
5075DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
5076
5077EXPORT_PER_CPU_SYMBOL(kstat);
5078EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
5079
5080/*
5081 * The function fair_sched_class.update_curr accesses the struct curr
5082 * and its field curr->exec_start; when called from task_sched_runtime(),
5083 * we observe a high rate of cache misses in practice.
5084 * Prefetching this data results in improved performance.
5085 */
5086static inline void prefetch_curr_exec_start(struct task_struct *p)
5087{
5088#ifdef CONFIG_FAIR_GROUP_SCHED
5089        struct sched_entity *curr = (&p->se)->cfs_rq->curr;
5090#else
5091        struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
5092#endif
5093        prefetch(curr);
5094        prefetch(&curr->exec_start);
5095}
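
/*
 * Standalone sketch of the same pattern with the GCC/Clang builtin that the
 * generic prefetch() helpers are commonly built on (illustrative only; the
 * struct and update_entity() below are made up):
 */
#if 0
struct entity {
        unsigned long long exec_start;
        /* ... more fields ... */
};

static void update_entity(struct entity *e);    /* hypothetical heavy user */

static void touch_soon(struct entity *e)
{
        __builtin_prefetch(e);                  /* pull the object's line in early */
        __builtin_prefetch(&e->exec_start);     /* and the field we know we'll read */

        /* ... unrelated work the prefetch can overlap with ... */

        update_entity(e);
}
#endif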
5096
5097/*
5098 * Return accounted runtime for the task.
5099 * In case the task is currently running, return the runtime plus current's
5100 * pending runtime that has not been accounted yet.
5101 */
5102unsigned long long task_sched_runtime(struct task_struct *p)
5103{
5104        struct rq_flags rf;
5105        struct rq *rq;
5106        u64 ns;
5107
5108#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
5109        /*
5110         * 64-bit doesn't need locks to atomically read a 64-bit value.
5111         * So we have an optimization opportunity when the task's delta_exec is 0.
5112         * Reading ->on_cpu is racy, but this is ok.
5113         *
5114         * If we race with it leaving CPU, we'll take a lock. So we're correct.
5115         * If we race with it entering CPU, unaccounted time is 0. This is
5116         * indistinguishable from the read occurring a few cycles earlier.
5117         * If we see ->on_cpu without ->on_rq, the task is leaving, and has
5118         * been accounted, so we're correct here as well.
5119         */
5120        if (!p->on_cpu || !task_on_rq_queued(p))
5121                return p->se.sum_exec_runtime;
5122#endif
5123
5124        rq = task_rq_lock(p, &rf);
5125        /*
5126         * Must be ->curr _and_ ->on_rq.  If dequeued, we would
5127         * project cycles that may never be accounted to this
5128         * thread, breaking clock_gettime().
5129         */
5130        if (task_current(rq, p) && task_on_rq_queued(p)) {
5131                prefetch_curr_exec_start(p);
5132                update_rq_clock(rq);
5133                p->sched_class->update_curr(rq);
5134        }
5135        ns = p->se.sum_exec_runtime;
5136        task_rq_unlock(rq, p, &rf);
5137
5138        return ns;
5139}
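
/*
 * Userspace analogue of the lockless fast path above (illustrative only,
 * C11 atomics plus pthreads, all names invented; lock initialisation
 * omitted): the reader skips the lock whenever the worker is not currently
 * running, accepting a result that is at most "a few cycles" stale, exactly
 * as argued in the comment.
 */
#if 0
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>

struct runtime_acct {
        pthread_mutex_t lock;
        _Atomic int on_cpu;                     /* set while the worker runs */
        _Atomic uint64_t sum_exec_runtime;      /* published total, in ns */
        uint64_t pending;                       /* delta since last publish, under lock */
};

static uint64_t runtime_acct_read(struct runtime_acct *a)
{
        uint64_t ns;

        if (!atomic_load(&a->on_cpu))           /* racy check, a stale total is fine */
                return atomic_load(&a->sum_exec_runtime);

        pthread_mutex_lock(&a->lock);           /* running: fold in the pending delta */
        ns = atomic_load(&a->sum_exec_runtime) + a->pending;
        pthread_mutex_unlock(&a->lock);
        return ns;
}
#endif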
5140
5141#ifdef CONFIG_SCHED_DEBUG
5142static u64 cpu_resched_latency(struct rq *rq)
5143{
5144        int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
5145        u64 resched_latency, now = rq_clock(rq);
5146        static bool warned_once;
5147
5148        if (sysctl_resched_latency_warn_once && warned_once)
5149                return 0;
5150
5151        if (!need_resched() || !latency_warn_ms)
5152                return 0;
5153
5154        if (system_state == SYSTEM_BOOTING)
5155                return 0;
5156
5157        if (!rq->last_seen_need_resched_ns) {
5158                rq->last_seen_need_resched_ns = now;
5159                rq->ticks_without_resched = 0;
5160                return 0;
5161        }
5162
5163        rq->ticks_without_resched++;
5164        resched_latency = now - rq->last_seen_need_resched_ns;
5165        if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
5166                return 0;
5167
5168        warned_once = true;
5169
5170        return resched_latency;
5171}
5172
5173static int __init setup_resched_latency_warn_ms(char *str)
5174{
5175        long val;
5176
5177        if ((kstrtol(str, 0, &val))) {
5178                pr_warn("Unable to set resched_latency_warn_ms\n");
5179                return 1;
5180        }
5181
5182        sysctl_resched_latency_warn_ms = val;
5183        return 1;
5184}
5185__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
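
/*
 * Usage note: the __setup() above makes the threshold settable on the kernel
 * command line, e.g. "resched_latency_warn_ms=50" to warn after 50ms.  Per
 * cpu_resched_latency() above, "resched_latency_warn_ms=0" disables the
 * warning, since a zero threshold makes it bail out early.  This is only
 * available on CONFIG_SCHED_DEBUG kernels and only acted upon when the
 * LATENCY_WARN scheduler feature is enabled.
 */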
5186#else
5187static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
5188#endif /* CONFIG_SCHED_DEBUG */
5189
5190/*
5191 * This function gets called by the timer code, with HZ frequency.
5192 * We call it with interrupts disabled.
5193 */
5194void scheduler_tick(void)
5195{
5196        int cpu = smp_processor_id();
5197        struct rq *rq = cpu_rq(cpu);
5198        struct task_struct *curr = rq->curr;
5199        struct rq_flags rf;
5200        unsigned long thermal_pressure;
5201        u64 resched_latency;
5202
5203        arch_scale_freq_tick();
5204        sched_clock_tick();
5205
5206        rq_lock(rq, &rf);
5207
5208        update_rq_clock(rq);
5209        thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
5210        update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
5211        curr->sched_class->task_tick(rq, curr, 0);
5212        if (sched_feat(LATENCY_WARN))
5213                resched_latency = cpu_resched_latency(rq);
5214        calc_global_load_tick(rq);
5215
5216        rq_unlock(rq, &rf);
5217
5218        if (sched_feat(LATENCY_WARN) && resched_latency)
5219                resched_latency_warn(cpu, resched_latency);
5220
5221        perf_event_task_tick();
5222
5223#ifdef CONFIG_SMP
5224        rq->idle_balance = idle_cpu(cpu);
5225        trigger_load_balance(rq);
5226#endif
5227}
5228
5229#ifdef CONFIG_NO_HZ_FULL
5230
5231struct tick_work {
5232        int                     cpu;
5233        atomic_t                state;
5234        struct delayed_work     work;
5235};
5236/* Values for ->state, see diagram below. */
5237#define TICK_SCHED_REMOTE_OFFLINE       0
5238#define TICK_SCHED_REMOTE_OFFLINING     1
5239#define TICK_SCHED_REMOTE_RUNNING       2
5240
5241/*
5242 * State diagram for ->state:
5243 *
5244 *
5245 *          TICK_SCHED_REMOTE_OFFLINE
5246 *                    |   ^
5247 *                    |   |
5248 *                    |   | sched_tick_remote()
5249 *                    |   |
5250 *                    |   |
5251 *                    +--TICK_SCHED_REMOTE_OFFLINING
5252 *                    |   ^
5253 *                    |   |
5254 * sched_tick_start() |   | sched_tick_stop()
5255 *                    |   |
5256 *                    V   |
5257 *          TICK_SCHED_REMOTE_RUNNING
5258 *
5259 *
5260 * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
5261 * and sched_tick_start() are happy to leave the state in RUNNING.
5262 */
5263
5264static struct tick_work __percpu *tick_work_cpu;
5265
5266static void sched_tick_remote(struct work_struct *work)
5267{
5268        struct delayed_work *dwork = to_delayed_work(work);
5269        struct tick_work *twork = container_of(dwork, struct tick_work, work);
5270        int cpu = twork->cpu;
5271        struct rq *rq = cpu_rq(cpu);
5272        struct task_struct *curr;
5273        struct rq_flags rf;
5274        u64 delta;
5275        int os;
5276
5277        /*
5278         * Handle the tick only if it appears the remote CPU is running in full
5279         * dynticks mode. The check is racy by nature, but missing a tick or
5280         * having one too much is no big deal because the scheduler tick updates
5281         * having one too many is no big deal because the scheduler tick updates
5282         * of when exactly it is running.
5283         */
5284        if (!tick_nohz_tick_stopped_cpu(cpu))
5285                goto out_requeue;
5286
5287        rq_lock_irq(rq, &rf);
5288        curr = rq->curr;
5289        if (cpu_is_offline(cpu))
5290                goto out_unlock;
5291
5292        update_rq_clock(rq);
5293
5294        if (!is_idle_task(curr)) {
5295                /*
5296                 * Make sure the next tick runs within a reasonable
5297                 * amount of time.
5298                 */
5299                delta = rq_clock_task(rq) - curr->se.exec_start;
5300                WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
5301        }
5302        curr->sched_class->task_tick(rq, curr, 0);
5303
5304        calc_load_nohz_remote(rq);
5305out_unlock:
5306        rq_unlock_irq(rq, &rf);
5307out_requeue:
5308
5309        /*
5310         * Run the remote tick once per second (1Hz). This arbitrary
5311         * frequency is low enough to avoid overload but high enough
5312         * to keep the scheduler's internal stats reasonably up to date.  But
5313         * first update state to reflect hotplug activity if required.
5314         */
5315        os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
5316        WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
5317        if (os == TICK_SCHED_REMOTE_RUNNING)
5318                queue_delayed_work(system_unbound_wq, dwork, HZ);
5319}
5320
5321static void sched_tick_start(int cpu)
5322{
5323        int os;
5324        struct tick_work *twork;
5325
5326        if (housekeeping_cpu(cpu, HK_FLAG_TICK))
5327                return;
5328
5329        WARN_ON_ONCE(!tick_work_cpu);
5330
5331        twork = per_cpu_ptr(tick_work_cpu, cpu);
5332        os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
5333        WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
5334        if (os == TICK_SCHED_REMOTE_OFFLINE) {
5335                twork->cpu = cpu;
5336                INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
5337                queue_delayed_work(system_unbound_wq, &twork->work, HZ);
5338        }
5339}
5340
5341#ifdef CONFIG_HOTPLUG_CPU
5342static void sched_tick_stop(int cpu)
5343{
5344        struct tick_work *twork;
5345        int os;
5346
5347        if (housekeeping_cpu(cpu, HK_FLAG_TICK))
5348                return;
5349
5350        WARN_ON_ONCE(!tick_work_cpu);
5351
5352        twork = per_cpu_ptr(tick_work_cpu, cpu);
5353        /* There cannot be competing actions, but don't rely on stop-machine. */
5354        os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
5355        WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
5356        /* Don't cancel, as this would mess up the state machine. */
5357}
5358#endif /* CONFIG_HOTPLUG_CPU */
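
/*
 * Self-contained C11 sketch of the state machine above (illustrative only,
 * not kernel code).  The kernel's atomic_fetch_add_unless() has no direct
 * C11 counterpart, so it is emulated with a compare-exchange loop.
 */
#if 0
#include <stdatomic.h>

enum { MODEL_OFFLINE = 0, MODEL_OFFLINING = 1, MODEL_RUNNING = 2 };

static _Atomic int model_state = MODEL_OFFLINE;

/* Return the old value; add @add unless the current value equals @unless. */
static int model_fetch_add_unless(_Atomic int *v, int add, int unless)
{
        int old = atomic_load(v);

        while (old != unless &&
               !atomic_compare_exchange_weak(v, &old, old + add))
                ;
        return old;
}

static void model_tick_start(void) { atomic_exchange(&model_state, MODEL_RUNNING); }
static void model_tick_stop(void)  { atomic_exchange(&model_state, MODEL_OFFLINING); }

/* The remote tick requeues itself only while nobody has started offlining. */
static int model_tick_should_requeue(void)
{
        /* OFFLINING (1) decays to OFFLINE (0); RUNNING (2) is left untouched. */
        return model_fetch_add_unless(&model_state, -1, MODEL_RUNNING) == MODEL_RUNNING;
}
#endif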
5359
5360int __init sched_tick_offload_init(void)
5361{
5362        tick_work_cpu = alloc_percpu(struct tick_work);
5363        BUG_ON(!tick_work_cpu);
5364        return 0;
5365}
5366
5367#else /* !CONFIG_NO_HZ_FULL */
5368static inline void sched_tick_start(int cpu) { }
5369static inline void sched_tick_stop(int cpu) { }
5370#endif
5371
5372#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
5373                                defined(CONFIG_TRACE_PREEMPT_TOGGLE))
5374/*
5375 * If the value passed in is equal to the current preempt count
5376 * then we just disabled preemption. Start timing the latency.
5377 */
5378static inline void preempt_latency_start(int val)
5379{
5380        if (preempt_count() == val) {
5381                unsigned long ip = get_lock_parent_ip();
5382#ifdef CONFIG_DEBUG_PREEMPT
5383                current->preempt_disable_ip = ip;
5384#endif
5385                trace_preempt_off(CALLER_ADDR0, ip);
5386        }
5387}
5388
5389void preempt_count_add(int val)
5390{
5391#ifdef CONFIG_DEBUG_PREEMPT
5392        /*
5393         * Underflow?
5394         */
5395        if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
5396                return;
5397#endif
5398        __preempt_count_add(val);
5399#ifdef CONFIG_DEBUG_PREEMPT
5400        /*
5401         * Spinlock count overflowing soon?
5402         */
5403        DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
5404                                PREEMPT_MASK - 10);
5405#endif
5406        preempt_latency_start(val);
5407}
5408EXPORT_SYMBOL(preempt_count_add);
5409NOKPROBE_SYMBOL(preempt_count_add);
5410
5411/*
5412 * If the value passed in is equal to the current preempt count
5413 * then we just enabled preemption. Stop timing the latency.
5414 */
5415static inline void preempt_latency_stop(int val)
5416{
5417        if (preempt_count() == val)
5418                trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
5419}
5420
5421void preempt_count_sub(int val)
5422{
5423#ifdef CONFIG_DEBUG_PREEMPT
5424        /*
5425         * Underflow?
5426         */
5427        if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
5428                return;
5429        /*
5430         * Is the spinlock portion underflowing?
5431         */
5432        if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
5433                        !(preempt_count() & PREEMPT_MASK)))
5434                return;
5435#endif
5436
5437        preempt_latency_stop(val);
5438        __preempt_count_sub(val);
5439}
5440EXPORT_SYMBOL(preempt_count_sub);
5441NOKPROBE_SYMBOL(preempt_count_sub);
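
/*
 * Standalone sketch of the pairing rule above (illustrative, not kernel
 * code): only the outermost disable/enable of a nest starts/stops the
 * latency measurement, because the "count == val" test only matches when
 * the counter is crossing the zero boundary for the value being added or
 * removed.
 */
#if 0
static int model_preempt_count;                 /* per-CPU in reality */

static void model_latency_start(void) { /* record a start timestamp */ }
static void model_latency_stop(void)  { /* report the elapsed time */ }

static void model_count_add(int val)
{
        model_preempt_count += val;
        if (model_preempt_count == val)         /* was zero: just disabled */
                model_latency_start();
}

static void model_count_sub(int val)
{
        if (model_preempt_count == val)         /* about to hit zero: re-enabling */
                model_latency_stop();
        model_preempt_count -= val;
}

static void model_preempt_disable(void) { model_count_add(1); }
static void model_preempt_enable(void)  { model_count_sub(1); }

/*
 * model_preempt_disable();    -> starts timing  (0 -> 1)
 *   model_preempt_disable();  -> nested, no event (1 -> 2)
 *   model_preempt_enable();   -> nested, no event (2 -> 1)
 * model_preempt_enable();     -> stops timing   (1 -> 0)
 */
#endif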
5442
5443#else
5444static inline void preempt_latency_start(int val) { }
5445static inline void preempt_latency_stop(int val) { }
5446#endif
5447
5448static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
5449{
5450#ifdef CONFIG_DEBUG_PREEMPT
5451        return p->preempt_disable_ip;
5452#else
5453        return 0;
5454#endif
5455}
5456
5457/*
5458 * Print scheduling while atomic bug:
5459 */
5460static noinline void __schedule_bug(struct task_struct *prev)
5461{
5462        /* Save this before calling printk(), since that will clobber it */
5463        unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
5464
5465        if (oops_in_progress)
5466                return;
5467
5468        printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
5469                prev->comm, prev->pid, preempt_count());
5470
5471        debug_show_held_locks(prev);
5472        print_modules();
5473        if (irqs_disabled())
5474                print_irqtrace_events(prev);
5475        if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
5476            && in_atomic_preempt_off()) {
5477                pr_err("Preemption disabled at:");
5478                print_ip_sym(KERN_ERR, preempt_disable_ip);
5479        }
5480        if (panic_on_warn)
5481                panic("scheduling while atomic\n");
5482
5483        dump_stack();
5484        add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5485}
5486
5487/*
5488 * Various schedule()-time debugging checks and statistics:
5489 */
5490static inline void schedule_debug(struct task_struct *prev, bool preempt)
5491{
5492#ifdef CONFIG_SCHED_STACK_END_CHECK
5493        if (task_stack_end_corrupted(prev))
5494                panic("corrupted stack end detected inside scheduler\n");
5495
5496        if (task_scs_end_corrupted(prev))
5497                panic("corrupted shadow stack detected inside scheduler\n");
5498#endif
5499
5500#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
5501        if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
5502                printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
5503                        prev->comm, prev->pid, prev->non_block_count);
5504                dump_stack();
5505                add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5506        }
5507#endif
5508
5509        if (unlikely(in_atomic_preempt_off())) {
5510                __schedule_bug(prev);
5511                preempt_count_set(PREEMPT_DISABLED);
5512        }
5513        rcu_sleep_check();
5514        SCHED_WARN_ON(ct_state() == CONTEXT_USER);
5515
5516        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
5517
5518        schedstat_inc(this_rq()->sched_count);
5519}
5520
5521static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
5522                                  struct rq_flags *rf)
5523{
5524#ifdef CONFIG_SMP
5525        const struct sched_class *class;
5526        /*
5527         * We must do the balancing pass before put_prev_task(), such
5528         * that when we release the rq->lock the task is in the same
5529         * state as before we took rq->lock.
5530         *
5531         * We can terminate the balance pass as soon as we know there is
5532         * a runnable task of @class priority or higher.
5533         */
5534        for_class_range(class, prev->sched_class, &idle_sched_class) {
5535                if (class->balance(rq, prev, rf))
5536                        break;
5537        }
5538#endif
5539
5540        put_prev_task(rq, prev);
5541}
5542
5543/*
5544 * Pick up the highest-prio task:
5545 */
5546static inline struct task_struct *
5547__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
5548{
5549        const struct sched_class *class;
5550        struct task_struct *p;
5551
5552        /*
5553         * Optimization: we know that if all tasks are in the fair class we can
5554         * call that function directly, but only if the @prev task wasn't of a
5555         * higher scheduling class, because otherwise those lose the
5556         * opportunity to pull in more work from other CPUs.
5557         */
5558        if (likely(prev->sched_class <= &fair_sched_class &&
5559                   rq->nr_running == rq->cfs.h_nr_running)) {
5560
5561                p = pick_next_task_fair(rq, prev, rf);
5562                if (unlikely(p == RETRY_TASK))
5563                        goto restart;
5564
5565                /* Assume the next prioritized class is idle_sched_class */
5566                if (!p) {
5567                        put_prev_task(rq, prev);
5568                        p = pick_next_task_idle(rq);
5569                }
5570
5571                return p;
5572        }
5573
5574restart:
5575        put_prev_task_balance(rq, prev, rf);
5576
5577        for_each_class(class) {
5578                p = class->pick_next_task(rq);
5579                if (p)
5580                        return p;
5581        }
5582
5583        /* The idle class should always have a runnable task: */
5584        BUG();
5585}
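
/*
 * Minimal standalone model of the class-ordered pick loop above
 * (illustrative only).  The array stands in for the kernel's priority-ordered
 * list of scheduling classes, highest first, with idle last; the per-class
 * pickers are hypothetical and the idle one is assumed to never fail.
 */
#if 0
#include <stddef.h>

struct model_task;
typedef struct model_task *(*model_pick_fn)(void);

static struct model_task *model_pick_stop(void);
static struct model_task *model_pick_dl(void);
static struct model_task *model_pick_rt(void);
static struct model_task *model_pick_fair(void);
static struct model_task *model_pick_idle(void);        /* never returns NULL */

static const model_pick_fn model_classes[] = {
        model_pick_stop, model_pick_dl, model_pick_rt,
        model_pick_fair, model_pick_idle,
};

static struct model_task *model_pick_next(void)
{
        size_t i;

        for (i = 0; i < sizeof(model_classes) / sizeof(model_classes[0]); i++) {
                struct model_task *p = model_classes[i]();

                if (p)
                        return p;
        }
        return NULL;    /* unreachable while the idle picker cannot fail */
}
#endif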
5586
5587#ifdef CONFIG_SCHED_CORE
5588static inline bool is_task_rq_idle(struct task_struct *t)
5589{
5590        return (task_rq(t)->idle == t);
5591}
5592
5593static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
5594{
5595        return is_task_rq_idle(a) || (a->core_cookie == cookie);
5596}
5597
5598static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
5599{
5600        if (is_task_rq_idle(a) || is_task_rq_idle(b))
5601                return true;
5602
5603        return a->core_cookie == b->core_cookie;
5604}
5605
5606// XXX fairness/fwd progress conditions
5607/*
5608 * Returns
5609 * - NULL if there is no runnable task for this class.
5610 * - the highest priority task for this runqueue if it matches
5611 *   rq->core->core_cookie or its priority is greater than max.
5612 * - Else returns idle_task.
5613 */
5614static struct task_struct *
5615pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi)
5616{
5617        struct task_struct *class_pick, *cookie_pick;
5618        unsigned long cookie = rq->core->core_cookie;
5619
5620        class_pick = class->pick_task(rq);
5621        if (!class_pick)
5622                return NULL;
5623
5624        if (!cookie) {
5625                /*
5626                 * If class_pick is tagged, return it only if it has
5627                 * higher priority than max.
5628                 */
5629                if (max && class_pick->core_cookie &&
5630                    prio_less(class_pick, max, in_fi))
5631                        return idle_sched_class.pick_task(rq);
5632
5633                return class_pick;
5634        }
5635
5636        /*
5637         * If class_pick is idle or matches cookie, return early.
5638         */
5639        if (cookie_equals(class_pick, cookie))
5640                return class_pick;
5641
5642        cookie_pick = sched_core_find(rq, cookie);
5643
5644        /*
5645         * If class > max && class > cookie, it is the highest priority task on
5646         * the core (so far) and it must be selected, otherwise we must go with
5647         * the cookie pick in order to satisfy the constraint.
5648         */
5649        if (prio_less(cookie_pick, class_pick, in_fi) &&
5650            (!max || prio_less(max, class_pick, in_fi)))
5651                return class_pick;
5652
5653        return cookie_pick;
5654}
5655
5656extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
5657
5658static struct task_struct *
5659pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
5660{
5661        struct task_struct *next, *max = NULL;
5662        const struct sched_class *class;
5663        const struct cpumask *smt_mask;
5664        bool fi_before = false;
5665        int i, j, cpu, occ = 0;
5666        bool need_sync;
5667
5668        if (!sched_core_enabled(rq))
5669                return __pick_next_task(rq, prev, rf);
5670
5671        cpu = cpu_of(rq);
5672
5673        /* Stopper task is switching into idle, no need for core-wide selection. */
5674        if (cpu_is_offline(cpu)) {
5675                /*
5676                 * Reset core_pick so that we don't enter the fastpath when
5677                 * coming online. The picked task would already have been
5678                 * migrated to another CPU while this one was offline.
5679                 */
5680                rq->core_pick = NULL;
5681                return __pick_next_task(rq, prev, rf);
5682        }
5683
5684        /*
5685         * If there were no {en,de}queues since we picked (IOW, the task
5686         * pointers are all still valid), and we haven't scheduled the last
5687         * pick yet, do so now.
5688         *
5689         * rq->core_pick can be NULL if no selection was made for a CPU because
5690         * it was either offline or went offline during a sibling's core-wide
5691         * selection. In this case, do a core-wide selection.
5692         */
5693        if (rq->core->core_pick_seq == rq->core->core_task_seq &&
5694            rq->core->core_pick_seq != rq->core_sched_seq &&
5695            rq->core_pick) {
5696                WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
5697
5698                next = rq->core_pick;
5699                if (next != prev) {
5700                        put_prev_task(rq, prev);
5701                        set_next_task(rq, next);
5702                }
5703
5704                rq->core_pick = NULL;
5705                return next;
5706        }
5707
5708        put_prev_task_balance(rq, prev, rf);
5709
5710        smt_mask = cpu_smt_mask(cpu);
5711        need_sync = !!rq->core->core_cookie;
5712
5713        /* reset state */
5714        rq->core->core_cookie = 0UL;
5715        if (rq->core->core_forceidle) {
5716                need_sync = true;
5717                fi_before = true;
5718                rq->core->core_forceidle = false;
5719        }
5720
5721        /*
5722         * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
5723         *
5724         * @task_seq guards the task state ({en,de}queues)
5725         * @pick_seq is the @task_seq we did a selection on
5726         * @sched_seq is the @pick_seq we scheduled
5727         *
5728         * However, preemptions can cause multiple picks on the same task set.
5729         * 'Fix' this by also increasing @task_seq for every pick.
5730         */
5731        rq->core->core_task_seq++;
5732
5733        /*
5734         * Optimize for common case where this CPU has no cookies
5735         * and there are no cookied tasks running on siblings.
5736         */
5737        if (!need_sync) {
5738                for_each_class(class) {
5739                        next = class->pick_task(rq);
5740                        if (next)
5741                                break;
5742                }
5743
5744                if (!next->core_cookie) {
5745                        rq->core_pick = NULL;
5746                        /*
5747                         * For robustness, update the min_vruntime_fi for
5748                         * unconstrained picks as well.
5749                         */
5750                        WARN_ON_ONCE(fi_before);
5751                        task_vruntime_update(rq, next, false);
5752                        goto done;
5753                }
5754        }
5755
5756        for_each_cpu(i, smt_mask) {
5757                struct rq *rq_i = cpu_rq(i);
5758
5759                rq_i->core_pick = NULL;
5760
5761                if (i != cpu)
5762                        update_rq_clock(rq_i);
5763        }
5764
5765        /*
5766         * Try and select tasks for each sibling in descending sched_class
5767         * order.
5768         */
5769        for_each_class(class) {
5770again:
5771                for_each_cpu_wrap(i, smt_mask, cpu) {
5772                        struct rq *rq_i = cpu_rq(i);
5773                        struct task_struct *p;
5774
5775                        if (rq_i->core_pick)
5776                                continue;
5777
5778                        /*
5779                         * If this sibling doesn't yet have a suitable task to
5780                         * run; ask for the most eligible task, given the
5781                         * highest priority task already selected for this
5782                         * core.
5783                         */
5784                        p = pick_task(rq_i, class, max, fi_before);
5785                        if (!p)
5786                                continue;
5787
5788                        if (!is_task_rq_idle(p))
5789                                occ++;
5790
5791                        rq_i->core_pick = p;
5792                        if (rq_i->idle == p && rq_i->nr_running) {
5793                                rq->core->core_forceidle = true;
5794                                if (!fi_before)
5795                                        rq->core->core_forceidle_seq++;
5796                        }
5797
5798                        /*
5799                         * If this new candidate is of higher priority than the
5800                         * previous, and they're incompatible, we need to wipe
5801                         * the slate and start over. pick_task makes sure that
5802                         * p's priority is more than max if it doesn't match
5803                         * max's cookie.
5804                         *
5805                         * NOTE: this is a linear max-filter and is thus bounded
5806                         * in execution time.
5807                         */
5808                        if (!max || !cookie_match(max, p)) {
5809                                struct task_struct *old_max = max;
5810
5811                                rq->core->core_cookie = p->core_cookie;
5812                                max = p;
5813
5814                                if (old_max) {
5815                                        rq->core->core_forceidle = false;
5816                                        for_each_cpu(j, smt_mask) {
5817                                                if (j == i)
5818                                                        continue;
5819
5820                                                cpu_rq(j)->core_pick = NULL;
5821                                        }
5822                                        occ = 1;
5823                                        goto again;
5824                                }
5825                        }
5826                }
5827        }
5828
5829        rq->core->core_pick_seq = rq->core->core_task_seq;
5830        next = rq->core_pick;
5831        rq->core_sched_seq = rq->core->core_pick_seq;
5832
5833        /* Something should have been selected for current CPU */
5834        WARN_ON_ONCE(!next);
5835
5836        /*
5837         * Reschedule siblings
5838         *
5839         * NOTE: L1TF -- at this point we're no longer running the old task and
5840         * sending an IPI (below) ensures the sibling will no longer be running
5841         * their task. This ensures there is no inter-sibling overlap between
5842         * non-matching user state.
5843         */
5844        for_each_cpu(i, smt_mask) {
5845                struct rq *rq_i = cpu_rq(i);
5846
5847                /*
5848                 * An online sibling might have gone offline before a task
5849                 * could be picked for it, or it might be offline but later
5850                 * happen to come online, but it's too late and nothing was
5851                 * picked for it.  That's OK - it will pick tasks for itself,
5852                 * so ignore it.
5853                 */
5854                if (!rq_i->core_pick)
5855                        continue;
5856
5857                /*
5858                 * Update for new !FI->FI transitions, or if continuing to be in !FI:
5859                 * fi_before     fi      update?
5860                 *  0            0       1
5861                 *  0            1       1
5862                 *  1            0       1
5863                 *  1            1       0
5864                 */
5865                if (!(fi_before && rq->core->core_forceidle))
5866                        task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
5867
5868                rq_i->core_pick->core_occupation = occ;
5869
5870                if (i == cpu) {
5871                        rq_i->core_pick = NULL;
5872                        continue;
5873                }
5874
5875                /* Did we break L1TF mitigation requirements? */
5876                WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
5877
5878                if (rq_i->curr == rq_i->core_pick) {
5879                        rq_i->core_pick = NULL;
5880                        continue;
5881                }
5882
5883                resched_curr(rq_i);
5884        }
5885
5886done:
5887        set_next_task(rq, next);
5888        return next;
5889}
5890
5891static bool try_steal_cookie(int this, int that)
5892{
5893        struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
5894        struct task_struct *p;
5895        unsigned long cookie;
5896        bool success = false;
5897
5898        local_irq_disable();
5899        double_rq_lock(dst, src);
5900
5901        cookie = dst->core->core_cookie;
5902        if (!cookie)
5903                goto unlock;
5904
5905        if (dst->curr != dst->idle)
5906                goto unlock;
5907
5908        p = sched_core_find(src, cookie);
5909        if (p == src->idle)
5910                goto unlock;
5911
5912        do {
5913                if (p == src->core_pick || p == src->curr)
5914                        goto next;
5915
5916                if (!cpumask_test_cpu(this, &p->cpus_mask))
5917                        goto next;
5918
5919                if (p->core_occupation > dst->idle->core_occupation)
5920                        goto next;
5921
5922                deactivate_task(src, p, 0);
5923                set_task_cpu(p, this);
5924                activate_task(dst, p, 0);
5925
5926                resched_curr(dst);
5927
5928                success = true;
5929                break;
5930
5931next:
5932                p = sched_core_next(p, cookie);
5933        } while (p);
5934
5935unlock:
5936        double_rq_unlock(dst, src);
5937        local_irq_enable();
5938
5939        return success;
5940}
5941
5942static bool steal_cookie_task(int cpu, struct sched_domain *sd)
5943{
5944        int i;
5945
5946        for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
5947                if (i == cpu)
5948                        continue;
5949
5950                if (need_resched())
5951                        break;
5952
5953                if (try_steal_cookie(cpu, i))
5954                        return true;
5955        }
5956
5957        return false;
5958}
5959
5960static void sched_core_balance(struct rq *rq)
5961{
5962        struct sched_domain *sd;
5963        int cpu = cpu_of(rq);
5964
5965        preempt_disable();
5966        rcu_read_lock();
5967        raw_spin_rq_unlock_irq(rq);
5968        for_each_domain(cpu, sd) {
5969                if (need_resched())
5970                        break;
5971
5972                if (steal_cookie_task(cpu, sd))
5973                        break;
5974        }
5975        raw_spin_rq_lock_irq(rq);
5976        rcu_read_unlock();
5977        preempt_enable();
5978}
5979
5980static DEFINE_PER_CPU(struct callback_head, core_balance_head);
5981
5982void queue_core_balance(struct rq *rq)
5983{
5984        if (!sched_core_enabled(rq))
5985                return;
5986
5987        if (!rq->core->core_cookie)
5988                return;
5989
5990        if (!rq->nr_running) /* not forced idle */
5991                return;
5992
5993        queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
5994}
5995
5996static void sched_core_cpu_starting(unsigned int cpu)
5997{
5998        const struct cpumask *smt_mask = cpu_smt_mask(cpu);
5999        struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
6000        unsigned long flags;
6001        int t;
6002
6003        sched_core_lock(cpu, &flags);
6004
6005        WARN_ON_ONCE(rq->core != rq);
6006
6007        /* if we're the first, we'll be our own leader */
6008        if (cpumask_weight(smt_mask) == 1)
6009                goto unlock;
6010
6011        /* find the leader */
6012        for_each_cpu(t, smt_mask) {
6013                if (t == cpu)
6014                        continue;
6015                rq = cpu_rq(t);
6016                if (rq->core == rq) {
6017                        core_rq = rq;
6018                        break;
6019                }
6020        }
6021
6022        if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
6023                goto unlock;
6024
6025        /* install and validate core_rq */
6026        for_each_cpu(t, smt_mask) {
6027                rq = cpu_rq(t);
6028
6029                if (t == cpu)
6030                        rq->core = core_rq;
6031
6032                WARN_ON_ONCE(rq->core != core_rq);
6033        }
6034
6035unlock:
6036        sched_core_unlock(cpu, &flags);
6037}
6038
6039static void sched_core_cpu_deactivate(unsigned int cpu)
6040{
6041        const struct cpumask *smt_mask = cpu_smt_mask(cpu);
6042        struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
6043        unsigned long flags;
6044        int t;
6045
6046        sched_core_lock(cpu, &flags);
6047
6048        /* if we're the last man standing, nothing to do */
6049        if (cpumask_weight(smt_mask) == 1) {
6050                WARN_ON_ONCE(rq->core != rq);
6051                goto unlock;
6052        }
6053
6054        /* if we're not the leader, nothing to do */
6055        if (rq->core != rq)
6056                goto unlock;
6057
6058        /* find a new leader */
6059        for_each_cpu(t, smt_mask) {
6060                if (t == cpu)
6061                        continue;
6062                core_rq = cpu_rq(t);
6063                break;
6064        }
6065
6066        if (WARN_ON_ONCE(!core_rq)) /* impossible */
6067                goto unlock;
6068
6069        /* copy the shared state to the new leader */
6070        core_rq->core_task_seq      = rq->core_task_seq;
6071        core_rq->core_pick_seq      = rq->core_pick_seq;
6072        core_rq->core_cookie        = rq->core_cookie;
6073        core_rq->core_forceidle     = rq->core_forceidle;
6074        core_rq->core_forceidle_seq = rq->core_forceidle_seq;
6075
6076        /* install new leader */
6077        for_each_cpu(t, smt_mask) {
6078                rq = cpu_rq(t);
6079                rq->core = core_rq;
6080        }
6081
6082unlock:
6083        sched_core_unlock(cpu, &flags);
6084}
6085
6086static inline void sched_core_cpu_dying(unsigned int cpu)
6087{
6088        struct rq *rq = cpu_rq(cpu);
6089
6090        if (rq->core != rq)
6091                rq->core = rq;
6092}
6093
6094#else /* !CONFIG_SCHED_CORE */
6095
6096static inline void sched_core_cpu_starting(unsigned int cpu) {}
6097static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
6098static inline void sched_core_cpu_dying(unsigned int cpu) {}
6099
6100static struct task_struct *
6101pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6102{
6103        return __pick_next_task(rq, prev, rf);
6104}
6105
6106#endif /* CONFIG_SCHED_CORE */
6107
6108/*
6109 * Constants for the sched_mode argument of __schedule().
6110 *
6111 * The mode argument allows RT enabled kernels to differentiate a
6112 * preemption from blocking on an 'sleeping' spin/rwlock. Note that
6113 * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
6114 * optimize the AND operation out and just check for zero.
6115 */
6116#define SM_NONE                 0x0
6117#define SM_PREEMPT              0x1
6118#define SM_RTLOCK_WAIT          0x2
6119
6120#ifndef CONFIG_PREEMPT_RT
6121# define SM_MASK_PREEMPT        (~0U)
6122#else
6123# define SM_MASK_PREEMPT        SM_PREEMPT
6124#endif
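
/*
 * Worked illustration of the masking above:
 *
 *   !PREEMPT_RT: SM_MASK_PREEMPT == ~0U
 *     SM_NONE        & mask == 0  -> regular schedule(), task state inspected
 *     SM_PREEMPT     & mask != 0  -> preemption path
 *     (the AND is a no-op, so the compiler just tests sched_mode for zero)
 *
 *   PREEMPT_RT:  SM_MASK_PREEMPT == SM_PREEMPT
 *     SM_PREEMPT     & mask != 0  -> preemption path
 *     SM_RTLOCK_WAIT & mask == 0  -> treated like a voluntary sleep, so the
 *                                    task state is inspected as for SM_NONE
 */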
6125
6126/*
6127 * __schedule() is the main scheduler function.
6128 *
6129 * The main means of driving the scheduler and thus entering this function are:
6130 *
6131 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
6132 *
6133 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
6134 *      paths. For example, see arch/x86/entry_64.S.
6135 *
6136 *      To drive preemption between tasks, the scheduler sets the flag in timer
6137 *      interrupt handler scheduler_tick().
6138 *
6139 *   3. Wakeups don't really cause entry into schedule(). They add a
6140 *      task to the run-queue and that's it.
6141 *
6142 *      Now, if the new task added to the run-queue preempts the current
6143 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
6144 *      called on the nearest possible occasion:
6145 *
6146 *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
6147 *
6148 *         - in syscall or exception context, at the next outermost
6149 *           preempt_enable(). (this might be as soon as the wake_up()'s
6150 *           spin_unlock()!)
6151 *
6152 *         - in IRQ context, return from interrupt-handler to
6153 *           preemptible context
6154 *
6155 *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
6156 *         then at the next:
6157 *
6158 *          - cond_resched() call
6159 *          - explicit schedule() call
6160 *          - return from syscall or exception to user-space
6161 *          - return from interrupt-handler to user-space
6162 *
6163 * WARNING: must be called with preemption disabled!
6164 */
6165static void __sched notrace __schedule(unsigned int sched_mode)
6166{
6167        struct task_struct *prev, *next;
6168        unsigned long *switch_count;
6169        unsigned long prev_state;
6170        struct rq_flags rf;
6171        struct rq *rq;
6172        int cpu;
6173
6174        cpu = smp_processor_id();
6175        rq = cpu_rq(cpu);
6176        prev = rq->curr;
6177
6178        schedule_debug(prev, !!sched_mode);
6179
6180        if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
6181                hrtick_clear(rq);
6182
6183        local_irq_disable();
6184        rcu_note_context_switch(!!sched_mode);
6185
6186        /*
6187         * Make sure that signal_pending_state()->signal_pending() below
6188         * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
6189         * done by the caller to avoid the race with signal_wake_up():
6190         *
6191         * __set_current_state(@state)          signal_wake_up()
6192         * schedule()                             set_tsk_thread_flag(p, TIF_SIGPENDING)
6193         *                                        wake_up_state(p, state)
6194         *   LOCK rq->lock                          LOCK p->pi_state
6195         *   smp_mb__after_spinlock()               smp_mb__after_spinlock()
6196         *     if (signal_pending_state())          if (p->state & @state)
6197         *
6198         * Also, the membarrier system call requires a full memory barrier
6199         * after coming from user-space, before storing to rq->curr.
6200         */
6201        rq_lock(rq, &rf);
6202        smp_mb__after_spinlock();
6203
6204        /* Promote REQ to ACT */
6205        rq->clock_update_flags <<= 1;
6206        update_rq_clock(rq);
6207
6208        switch_count = &prev->nivcsw;
6209
6210        /*
6211         * We must load prev->state once (task_struct::state is volatile), such
6212         * that:
6213         *
6214         *  - we form a control dependency vs deactivate_task() below.
6215         *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
6216         */
6217        prev_state = READ_ONCE(prev->__state);
6218        if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
6219                if (signal_pending_state(prev_state, prev)) {
6220                        WRITE_ONCE(prev->__state, TASK_RUNNING);
6221                } else {
6222                        prev->sched_contributes_to_load =
6223                                (prev_state & TASK_UNINTERRUPTIBLE) &&
6224                                !(prev_state & TASK_NOLOAD) &&
6225                                !(prev->flags & PF_FROZEN);
6226
6227                        if (prev->sched_contributes_to_load)
6228                                rq->nr_uninterruptible++;
6229
6230                        /*
6231                         * __schedule()                 ttwu()
6232                         *   prev_state = prev->state;    if (p->on_rq && ...)
6233                         *   if (prev_state)                goto out;
6234                         *     p->on_rq = 0;              smp_acquire__after_ctrl_dep();
6235                         *                                p->state = TASK_WAKING
6236                         *
6237                         * Where __schedule() and ttwu() have matching control dependencies.
6238                         *
6239                         * After this, schedule() must not care about p->state any more.
6240                         */
6241                        deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
6242
6243                        if (prev->in_iowait) {
6244                                atomic_inc(&rq->nr_iowait);
6245                                delayacct_blkio_start();
6246                        }
6247                }
6248                switch_count = &prev->nvcsw;
6249        }
6250
6251        next = pick_next_task(rq, prev, &rf);
6252        clear_tsk_need_resched(prev);
6253        clear_preempt_need_resched();
6254#ifdef CONFIG_SCHED_DEBUG
6255        rq->last_seen_need_resched_ns = 0;
6256#endif
6257
6258        if (likely(prev != next)) {
6259                rq->nr_switches++;
6260                /*
6261                 * RCU users of rcu_dereference(rq->curr) may not see
6262                 * changes to task_struct made by pick_next_task().
6263                 */
6264                RCU_INIT_POINTER(rq->curr, next);
6265                /*
6266                 * The membarrier system call requires each architecture
6267                 * to have a full memory barrier after updating
6268                 * rq->curr, before returning to user-space.
6269                 *
6270                 * Here are the schemes providing that barrier on the
6271                 * various architectures:
6272                 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
6273                 *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
6274                 * - finish_lock_switch() for weakly-ordered
6275                 *   architectures where spin_unlock is a full barrier,
6276                 * - switch_to() for arm64 (weakly-ordered, spin_unlock
6277                 *   is a RELEASE barrier),
6278                 */
6279                ++*switch_count;
6280
6281                migrate_disable_switch(rq, prev);
6282                psi_sched_switch(prev, next, !task_on_rq_queued(prev));
6283
6284                trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
6285
6286                /* Also unlocks the rq: */
6287                rq = context_switch(rq, prev, next, &rf);
6288        } else {
6289                rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
6290
6291                rq_unpin_lock(rq, &rf);
6292                __balance_callbacks(rq);
6293                raw_spin_rq_unlock_irq(rq);
6294        }
6295}
6296
6297void __noreturn do_task_dead(void)
6298{
6299        /* Causes final put_task_struct in finish_task_switch(): */
6300        set_special_state(TASK_DEAD);
6301
6302        /* Tell freezer to ignore us: */
6303        current->flags |= PF_NOFREEZE;
6304
6305        __schedule(SM_NONE);
6306        BUG();
6307
6308        /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
6309        for (;;)
6310                cpu_relax();
6311}
6312
6313static inline void sched_submit_work(struct task_struct *tsk)
6314{
6315        unsigned int task_flags;
6316
6317        if (task_is_running(tsk))
6318                return;
6319
6320        task_flags = tsk->flags;
6321        /*
6322         * If a worker went to sleep, notify and ask workqueue whether
6323         * it wants to wake up a task to maintain concurrency.
6324         * As this function is called inside the schedule() context,
6325         * As this function is called from inside schedule(), we disable
6326         * preemption to avoid it calling schedule() again during the
6327         * possible wakeup of a kworker, and because wq_worker_sleeping()
6328         */
6329        if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
6330                preempt_disable();
6331                if (task_flags & PF_WQ_WORKER)
6332                        wq_worker_sleeping(tsk);
6333                else
6334                        io_wq_worker_sleeping(tsk);
6335                preempt_enable_no_resched();
6336        }
6337
6338        if (tsk_is_pi_blocked(tsk))
6339                return;
6340
6341        /*
6342         * If we are going to sleep and we have plugged IO queued,
6343         * make sure to submit it to avoid deadlocks.
6344         */
6345        if (blk_needs_flush_plug(tsk))
6346                blk_schedule_flush_plug(tsk);
6347}
6348
6349static void sched_update_worker(struct task_struct *tsk)
6350{
6351        if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
6352                if (tsk->flags & PF_WQ_WORKER)
6353                        wq_worker_running(tsk);
6354                else
6355                        io_wq_worker_running(tsk);
6356        }
6357}
6358
6359asmlinkage __visible void __sched schedule(void)
6360{
6361        struct task_struct *tsk = current;
6362
6363        sched_submit_work(tsk);
6364        do {
6365                preempt_disable();
6366                __schedule(SM_NONE);
6367                sched_preempt_enable_no_resched();
6368        } while (need_resched());
6369        sched_update_worker(tsk);
6370}
6371EXPORT_SYMBOL(schedule);
6372
6373/*
6374 * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
6375 * state (i.e. has scheduled out non-voluntarily) by making sure that all
6376 * tasks have either left the run queue or have gone into user space.
6377 * As idle tasks do not do either, they must not ever be preempted
6378 * (schedule out non-voluntarily).
6379 *
6380 * schedule_idle() is similar to schedule_preempt_disabled() except that it
6381 * never enables preemption because it does not call sched_submit_work().
6382 */
6383void __sched schedule_idle(void)
6384{
6385        /*
6386         * As this skips calling sched_submit_work(), which the idle task does
6387         * regardless because that function is a nop when the task is in a
6388         * TASK_RUNNING state, make sure this isn't used someplace where the
6389         * current task can be in any other state. Note: idle is always in the
6390         * TASK_RUNNING state.
6391         */
6392        WARN_ON_ONCE(current->__state);
6393        do {
6394                __schedule(SM_NONE);
6395        } while (need_resched());
6396}
6397
6398#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK)
6399asmlinkage __visible void __sched schedule_user(void)
6400{
6401        /*
6402         * If we come here after a random call to set_need_resched(),
6403         * or we have been woken up remotely but the IPI has not yet arrived,
6404         * we haven't yet exited the RCU idle mode. Do it here manually until
6405         * we find a better solution.
6406         *
6407         * NB: There are buggy callers of this function.  Ideally we
6408         * should warn if prev_state != CONTEXT_USER, but that will trigger
6409         * too frequently to make sense yet.
6410         */
6411        enum ctx_state prev_state = exception_enter();
6412        schedule();
6413        exception_exit(prev_state);
6414}
6415#endif
6416
6417/**
6418 * schedule_preempt_disabled - called with preemption disabled
6419 *
6420 * Returns with preemption disabled. Note: preempt_count must be 1
6421 */
6422void __sched schedule_preempt_disabled(void)
6423{
6424        sched_preempt_enable_no_resched();
6425        schedule();
6426        preempt_disable();
6427}
6428
6429#ifdef CONFIG_PREEMPT_RT
6430void __sched notrace schedule_rtlock(void)
6431{
6432        do {
6433                preempt_disable();
6434                __schedule(SM_RTLOCK_WAIT);
6435                sched_preempt_enable_no_resched();
6436        } while (need_resched());
6437}
6438NOKPROBE_SYMBOL(schedule_rtlock);
6439#endif
6440
6441static void __sched notrace preempt_schedule_common(void)
6442{
6443        do {
6444                /*
6445                 * Because the function tracer can trace preempt_count_sub()
6446                 * and it also uses preempt_enable/disable_notrace(), if
6447                 * NEED_RESCHED is set, the preempt_enable_notrace() called
6448                 * by the function tracer will call this function again and
6449                 * cause infinite recursion.
6450                 *
6451                 * Preemption must be disabled here before the function
6452                 * tracer can trace. Break up preempt_disable() into two
6453                 * calls. One to disable preemption without fear of being
6454                 * traced. The other to still record the preemption latency,
6455                 * which can also be traced by the function tracer.
6456                 */
6457                preempt_disable_notrace();
6458                preempt_latency_start(1);
6459                __schedule(SM_PREEMPT);
6460                preempt_latency_stop(1);
6461                preempt_enable_no_resched_notrace();
6462
6463                /*
6464                 * Check again in case we missed a preemption opportunity
6465                 * between schedule and now.
6466                 */
6467        } while (need_resched());
6468}
6469
6470#ifdef CONFIG_PREEMPTION
6471/*
6472 * This is the entry point to schedule() from in-kernel preemption
6473 * off of preempt_enable.
6474 */
6475asmlinkage __visible void __sched notrace preempt_schedule(void)
6476{
6477        /*
6478         * If there is a non-zero preempt_count or interrupts are disabled,
6479         * we do not want to preempt the current task. Just return..
6480         */
6481        if (likely(!preemptible()))
6482                return;
6483
6484        preempt_schedule_common();
6485}
6486NOKPROBE_SYMBOL(preempt_schedule);
6487EXPORT_SYMBOL(preempt_schedule);
6488
6489#ifdef CONFIG_PREEMPT_DYNAMIC
6490DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
6491EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
6492#endif
6493
6494
6495/**
6496 * preempt_schedule_notrace - preempt_schedule called by tracing
6497 *
6498 * The tracing infrastructure uses preempt_enable_notrace to prevent
6499 * recursion and tracing preempt enabling caused by the tracing
6500 * infrastructure itself. But as tracing can happen in areas coming
6501 * from userspace or just about to enter userspace, a preempt enable
6502 * can occur before user_exit() is called. This will cause the scheduler
6503 * to be called when the system is still in usermode.
6504 *
6505 * To prevent this, the preempt_enable_notrace will use this function
6506 * instead of preempt_schedule() to exit user context if needed before
6507 * calling the scheduler.
6508 */
6509asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
6510{
6511        enum ctx_state prev_ctx;
6512
6513        if (likely(!preemptible()))
6514                return;
6515
6516        do {
6517                /*
6518                 * Because the function tracer can trace preempt_count_sub()
6519                 * and it also uses preempt_enable/disable_notrace(), if
6520                 * NEED_RESCHED is set, the preempt_enable_notrace() called
6521                 * by the function tracer will call this function again and
6522                 * cause infinite recursion.
6523                 *
6524                 * Preemption must be disabled here before the function
6525                 * tracer can trace. Break up preempt_disable() into two
6526                 * calls. One to disable preemption without fear of being
6527                 * traced. The other to still record the preemption latency,
6528                 * which can also be traced by the function tracer.
6529                 */
6530                preempt_disable_notrace();
6531                preempt_latency_start(1);
6532                /*
6533                 * Needs preempt disabled in case user_exit() is traced
6534                 * and the tracer calls preempt_enable_notrace() causing
6535                 * an infinite recursion.
6536                 */
6537                prev_ctx = exception_enter();
6538                __schedule(SM_PREEMPT);
6539                exception_exit(prev_ctx);
6540
6541                preempt_latency_stop(1);
6542                preempt_enable_no_resched_notrace();
6543        } while (need_resched());
6544}
6545EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
6546
6547#ifdef CONFIG_PREEMPT_DYNAMIC
6548DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
6549EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
6550#endif
6551
6552#endif /* CONFIG_PREEMPTION */
6553
6554#ifdef CONFIG_PREEMPT_DYNAMIC
6555
6556#include <linux/entry-common.h>
6557
6558/*
6559 * SC:cond_resched
6560 * SC:might_resched
6561 * SC:preempt_schedule
6562 * SC:preempt_schedule_notrace
6563 * SC:irqentry_exit_cond_resched
6564 *
6565 *
6566 * NONE:
6567 *   cond_resched               <- __cond_resched
6568 *   might_resched              <- RET0
6569 *   preempt_schedule           <- NOP
6570 *   preempt_schedule_notrace   <- NOP
6571 *   irqentry_exit_cond_resched <- NOP
6572 *
6573 * VOLUNTARY:
6574 *   cond_resched               <- __cond_resched
6575 *   might_resched              <- __cond_resched
6576 *   preempt_schedule           <- NOP
6577 *   preempt_schedule_notrace   <- NOP
6578 *   irqentry_exit_cond_resched <- NOP
6579 *
6580 * FULL:
6581 *   cond_resched               <- RET0
6582 *   might_resched              <- RET0
6583 *   preempt_schedule           <- preempt_schedule
6584 *   preempt_schedule_notrace   <- preempt_schedule_notrace
6585 *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
6586 */
6587
6588enum {
6589        preempt_dynamic_none = 0,
6590        preempt_dynamic_voluntary,
6591        preempt_dynamic_full,
6592};
6593
6594int preempt_dynamic_mode = preempt_dynamic_full;
6595
6596int sched_dynamic_mode(const char *str)
6597{
6598        if (!strcmp(str, "none"))
6599                return preempt_dynamic_none;
6600
6601        if (!strcmp(str, "voluntary"))
6602                return preempt_dynamic_voluntary;
6603
6604        if (!strcmp(str, "full"))
6605                return preempt_dynamic_full;
6606
6607        return -EINVAL;
6608}
6609
6610void sched_dynamic_update(int mode)
6611{
6612        /*
6613         * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
6614         * the ZERO state, which is invalid.
6615         */
6616        static_call_update(cond_resched, __cond_resched);
6617        static_call_update(might_resched, __cond_resched);
6618        static_call_update(preempt_schedule, __preempt_schedule_func);
6619        static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
6620        static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
6621
6622        switch (mode) {
6623        case preempt_dynamic_none:
6624                static_call_update(cond_resched, __cond_resched);
6625                static_call_update(might_resched, (void *)&__static_call_return0);
6626                static_call_update(preempt_schedule, NULL);
6627                static_call_update(preempt_schedule_notrace, NULL);
6628                static_call_update(irqentry_exit_cond_resched, NULL);
6629                pr_info("Dynamic Preempt: none\n");
6630                break;
6631
6632        case preempt_dynamic_voluntary:
6633                static_call_update(cond_resched, __cond_resched);
6634                static_call_update(might_resched, __cond_resched);
6635                static_call_update(preempt_schedule, NULL);
6636                static_call_update(preempt_schedule_notrace, NULL);
6637                static_call_update(irqentry_exit_cond_resched, NULL);
6638                pr_info("Dynamic Preempt: voluntary\n");
6639                break;
6640
6641        case preempt_dynamic_full:
6642                static_call_update(cond_resched, (void *)&__static_call_return0);
6643                static_call_update(might_resched, (void *)&__static_call_return0);
6644                static_call_update(preempt_schedule, __preempt_schedule_func);
6645                static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
6646                static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
6647                pr_info("Dynamic Preempt: full\n");
6648                break;
6649        }
6650
6651        preempt_dynamic_mode = mode;
6652}
6653
6654static int __init setup_preempt_mode(char *str)
6655{
6656        int mode = sched_dynamic_mode(str);
6657        if (mode < 0) {
6658                pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
6659                return 1;
6660        }
6661
6662        sched_dynamic_update(mode);
6663        return 0;
6664}
6665__setup("preempt=", setup_preempt_mode);
6666
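/*
 * Editor's note: in practice the mode above is selected with the
 * "preempt=none", "preempt=voluntary" or "preempt=full" kernel command-line
 * option handled by setup_preempt_mode() above. Kernels built with
 * SCHED_DEBUG also appear to expose the current mode through debugfs
 * (typically /sys/kernel/debug/sched/preempt), though the exact path
 * depends on the kernel version.
 */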
6667#endif /* CONFIG_PREEMPT_DYNAMIC */
6668
6669/*
6670 * This is the entry point to schedule() from kernel preemption
6671 * off of irq context.
6672 * Note that this is called and returns with IRQs disabled. This will
6673 * protect us against recursive calls from IRQ context.
6674 */
6675asmlinkage __visible void __sched preempt_schedule_irq(void)
6676{
6677        enum ctx_state prev_state;
6678
6679        /* Catch callers which need to be fixed */
6680        BUG_ON(preempt_count() || !irqs_disabled());
6681
6682        prev_state = exception_enter();
6683
6684        do {
6685                preempt_disable();
6686                local_irq_enable();
6687                __schedule(SM_PREEMPT);
6688                local_irq_disable();
6689                sched_preempt_enable_no_resched();
6690        } while (need_resched());
6691
6692        exception_exit(prev_state);
6693}
6694
6695int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
6696                          void *key)
6697{
6698        WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
6699        return try_to_wake_up(curr->private, mode, wake_flags);
6700}
6701EXPORT_SYMBOL(default_wake_function);
6702
6703static void __setscheduler_prio(struct task_struct *p, int prio)
6704{
6705        if (dl_prio(prio))
6706                p->sched_class = &dl_sched_class;
6707        else if (rt_prio(prio))
6708                p->sched_class = &rt_sched_class;
6709        else
6710                p->sched_class = &fair_sched_class;
6711
6712        p->prio = prio;
6713}
6714
6715#ifdef CONFIG_RT_MUTEXES
6716
6717static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
6718{
6719        if (pi_task)
6720                prio = min(prio, pi_task->prio);
6721
6722        return prio;
6723}
6724
6725static inline int rt_effective_prio(struct task_struct *p, int prio)
6726{
6727        struct task_struct *pi_task = rt_mutex_get_top_task(p);
6728
6729        return __rt_effective_prio(pi_task, prio);
6730}
6731
6732/*
6733 * rt_mutex_setprio - set the current priority of a task
6734 * @p: task to boost
6735 * @pi_task: donor task
6736 *
6737 * This function changes the 'effective' priority of a task. It does
6738 * not touch ->normal_prio like __setscheduler().
6739 *
6740 * Used by the rt_mutex code to implement priority inheritance
6741 * logic. Call site only calls if the priority of the task changed.
6742 */
6743void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
6744{
6745        int prio, oldprio, queued, running, queue_flag =
6746                DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
6747        const struct sched_class *prev_class;
6748        struct rq_flags rf;
6749        struct rq *rq;
6750
6751        /* XXX used to be waiter->prio, not waiter->task->prio */
6752        prio = __rt_effective_prio(pi_task, p->normal_prio);
6753
6754        /*
6755         * If nothing changed; bail early.
6756         */
6757        if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
6758                return;
6759
6760        rq = __task_rq_lock(p, &rf);
6761        update_rq_clock(rq);
6762        /*
6763         * Set under pi_lock && rq->lock, such that the value can be used under
6764         * either lock.
6765         *
6766         * Note that a load of trickiness is needed to make this pointer cache work
6767         * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
6768         * ensure a task is de-boosted (pi_task is set to NULL) before the
6769         * task is allowed to run again (and can exit). This ensures the pointer
6770         * points to a blocked task -- which guarantees the task is present.
6771         */
6772        p->pi_top_task = pi_task;
6773
6774        /*
6775         * For FIFO/RR we only need to set prio, if that matches we're done.
6776         */
6777        if (prio == p->prio && !dl_prio(prio))
6778                goto out_unlock;
6779
6780        /*
6781         * Idle task boosting is a no-no in general. There is one
6782         * exception, when PREEMPT_RT and NOHZ are active:
6783         *
6784         * The idle task calls get_next_timer_interrupt() and holds
6785         * the timer wheel base->lock on the CPU and another CPU wants
6786         * to access the timer (probably to cancel it). We can safely
6787         * ignore the boosting request, as the idle CPU runs this code
6788         * with interrupts disabled and will complete the lock
6789         * protected section without being interrupted. So there is no
6790         * real need to boost.
6791         */
6792        if (unlikely(p == rq->idle)) {
6793                WARN_ON(p != rq->curr);
6794                WARN_ON(p->pi_blocked_on);
6795                goto out_unlock;
6796        }
6797
6798        trace_sched_pi_setprio(p, pi_task);
6799        oldprio = p->prio;
6800
6801        if (oldprio == prio)
6802                queue_flag &= ~DEQUEUE_MOVE;
6803
6804        prev_class = p->sched_class;
6805        queued = task_on_rq_queued(p);
6806        running = task_current(rq, p);
6807        if (queued)
6808                dequeue_task(rq, p, queue_flag);
6809        if (running)
6810                put_prev_task(rq, p);
6811
6812        /*
6813         * Boosting conditions are:
6814         * 1. -rt task is running and holds mutex A
6815         *      --> -dl task blocks on mutex A
6816         *
6817         * 2. -dl task is running and holds mutex A
6818         *      --> -dl task blocks on mutex A and could preempt the
6819         *          running task
6820         */
6821        if (dl_prio(prio)) {
6822                if (!dl_prio(p->normal_prio) ||
6823                    (pi_task && dl_prio(pi_task->prio) &&
6824                     dl_entity_preempt(&pi_task->dl, &p->dl))) {
6825                        p->dl.pi_se = pi_task->dl.pi_se;
6826                        queue_flag |= ENQUEUE_REPLENISH;
6827                } else {
6828                        p->dl.pi_se = &p->dl;
6829                }
6830        } else if (rt_prio(prio)) {
6831                if (dl_prio(oldprio))
6832                        p->dl.pi_se = &p->dl;
6833                if (oldprio < prio)
6834                        queue_flag |= ENQUEUE_HEAD;
6835        } else {
6836                if (dl_prio(oldprio))
6837                        p->dl.pi_se = &p->dl;
6838                if (rt_prio(oldprio))
6839                        p->rt.timeout = 0;
6840        }
6841
6842        __setscheduler_prio(p, prio);
6843
6844        if (queued)
6845                enqueue_task(rq, p, queue_flag);
6846        if (running)
6847                set_next_task(rq, p);
6848
6849        check_class_changed(rq, p, prev_class, oldprio);
6850out_unlock:
6851        /* Avoid rq from going away on us: */
6852        preempt_disable();
6853
6854        rq_unpin_lock(rq, &rf);
6855        __balance_callbacks(rq);
6856        raw_spin_rq_unlock(rq);
6857
6858        preempt_enable();
6859}
6860#else
6861static inline int rt_effective_prio(struct task_struct *p, int prio)
6862{
6863        return prio;
6864}
6865#endif
6866
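/*
 * Editor's note - illustrative sketch, not part of core.c: from user space,
 * the priority-inheritance path above is typically reached through PI
 * futexes, e.g. a pthread mutex configured with PTHREAD_PRIO_INHERIT.
 * Minimal setup (assuming glibc/NPTL; names are hypothetical) might be:
 */
#if 0	/* example only, never compiled as part of the kernel */
#include <pthread.h>

static pthread_mutex_t pi_lock;

static int init_pi_mutex(void)
{
	pthread_mutexattr_t attr;
	int ret;

	ret = pthread_mutexattr_init(&attr);
	if (ret)
		return ret;

	/*
	 * Request priority inheritance: a low-priority owner is boosted to
	 * the priority of the highest-priority waiter, which on the kernel
	 * side ends up in rt_mutex_setprio().
	 */
	ret = pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	if (!ret)
		ret = pthread_mutex_init(&pi_lock, &attr);

	pthread_mutexattr_destroy(&attr);
	return ret;
}
#endif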
6867void set_user_nice(struct task_struct *p, long nice)
6868{
6869        bool queued, running;
6870        int old_prio;
6871        struct rq_flags rf;
6872        struct rq *rq;
6873
6874        if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
6875                return;
6876        /*
6877         * We have to be careful, if called from sys_setpriority(),
6878         * the task might be in the middle of scheduling on another CPU.
6879         */
6880        rq = task_rq_lock(p, &rf);
6881        update_rq_clock(rq);
6882
6883        /*
6884         * The RT priorities are set via sched_setscheduler(), but we still
6885         * allow the 'normal' nice value to be set - but, as expected,
6886         * it won't have any effect on scheduling as long as the task is
6887         * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
6888         */
6889        if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
6890                p->static_prio = NICE_TO_PRIO(nice);
6891                goto out_unlock;
6892        }
6893        queued = task_on_rq_queued(p);
6894        running = task_current(rq, p);
6895        if (queued)
6896                dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
6897        if (running)
6898                put_prev_task(rq, p);
6899
6900        p->static_prio = NICE_TO_PRIO(nice);
6901        set_load_weight(p, true);
6902        old_prio = p->prio;
6903        p->prio = effective_prio(p);
6904
6905        if (queued)
6906                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
6907        if (running)
6908                set_next_task(rq, p);
6909
6910        /*
6911         * If the task increased its priority or is running and
6912         * lowered its priority, then reschedule its CPU:
6913         */
6914        p->sched_class->prio_changed(rq, p, old_prio);
6915
6916out_unlock:
6917        task_rq_unlock(rq, p, &rf);
6918}
6919EXPORT_SYMBOL(set_user_nice);
6920
6921/*
6922 * can_nice - check if a task can reduce its nice value
6923 * @p: task
6924 * @nice: nice value
6925 */
6926int can_nice(const struct task_struct *p, const int nice)
6927{
6928        /* Convert nice value [19,-20] to rlimit style value [1,40]: */
6929        int nice_rlim = nice_to_rlimit(nice);
6930
6931        return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
6932                capable(CAP_SYS_NICE));
6933}
6934
6935#ifdef __ARCH_WANT_SYS_NICE
6936
6937/*
6938 * sys_nice - change the priority of the current process.
6939 * @increment: priority increment
6940 *
6941 * sys_setpriority is a more generic, but much slower function that
6942 * does similar things.
6943 */
6944SYSCALL_DEFINE1(nice, int, increment)
6945{
6946        long nice, retval;
6947
6948        /*
6949         * Setpriority might change our priority at the same moment.
6950         * We don't have to worry. Conceptually one call occurs first
6951         * and we have a single winner.
6952         */
6953        increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
6954        nice = task_nice(current) + increment;
6955
6956        nice = clamp_val(nice, MIN_NICE, MAX_NICE);
6957        if (increment < 0 && !can_nice(current, nice))
6958                return -EPERM;
6959
6960        retval = security_task_setnice(current, nice);
6961        if (retval)
6962                return retval;
6963
6964        set_user_nice(current, nice);
6965        return 0;
6966}
6967
6968#endif
6969
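/*
 * Editor's note - illustrative sketch, not part of core.c: the RLIMIT_NICE
 * check in can_nice() above is what an unprivileged nice(2)/setpriority(2)
 * call runs into. A user-space sketch (values are examples only):
 */
#if 0	/* example only, never compiled as part of the kernel */
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;

	/*
	 * RLIMIT_NICE uses the 1..40 "rlimit style" range mentioned above:
	 * e.g. a soft limit of 30 allows lowering nice down to 20 - 30 = -10.
	 */
	getrlimit(RLIMIT_NICE, &rl);
	printf("nice floor allowed: %ld\n", 20 - (long)rl.rlim_cur);

	/* nice() may legitimately return -1, so errno must be checked. */
	errno = 0;
	if (nice(-5) == -1 && errno == EPERM)
		printf("raising priority denied (see can_nice())\n");

	return 0;
}
#endif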
6970/**
6971 * task_prio - return the priority value of a given task.
6972 * @p: the task in question.
6973 *
6974 * Return: The priority value as seen by users in /proc.
6975 *
6976 * sched policy         return value   kernel prio    user prio/nice
6977 *
6978 * normal, batch, idle     [0 ... 39]  [100 ... 139]          0/[-20 ... 19]
6979 * fifo, rr             [-2 ... -100]     [98 ... 0]  [1 ... 99]
6980 * deadline                     -101             -1           0
6981 */
6982int task_prio(const struct task_struct *p)
6983{
6984        return p->prio - MAX_RT_PRIO;
6985}
6986
6987/**
6988 * idle_cpu - is a given CPU idle currently?
6989 * @cpu: the processor in question.
6990 *
6991 * Return: 1 if the CPU is currently idle. 0 otherwise.
6992 */
6993int idle_cpu(int cpu)
6994{
6995        struct rq *rq = cpu_rq(cpu);
6996
6997        if (rq->curr != rq->idle)
6998                return 0;
6999
7000        if (rq->nr_running)
7001                return 0;
7002
7003#ifdef CONFIG_SMP
7004        if (rq->ttwu_pending)
7005                return 0;
7006#endif
7007
7008        return 1;
7009}
7010
7011/**
7012 * available_idle_cpu - is a given CPU idle for enqueuing work?
7013 * @cpu: the CPU in question.
7014 *
7015 * Return: 1 if the CPU is currently idle. 0 otherwise.
7016 */
7017int available_idle_cpu(int cpu)
7018{
7019        if (!idle_cpu(cpu))
7020                return 0;
7021
7022        if (vcpu_is_preempted(cpu))
7023                return 0;
7024
7025        return 1;
7026}
7027
7028/**
7029 * idle_task - return the idle task for a given CPU.
7030 * @cpu: the processor in question.
7031 *
7032 * Return: The idle task for the CPU @cpu.
7033 */
7034struct task_struct *idle_task(int cpu)
7035{
7036        return cpu_rq(cpu)->idle;
7037}
7038
7039#ifdef CONFIG_SMP
7040/*
7041 * This function computes an effective utilization for the given CPU, to be
7042 * used for frequency selection given the linear relation: f = u * f_max.
7043 *
7044 * The scheduler tracks the following metrics:
7045 *
7046 *   cpu_util_{cfs,rt,dl,irq}()
7047 *   cpu_bw_dl()
7048 *
7049 * Where the cfs,rt and dl util numbers are tracked with the same metric and
7050 * synchronized windows and are thus directly comparable.
7051 *
7052 * The cfs,rt,dl utilization are the running times measured with rq->clock_task
7053 * which excludes things like IRQ and steal-time. The latter are then accrued
7054 * in the irq utilization.
7055 *
7056 * The DL bandwidth number, on the other hand, is not measured but a value computed
7057 * based on the task model parameters and gives the minimal utilization
7058 * required to meet deadlines.
7059 */
7060unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
7061                                 unsigned long max, enum cpu_util_type type,
7062                                 struct task_struct *p)
7063{
7064        unsigned long dl_util, util, irq;
7065        struct rq *rq = cpu_rq(cpu);
7066
7067        if (!uclamp_is_used() &&
7068            type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
7069                return max;
7070        }
7071
7072        /*
7073         * Early check to see if IRQ/steal time saturates the CPU, can be
7074         * because of inaccuracies in how we track these -- see
7075         * update_irq_load_avg().
7076         */
7077        irq = cpu_util_irq(rq);
7078        if (unlikely(irq >= max))
7079                return max;
7080
7081        /*
7082         * Because the time spent on RT/DL tasks is visible as 'lost' time to
7083         * CFS tasks and we use the same metric to track the effective
7084         * utilization (PELT windows are synchronized) we can directly add them
7085         * to obtain the CPU's actual utilization.
7086         *
7087         * CFS and RT utilization can be boosted or capped, depending on
7088         * utilization clamp constraints requested by currently RUNNABLE
7089         * tasks.
7090         * When there are no CFS RUNNABLE tasks, clamps are released and
7091         * frequency will be gracefully reduced with the utilization decay.
7092         */
7093        util = util_cfs + cpu_util_rt(rq);
7094        if (type == FREQUENCY_UTIL)
7095                util = uclamp_rq_util_with(rq, util, p);
7096
7097        dl_util = cpu_util_dl(rq);
7098
7099        /*
7100         * For frequency selection we do not make cpu_util_dl() a permanent part
7101         * of this sum because we want to use cpu_bw_dl() later on, but we need
7102         * to check if the CFS+RT+DL sum is saturated (i.e. no idle time) such
7103         * that we select f_max when there is no idle time.
7104         *
7105         * NOTE: numerical errors or stop class might cause us to not quite hit
7106         * saturation when we should -- something for later.
7107         */
7108        if (util + dl_util >= max)
7109                return max;
7110
7111        /*
7112         * OTOH, for energy computation we need the estimated running time, so
7113         * include util_dl and ignore dl_bw.
7114         */
7115        if (type == ENERGY_UTIL)
7116                util += dl_util;
7117
7118        /*
7119         * There is still idle time; further improve the number by using the
7120         * irq metric. Because IRQ/steal time is hidden from the task clock we
7121         * need to scale the task numbers:
7122         *
7123         *              max - irq
7124         *   U' = irq + --------- * U
7125         *                 max
7126         */
7127        util = scale_irq_capacity(util, irq, max);
7128        util += irq;
7129
7130        /*
7131         * Bandwidth required by DEADLINE must always be granted while, for
7132         * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
7133         * to gracefully reduce the frequency when no tasks show up for longer
7134         * periods of time.
7135         *
7136         * Ideally we would like to set bw_dl as min/guaranteed freq and util +
7137         * bw_dl as requested freq. However, cpufreq is not yet ready for such
7138         * an interface. So, we only do the latter for now.
7139         */
7140        if (type == FREQUENCY_UTIL)
7141                util += cpu_bw_dl(rq);
7142
7143        return min(max, util);
7144}
7145
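/*
 * Editor's note - illustrative sketch, not part of core.c: the IRQ scaling
 * step above, written out as plain arithmetic. In the kernel,
 * scale_irq_capacity() (kernel/sched/sched.h) performs the multiply/divide
 * part and the caller above adds the irq term; the function name below is
 * hypothetical:
 */
#if 0	/* example only, never compiled as part of the kernel */
static unsigned long example_irq_scaled_util(unsigned long util,
					     unsigned long irq,
					     unsigned long max)
{
	/* U' = irq + U * (max - irq) / max, clamped to max. */
	unsigned long scaled = util * (max - irq) / max + irq;

	return scaled < max ? scaled : max;
}
#endif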
7146unsigned long sched_cpu_util(int cpu, unsigned long max)
7147{
7148        return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
7149                                  ENERGY_UTIL, NULL);
7150}
7151#endif /* CONFIG_SMP */
7152
7153/**
7154 * find_process_by_pid - find a process with a matching PID value.
7155 * @pid: the pid in question.
7156 *
7157 * Return: the task of @pid, if found. %NULL otherwise.
7158 */
7159static struct task_struct *find_process_by_pid(pid_t pid)
7160{
7161        return pid ? find_task_by_vpid(pid) : current;
7162}
7163
7164/*
7165 * sched_setparam() passes in -1 for its policy, to let the functions
7166 * it calls know not to change it.
7167 */
7168#define SETPARAM_POLICY -1
7169
7170static void __setscheduler_params(struct task_struct *p,
7171                const struct sched_attr *attr)
7172{
7173        int policy = attr->sched_policy;
7174
7175        if (policy == SETPARAM_POLICY)
7176                policy = p->policy;
7177
7178        p->policy = policy;
7179
7180        if (dl_policy(policy))
7181                __setparam_dl(p, attr);
7182        else if (fair_policy(policy))
7183                p->static_prio = NICE_TO_PRIO(attr->sched_nice);
7184
7185        /*
7186         * __sched_setscheduler() ensures attr->sched_priority == 0 when
7187         * !rt_policy. Always setting this ensures that things like
7188         * getparam()/getattr() don't report silly values for !rt tasks.
7189         */
7190        p->rt_priority = attr->sched_priority;
7191        p->normal_prio = normal_prio(p);
7192        set_load_weight(p, true);
7193}
7194
7195/*
7196 * Check the target process has a UID that matches the current process's:
7197 */
7198static bool check_same_owner(struct task_struct *p)
7199{
7200        const struct cred *cred = current_cred(), *pcred;
7201        bool match;
7202
7203        rcu_read_lock();
7204        pcred = __task_cred(p);
7205        match = (uid_eq(cred->euid, pcred->euid) ||
7206                 uid_eq(cred->euid, pcred->uid));
7207        rcu_read_unlock();
7208        return match;
7209}
7210
7211static int __sched_setscheduler(struct task_struct *p,
7212                                const struct sched_attr *attr,
7213                                bool user, bool pi)
7214{
7215        int oldpolicy = -1, policy = attr->sched_policy;
7216        int retval, oldprio, newprio, queued, running;
7217        const struct sched_class *prev_class;
7218        struct callback_head *head;
7219        struct rq_flags rf;
7220        int reset_on_fork;
7221        int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
7222        struct rq *rq;
7223
7224        /* The pi code expects interrupts enabled */
7225        BUG_ON(pi && in_interrupt());
7226recheck:
7227        /* Double check policy once rq lock held: */
7228        if (policy < 0) {
7229                reset_on_fork = p->sched_reset_on_fork;
7230                policy = oldpolicy = p->policy;
7231        } else {
7232                reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
7233
7234                if (!valid_policy(policy))
7235                        return -EINVAL;
7236        }
7237
7238        if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
7239                return -EINVAL;
7240
7241        /*
7242         * Valid priorities for SCHED_FIFO and SCHED_RR are
7243         * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
7244         * SCHED_BATCH and SCHED_IDLE is 0.
7245         */
7246        if (attr->sched_priority > MAX_RT_PRIO-1)
7247                return -EINVAL;
7248        if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
7249            (rt_policy(policy) != (attr->sched_priority != 0)))
7250                return -EINVAL;
7251
7252        /*
7253         * Allow unprivileged RT tasks to decrease priority:
7254         */
7255        if (user && !capable(CAP_SYS_NICE)) {
7256                if (fair_policy(policy)) {
7257                        if (attr->sched_nice < task_nice(p) &&
7258                            !can_nice(p, attr->sched_nice))
7259                                return -EPERM;
7260                }
7261
7262                if (rt_policy(policy)) {
7263                        unsigned long rlim_rtprio =
7264                                        task_rlimit(p, RLIMIT_RTPRIO);
7265
7266                        /* Can't set/change the rt policy: */
7267                        if (policy != p->policy && !rlim_rtprio)
7268                                return -EPERM;
7269
7270                        /* Can't increase priority: */
7271                        if (attr->sched_priority > p->rt_priority &&
7272                            attr->sched_priority > rlim_rtprio)
7273                                return -EPERM;
7274                }
7275
7276                 /*
7277                  * Can't set/change SCHED_DEADLINE policy at all for now
7278                  * (safest behavior); in the future we would like to allow
7279                  * unprivileged DL tasks to increase their relative deadline
7280                  * or reduce their runtime (both ways reducing utilization)
7281                  */
7282                if (dl_policy(policy))
7283                        return -EPERM;
7284
7285                /*
7286                 * Treat SCHED_IDLE as nice 20. Only allow a switch to
7287                 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
7288                 */
7289                if (task_has_idle_policy(p) && !idle_policy(policy)) {
7290                        if (!can_nice(p, task_nice(p)))
7291                                return -EPERM;
7292                }
7293
7294                /* Can't change other user's priorities: */
7295                if (!check_same_owner(p))
7296                        return -EPERM;
7297
7298                /* Normal users shall not reset the sched_reset_on_fork flag: */
7299                if (p->sched_reset_on_fork && !reset_on_fork)
7300                        return -EPERM;
7301        }
7302
7303        if (user) {
7304                if (attr->sched_flags & SCHED_FLAG_SUGOV)
7305                        return -EINVAL;
7306
7307                retval = security_task_setscheduler(p);
7308                if (retval)
7309                        return retval;
7310        }
7311
7312        /* Update task specific "requested" clamps */
7313        if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
7314                retval = uclamp_validate(p, attr);
7315                if (retval)
7316                        return retval;
7317        }
7318
7319        if (pi)
7320                cpuset_read_lock();
7321
7322        /*
7323         * Make sure no PI-waiters arrive (or leave) while we are
7324         * changing the priority of the task:
7325         *
7326         * To be able to change p->policy safely, the appropriate
7327         * runqueue lock must be held.
7328         */
7329        rq = task_rq_lock(p, &rf);
7330        update_rq_clock(rq);
7331
7332        /*
7333         * Changing the policy of the stop threads is a very bad idea:
7334         */
7335        if (p == rq->stop) {
7336                retval = -EINVAL;
7337                goto unlock;
7338        }
7339
7340        /*
7341         * If not changing anything there's no need to proceed further,
7342         * but store a possible modification of reset_on_fork.
7343         */
7344        if (unlikely(policy == p->policy)) {
7345                if (fair_policy(policy) && attr->sched_nice != task_nice(p))
7346                        goto change;
7347                if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
7348                        goto change;
7349                if (dl_policy(policy) && dl_param_changed(p, attr))
7350                        goto change;
7351                if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
7352                        goto change;
7353
7354                p->sched_reset_on_fork = reset_on_fork;
7355                retval = 0;
7356                goto unlock;
7357        }
7358change:
7359
7360        if (user) {
7361#ifdef CONFIG_RT_GROUP_SCHED
7362                /*
7363                 * Do not allow realtime tasks into groups that have no runtime
7364                 * assigned.
7365                 */
7366                if (rt_bandwidth_enabled() && rt_policy(policy) &&
7367                                task_group(p)->rt_bandwidth.rt_runtime == 0 &&
7368                                !task_group_is_autogroup(task_group(p))) {
7369                        retval = -EPERM;
7370                        goto unlock;
7371                }
7372#endif
7373#ifdef CONFIG_SMP
7374                if (dl_bandwidth_enabled() && dl_policy(policy) &&
7375                                !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
7376                        cpumask_t *span = rq->rd->span;
7377
7378                        /*
7379                         * Don't allow tasks with an affinity mask smaller than
7380                         * the entire root_domain to become SCHED_DEADLINE. We
7381                         * will also fail if there's no bandwidth available.
7382                         */
7383                        if (!cpumask_subset(span, p->cpus_ptr) ||
7384                            rq->rd->dl_bw.bw == 0) {
7385                                retval = -EPERM;
7386                                goto unlock;
7387                        }
7388                }
7389#endif
7390        }
7391
7392        /* Re-check policy now with rq lock held: */
7393        if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
7394                policy = oldpolicy = -1;
7395                task_rq_unlock(rq, p, &rf);
7396                if (pi)
7397                        cpuset_read_unlock();
7398                goto recheck;
7399        }
7400
7401        /*
7402         * If setscheduling to SCHED_DEADLINE (or changing the parameters
7403         * of a SCHED_DEADLINE task) we need to check if enough bandwidth
7404         * is available.
7405         */
7406        if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
7407                retval = -EBUSY;
7408                goto unlock;
7409        }
7410
7411        p->sched_reset_on_fork = reset_on_fork;
7412        oldprio = p->prio;
7413
7414        newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
7415        if (pi) {
7416                /*
7417                 * Take priority boosted tasks into account. If the new
7418                 * effective priority is unchanged, we just store the new
7419                 * normal parameters and do not touch the scheduler class and
7420                 * the runqueue. This will be done when the task deboosts
7421                 * itself.
7422                 */
7423                newprio = rt_effective_prio(p, newprio);
7424                if (newprio == oldprio)
7425                        queue_flags &= ~DEQUEUE_MOVE;
7426        }
7427
7428        queued = task_on_rq_queued(p);
7429        running = task_current(rq, p);
7430        if (queued)
7431                dequeue_task(rq, p, queue_flags);
7432        if (running)
7433                put_prev_task(rq, p);
7434
7435        prev_class = p->sched_class;
7436
7437        if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
7438                __setscheduler_params(p, attr);
7439                __setscheduler_prio(p, newprio);
7440        }
7441        __setscheduler_uclamp(p, attr);
7442
7443        if (queued) {
7444                /*
7445                 * We enqueue to tail when the priority of a task is
7446                 * increased (user space view).
7447                 */
7448                if (oldprio < p->prio)
7449                        queue_flags |= ENQUEUE_HEAD;
7450
7451                enqueue_task(rq, p, queue_flags);
7452        }
7453        if (running)
7454                set_next_task(rq, p);
7455
7456        check_class_changed(rq, p, prev_class, oldprio);
7457
7458        /* Avoid rq from going away on us: */
7459        preempt_disable();
7460        head = splice_balance_callbacks(rq);
7461        task_rq_unlock(rq, p, &rf);
7462
7463        if (pi) {
7464                cpuset_read_unlock();
7465                rt_mutex_adjust_pi(p);
7466        }
7467
7468        /* Run balance callbacks after we've adjusted the PI chain: */
7469        balance_callbacks(rq, head);
7470        preempt_enable();
7471
7472        return 0;
7473
7474unlock:
7475        task_rq_unlock(rq, p, &rf);
7476        if (pi)
7477                cpuset_read_unlock();
7478        return retval;
7479}
7480
7481static int _sched_setscheduler(struct task_struct *p, int policy,
7482                               const struct sched_param *param, bool check)
7483{
7484        struct sched_attr attr = {
7485                .sched_policy   = policy,
7486                .sched_priority = param->sched_priority,
7487                .sched_nice     = PRIO_TO_NICE(p->static_prio),
7488        };
7489
7490        /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
7491        if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
7492                attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
7493                policy &= ~SCHED_RESET_ON_FORK;
7494                attr.sched_policy = policy;
7495        }
7496
7497        return __sched_setscheduler(p, &attr, check, true);
7498}
7499/**
7500 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
7501 * @p: the task in question.
7502 * @policy: new policy.
7503 * @param: structure containing the new RT priority.
7504 *
7505 * Use sched_set_fifo(), read its comment.
7506 *
7507 * Return: 0 on success. An error code otherwise.
7508 *
7509 * NOTE that the task may be already dead.
7510 */
7511int sched_setscheduler(struct task_struct *p, int policy,
7512                       const struct sched_param *param)
7513{
7514        return _sched_setscheduler(p, policy, param, true);
7515}
7516
7517int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
7518{
7519        return __sched_setscheduler(p, attr, true, true);
7520}
7521
7522int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
7523{
7524        return __sched_setscheduler(p, attr, false, true);
7525}
7526EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
7527
7528/**
7529 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
7530 * @p: the task in question.
7531 * @policy: new policy.
7532 * @param: structure containing the new RT priority.
7533 *
7534 * Just like sched_setscheduler, only don't bother checking if the
7535 * current context has permission.  For example, this is needed in
7536 * stop_machine(): we create temporary high priority worker threads,
7537 * but our caller might not have that capability.
7538 *
7539 * Return: 0 on success. An error code otherwise.
7540 */
7541int sched_setscheduler_nocheck(struct task_struct *p, int policy,
7542                               const struct sched_param *param)
7543{
7544        return _sched_setscheduler(p, policy, param, false);
7545}
7546
7547/*
7548 * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
7549 * incapable of resource management, which is the one thing an OS really should
7550 * be doing.
7551 *
7552 * This is of course the reason it is limited to privileged users only.
7553 *
7554 * Worse still, it is fundamentally impossible to compose static priority
7555 * workloads. You cannot take two correctly working static prio workloads
7556 * and smash them together and still expect them to work.
7557 *
7558 * For this reason 'all' FIFO tasks the kernel creates are basically at:
7559 *
7560 *   MAX_RT_PRIO / 2
7561 *
7562 * The administrator _MUST_ configure the system, the kernel simply doesn't
7563 * know enough information to make a sensible choice.
7564 */
7565void sched_set_fifo(struct task_struct *p)
7566{
7567        struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
7568        WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
7569}
7570EXPORT_SYMBOL_GPL(sched_set_fifo);
7571
7572/*
7573 * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
7574 */
7575void sched_set_fifo_low(struct task_struct *p)
7576{
7577        struct sched_param sp = { .sched_priority = 1 };
7578        WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
7579}
7580EXPORT_SYMBOL_GPL(sched_set_fifo_low);
7581
7582void sched_set_normal(struct task_struct *p, int nice)
7583{
7584        struct sched_attr attr = {
7585                .sched_policy = SCHED_NORMAL,
7586                .sched_nice = nice,
7587        };
7588        WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
7589}
7590EXPORT_SYMBOL_GPL(sched_set_normal);
7591
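/*
 * Editor's note - illustrative sketch, not part of core.c: a kernel user of
 * the helpers above would typically create a kthread and then pick one of
 * sched_set_fifo()/sched_set_fifo_low()/sched_set_normal() rather than
 * choosing a raw FIFO priority itself (names below are hypothetical):
 */
#if 0	/* example only, never compiled as part of the kernel */
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int example_thread_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static int example_start_rt_worker(void)
{
	struct task_struct *tsk;

	tsk = kthread_run(example_thread_fn, NULL, "example-rt");
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	/* Let the scheduler place it at the kernel's default FIFO priority. */
	sched_set_fifo(tsk);
	return 0;
}
#endif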
7592static int
7593do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
7594{
7595        struct sched_param lparam;
7596        struct task_struct *p;
7597        int retval;
7598
7599        if (!param || pid < 0)
7600                return -EINVAL;
7601        if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
7602                return -EFAULT;
7603
7604        rcu_read_lock();
7605        retval = -ESRCH;
7606        p = find_process_by_pid(pid);
7607        if (likely(p))
7608                get_task_struct(p);
7609        rcu_read_unlock();
7610
7611        if (likely(p)) {
7612                retval = sched_setscheduler(p, policy, &lparam);
7613                put_task_struct(p);
7614        }
7615
7616        return retval;
7617}
7618
7619/*
7620 * Mimics kernel/events/core.c perf_copy_attr().
7621 */
7622static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
7623{
7624        u32 size;
7625        int ret;
7626
7627        /* Zero the full structure, so that a short copy will be nice: */
7628        memset(attr, 0, sizeof(*attr));
7629
7630        ret = get_user(size, &uattr->size);
7631        if (ret)
7632                return ret;
7633
7634        /* ABI compatibility quirk: */
7635        if (!size)
7636                size = SCHED_ATTR_SIZE_VER0;
7637        if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
7638                goto err_size;
7639
7640        ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
7641        if (ret) {
7642                if (ret == -E2BIG)
7643                        goto err_size;
7644                return ret;
7645        }
7646
7647        if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
7648            size < SCHED_ATTR_SIZE_VER1)
7649                return -EINVAL;
7650
7651        /*
7652         * XXX: Do we want to be lenient like existing syscalls; or do we want
7653         * to be strict and return an error on out-of-bounds values?
7654         */
7655        attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
7656
7657        return 0;
7658
7659err_size:
7660        put_user(sizeof(*attr), &uattr->size);
7661        return -E2BIG;
7662}
7663
7664static void get_params(struct task_struct *p, struct sched_attr *attr)
7665{
7666        if (task_has_dl_policy(p))
7667                __getparam_dl(p, attr);
7668        else if (task_has_rt_policy(p))
7669                attr->sched_priority = p->rt_priority;
7670        else
7671                attr->sched_nice = task_nice(p);
7672}
7673
7674/**
7675 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
7676 * @pid: the pid in question.
7677 * @policy: new policy.
7678 * @param: structure containing the new RT priority.
7679 *
7680 * Return: 0 on success. An error code otherwise.
7681 */
7682SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
7683{
7684        if (policy < 0)
7685                return -EINVAL;
7686
7687        return do_sched_setscheduler(pid, policy, param);
7688}
7689
7690/**
7691 * sys_sched_setparam - set/change the RT priority of a thread
7692 * @pid: the pid in question.
7693 * @param: structure containing the new RT priority.
7694 *
7695 * Return: 0 on success. An error code otherwise.
7696 */
7697SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
7698{
7699        return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
7700}
7701
7702/**
7703 * sys_sched_setattr - same as above, but with extended sched_attr
7704 * @pid: the pid in question.
7705 * @uattr: structure containing the extended parameters.
7706 * @flags: for future extension.
7707 */
7708SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
7709                               unsigned int, flags)
7710{
7711        struct sched_attr attr;
7712        struct task_struct *p;
7713        int retval;
7714
7715        if (!uattr || pid < 0 || flags)
7716                return -EINVAL;
7717
7718        retval = sched_copy_attr(uattr, &attr);
7719        if (retval)
7720                return retval;
7721
7722        if ((int)attr.sched_policy < 0)
7723                return -EINVAL;
7724        if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
7725                attr.sched_policy = SETPARAM_POLICY;
7726
7727        rcu_read_lock();
7728        retval = -ESRCH;
7729        p = find_process_by_pid(pid);
7730        if (likely(p))
7731                get_task_struct(p);
7732        rcu_read_unlock();
7733
7734        if (likely(p)) {
7735                if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
7736                        get_params(p, &attr);
7737                retval = sched_setattr(p, &attr);
7738                put_task_struct(p);
7739        }
7740
7741        return retval;
7742}
7743
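/*
 * Editor's note - illustrative sketch, not part of core.c: sched_setattr(2)
 * has no glibc wrapper, so user space typically invokes it via syscall(2),
 * filling the size field for the ABI versioning handled by sched_copy_attr()
 * above. The struct and values below are examples; the layout mirrors
 * include/uapi/linux/sched/types.h:
 */
#if 0	/* example only, never compiled as part of the kernel */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sched.h>
#include <sys/syscall.h>

/* Local mirror of the VER0 UAPI layout (48 bytes). */
struct example_sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct example_sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);	/* lets the kernel version the ABI */
	attr.sched_policy = SCHED_FIFO;
	attr.sched_priority = 10;	/* example value */

	if (syscall(SYS_sched_setattr, 0 /* current thread */, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}
	return 0;
}
#endif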
7744/**
7745 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
7746 * @pid: the pid in question.
7747 *
7748 * Return: On success, the policy of the thread. Otherwise, a negative error
7749 * code.
7750 */
7751SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
7752{
7753        struct task_struct *p;
7754        int retval;
7755
7756        if (pid < 0)
7757                return -EINVAL;
7758
7759        retval = -ESRCH;
7760        rcu_read_lock();
7761        p = find_process_by_pid(pid);
7762        if (p) {
7763                retval = security_task_getscheduler(p);
7764                if (!retval)
7765                        retval = p->policy
7766                                | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
7767        }
7768        rcu_read_unlock();
7769        return retval;
7770}
7771
7772/**
7773 * sys_sched_getparam - get the RT priority of a thread
7774 * @pid: the pid in question.
7775 * @param: structure containing the RT priority.
7776 *
7777 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
7778 * code.
7779 */
7780SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
7781{
7782        struct sched_param lp = { .sched_priority = 0 };
7783        struct task_struct *p;
7784        int retval;
7785
7786        if (!param || pid < 0)
7787                return -EINVAL;
7788
7789        rcu_read_lock();
7790        p = find_process_by_pid(pid);
7791        retval = -ESRCH;
7792        if (!p)
7793                goto out_unlock;
7794
7795        retval = security_task_getscheduler(p);
7796        if (retval)
7797                goto out_unlock;
7798
7799        if (task_has_rt_policy(p))
7800                lp.sched_priority = p->rt_priority;
7801        rcu_read_unlock();
7802
7803        /*
7804         * This one might sleep, we cannot do it with a spinlock held ...
7805         */
7806        retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
7807
7808        return retval;
7809
7810out_unlock:
7811        rcu_read_unlock();
7812        return retval;
7813}
7814
7815/*
7816 * Copy the kernel size attribute structure (which might be larger
7817 * than what user-space knows about) to user-space.
7818 *
7819 * Note that all cases are valid: user-space buffer can be larger or
7820 * smaller than the kernel-space buffer. The usual case is that both
7821 * have the same size.
7822 */
7823static int
7824sched_attr_copy_to_user(struct sched_attr __user *uattr,
7825                        struct sched_attr *kattr,
7826                        unsigned int usize)
7827{
7828        unsigned int ksize = sizeof(*kattr);
7829
7830        if (!access_ok(uattr, usize))
7831                return -EFAULT;
7832
7833        /*
7834         * sched_getattr() ABI forwards and backwards compatibility:
7835         *
7836         * If usize == ksize then we just copy everything to user-space and all is good.
7837         *
7838         * If usize < ksize then we only copy as much as user-space has space for,
7839         * this keeps ABI compatibility as well. We skip the rest.
7840         *
7841         * If usize > ksize then user-space is using a newer version of the ABI,
7842 * parts of which the kernel doesn't know about. Just ignore them - tooling can
7843         * detect the kernel's knowledge of attributes from the attr->size value
7844         * which is set to ksize in this case.
7845         */
7846        kattr->size = min(usize, ksize);
7847
7848        if (copy_to_user(uattr, kattr, kattr->size))
7849                return -EFAULT;
7850
7851        return 0;
7852}
7853
7854/**
7855 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
7856 * @pid: the pid in question.
7857 * @uattr: structure containing the extended parameters.
7858 * @usize: sizeof(attr) for fwd/bwd comp.
7859 * @flags: for future extension.
7860 */
7861SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
7862                unsigned int, usize, unsigned int, flags)
7863{
7864        struct sched_attr kattr = { };
7865        struct task_struct *p;
7866        int retval;
7867
7868        if (!uattr || pid < 0 || usize > PAGE_SIZE ||
7869            usize < SCHED_ATTR_SIZE_VER0 || flags)
7870                return -EINVAL;
7871
7872        rcu_read_lock();
7873        p = find_process_by_pid(pid);
7874        retval = -ESRCH;
7875        if (!p)
7876                goto out_unlock;
7877
7878        retval = security_task_getscheduler(p);
7879        if (retval)
7880                goto out_unlock;
7881
7882        kattr.sched_policy = p->policy;
7883        if (p->sched_reset_on_fork)
7884                kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
7885        get_params(p, &kattr);
7886        kattr.sched_flags &= SCHED_FLAG_ALL;
7887
7888#ifdef CONFIG_UCLAMP_TASK
7889        /*
7890         * This could race with another potential updater, but this is fine
7891         * because it'll correctly read the old or the new value. We don't need
7892         * to guarantee who wins the race as long as it doesn't return garbage.
7893         */
7894        kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
7895        kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
7896#endif
7897
7898        rcu_read_unlock();
7899
7900        return sched_attr_copy_to_user(uattr, &kattr, usize);
7901
7902out_unlock:
7903        rcu_read_unlock();
7904        return retval;
7905}
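
/*
 * Illustrative user-space sketch (not part of this file): querying the
 * extended scheduling parameters of the calling thread via the raw
 * syscall, since glibc has historically not shipped a wrapper. The local
 * struct below is a trimmed copy of the UAPI layout, for illustration only.
 *
 *	#define _GNU_SOURCE
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *
 *	struct sched_attr_sketch {
 *		uint32_t size, sched_policy;
 *		uint64_t sched_flags;
 *		int32_t  sched_nice;
 *		uint32_t sched_priority;
 *		uint64_t sched_runtime, sched_deadline, sched_period;
 *		uint32_t sched_util_min, sched_util_max;
 *	};
 *
 *	int main(void)
 *	{
 *		struct sched_attr_sketch attr = { 0 };
 *
 *		if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr), 0))
 *			return 1;
 *		// attr.size reports how much of the struct the kernel filled in
 *		printf("policy=%u size=%u\n", attr.sched_policy, attr.size);
 *		return 0;
 *	}
 */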
7906
7907#ifdef CONFIG_SMP
7908int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
7909{
7910        int ret = 0;
7911
7912        /*
7913         * If the task isn't a deadline task or admission control is
7914         * disabled then we don't care about affinity changes.
7915         */
7916        if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
7917                return 0;
7918
7919        /*
7920         * Since bandwidth control happens on a root_domain basis,
7921         * if the admission test is enabled, we only admit -deadline
7922         * tasks that are allowed to run on all the CPUs in the task's
7923         * root_domain.
7924         */
7925        rcu_read_lock();
7926        if (!cpumask_subset(task_rq(p)->rd->span, mask))
7927                ret = -EBUSY;
7928        rcu_read_unlock();
7929        return ret;
7930}
7931#endif
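
/*
 * Illustrative user-visible consequence (sketch; `pid` is a made-up
 * SCHED_DEADLINE task and its root domain is assumed to span CPUs 0-3):
 * with admission control enabled, an affinity mask that does not cover
 * the whole root domain is rejected with -EBUSY by the check above.
 *
 *	cpu_set_t set;
 *
 *	CPU_ZERO(&set);
 *	CPU_SET(1, &set);
 *	// fails with errno == EBUSY: CPUs 0, 2 and 3 are missing from the mask
 *	sched_setaffinity(pid, sizeof(set), &set);
 */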
7932
7933static int
7934__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
7935{
7936        int retval;
7937        cpumask_var_t cpus_allowed, new_mask;
7938
7939        if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
7940                return -ENOMEM;
7941
7942        if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
7943                retval = -ENOMEM;
7944                goto out_free_cpus_allowed;
7945        }
7946
7947        cpuset_cpus_allowed(p, cpus_allowed);
7948        cpumask_and(new_mask, mask, cpus_allowed);
7949
7950        retval = dl_task_check_affinity(p, new_mask);
7951        if (retval)
7952                goto out_free_new_mask;
7953again:
7954        retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
7955        if (retval)
7956                goto out_free_new_mask;
7957
7958        cpuset_cpus_allowed(p, cpus_allowed);
7959        if (!cpumask_subset(new_mask, cpus_allowed)) {
7960                /*
7961                 * We must have raced with a concurrent cpuset update.
7962                 * Just reset the cpumask to the cpuset's cpus_allowed.
7963                 */
7964                cpumask_copy(new_mask, cpus_allowed);
7965                goto again;
7966        }
7967
7968out_free_new_mask:
7969        free_cpumask_var(new_mask);
7970out_free_cpus_allowed:
7971        free_cpumask_var(cpus_allowed);
7972        return retval;
7973}
7974
7975long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
7976{
7977        struct task_struct *p;
7978        int retval;
7979
7980        rcu_read_lock();
7981
7982        p = find_process_by_pid(pid);
7983        if (!p) {
7984                rcu_read_unlock();
7985                return -ESRCH;
7986        }
7987
7988        /* Prevent p going away */
7989        get_task_struct(p);
7990        rcu_read_unlock();
7991
7992        if (p->flags & PF_NO_SETAFFINITY) {
7993                retval = -EINVAL;
7994                goto out_put_task;
7995        }
7996
7997        if (!check_same_owner(p)) {
7998                rcu_read_lock();
7999                if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
8000                        rcu_read_unlock();
8001                        retval = -EPERM;
8002                        goto out_put_task;
8003                }
8004                rcu_read_unlock();
8005        }
8006
8007        retval = security_task_setscheduler(p);
8008        if (retval)
8009                goto out_put_task;
8010
8011        retval = __sched_setaffinity(p, in_mask);
8012out_put_task:
8013        put_task_struct(p);
8014        return retval;
8015}
8016
8017static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
8018                             struct cpumask *new_mask)
8019{
8020        if (len < cpumask_size())
8021                cpumask_clear(new_mask);
8022        else if (len > cpumask_size())
8023                len = cpumask_size();
8024
8025        return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
8026}
8027
8028/**
8029 * sys_sched_setaffinity - set the CPU affinity of a process
8030 * @pid: pid of the process
8031 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
8032 * @user_mask_ptr: user-space pointer to the new CPU mask
8033 *
8034 * Return: 0 on success. An error code otherwise.
8035 */
8036SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
8037                unsigned long __user *, user_mask_ptr)
8038{
8039        cpumask_var_t new_mask;
8040        int retval;
8041
8042        if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
8043                return -ENOMEM;
8044
8045        retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
8046        if (retval == 0)
8047                retval = sched_setaffinity(pid, new_mask);
8048        free_cpumask_var(new_mask);
8049        return retval;
8050}
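
/*
 * Illustrative user-space sketch (not part of this file): pinning the
 * calling thread to CPU 0 through the glibc wrapper, which supplies the
 * length argument of the syscall above from sizeof(cpu_set_t).
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		cpu_set_t set;
 *
 *		CPU_ZERO(&set);
 *		CPU_SET(0, &set);
 *		// pid 0 means "the calling thread"
 *		if (sched_setaffinity(0, sizeof(set), &set))
 *			perror("sched_setaffinity");
 *		return 0;
 *	}
 */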
8051
8052long sched_getaffinity(pid_t pid, struct cpumask *mask)
8053{
8054        struct task_struct *p;
8055        unsigned long flags;
8056        int retval;
8057
8058        rcu_read_lock();
8059
8060        retval = -ESRCH;
8061        p = find_process_by_pid(pid);
8062        if (!p)
8063                goto out_unlock;
8064
8065        retval = security_task_getscheduler(p);
8066        if (retval)
8067                goto out_unlock;
8068
8069        raw_spin_lock_irqsave(&p->pi_lock, flags);
8070        cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
8071        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
8072
8073out_unlock:
8074        rcu_read_unlock();
8075
8076        return retval;
8077}
8078
8079/**
8080 * sys_sched_getaffinity - get the CPU affinity of a process
8081 * @pid: pid of the process
8082 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
8083 * @user_mask_ptr: user-space pointer to hold the current CPU mask
8084 *
8085 * Return: size of CPU mask copied to user_mask_ptr on success. An
8086 * error code otherwise.
8087 */
8088SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
8089                unsigned long __user *, user_mask_ptr)
8090{
8091        int ret;
8092        cpumask_var_t mask;
8093
8094        if ((len * BITS_PER_BYTE) < nr_cpu_ids)
8095                return -EINVAL;
8096        if (len & (sizeof(unsigned long)-1))
8097                return -EINVAL;
8098
8099        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
8100                return -ENOMEM;
8101
8102        ret = sched_getaffinity(pid, mask);
8103        if (ret == 0) {
8104                unsigned int retlen = min(len, cpumask_size());
8105
8106                if (copy_to_user(user_mask_ptr, mask, retlen))
8107                        ret = -EFAULT;
8108                else
8109                        ret = retlen;
8110        }
8111        free_cpumask_var(mask);
8112
8113        return ret;
8114}
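
/*
 * Illustrative user-space sketch (not part of this file): note that the
 * raw syscall returns the number of bytes copied into the mask (retlen
 * above), whereas the glibc sched_getaffinity() wrapper hides that and
 * returns 0 on success.
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		cpu_set_t set;
 *		int cpu;
 *
 *		if (sched_getaffinity(0, sizeof(set), &set))
 *			return 1;
 *		for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
 *			if (CPU_ISSET(cpu, &set))
 *				printf("allowed: CPU %d\n", cpu);
 *		return 0;
 *	}
 */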
8115
8116static void do_sched_yield(void)
8117{
8118        struct rq_flags rf;
8119        struct rq *rq;
8120
8121        rq = this_rq_lock_irq(&rf);
8122
8123        schedstat_inc(rq->yld_count);
8124        current->sched_class->yield_task(rq);
8125
8126        preempt_disable();
8127        rq_unlock_irq(rq, &rf);
8128        sched_preempt_enable_no_resched();
8129
8130        schedule();
8131}
8132
8133/**
8134 * sys_sched_yield - yield the current processor to other threads.
8135 *
8136 * This function yields the current CPU to other tasks. If there are no
8137 * other threads running on this CPU, the caller simply continues to run.
8138 *
8139 * Return: 0.
8140 */
8141SYSCALL_DEFINE0(sched_yield)
8142{
8143        do_sched_yield();
8144        return 0;
8145}
8146
8147#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
8148int __sched __cond_resched(void)
8149{
8150        if (should_resched(0)) {
8151                preempt_schedule_common();
8152                return 1;
8153        }
8154        /*
8155         * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
8156         * whether the current CPU is in an RCU read-side critical section,
8157         * so the tick can report quiescent states even for CPUs looping
8158         * in kernel context.  In contrast, in non-preemptible kernels,
8159         * RCU readers leave no in-memory hints, which means that CPU-bound
8160         * processes executing in kernel context might never report an
8161         * RCU quiescent state.  Therefore, the following code causes
8162         * cond_resched() to report a quiescent state, but only when RCU
8163         * is in urgent need of one.
8164         */
8165#ifndef CONFIG_PREEMPT_RCU
8166        rcu_all_qs();
8167#endif
8168        return 0;
8169}
8170EXPORT_SYMBOL(__cond_resched);
8171#endif
8172
8173#ifdef CONFIG_PREEMPT_DYNAMIC
8174DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
8175EXPORT_STATIC_CALL_TRAMP(cond_resched);
8176
8177DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
8178EXPORT_STATIC_CALL_TRAMP(might_resched);
8179#endif
8180
8181/*
8182 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
8183 * call schedule, and on return reacquire the lock.
8184 *
8185 * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
8186 * operations here to prevent schedule() from being called twice (once via
8187 * spin_unlock(), once by hand).
8188 */
8189int __cond_resched_lock(spinlock_t *lock)
8190{
8191        int resched = should_resched(PREEMPT_LOCK_OFFSET);
8192        int ret = 0;
8193
8194        lockdep_assert_held(lock);
8195
8196        if (spin_needbreak(lock) || resched) {
8197                spin_unlock(lock);
8198                if (resched)
8199                        preempt_schedule_common();
8200                else
8201                        cpu_relax();
8202                ret = 1;
8203                spin_lock(lock);
8204        }
8205        return ret;
8206}
8207EXPORT_SYMBOL(__cond_resched_lock);
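
/*
 * Typical usage pattern (illustrative sketch; foo_lock, items[], nr_items
 * and process() are made-up names, and items[] is assumed to stay valid
 * while the lock is dropped): a long scan under a spinlock periodically
 * calls cond_resched_lock(), which lands here.
 *
 *	spin_lock(&foo_lock);
 *	for (i = 0; i < nr_items; i++) {
 *		process(&items[i]);
 *		// drops and re-takes foo_lock (returning 1) if a reschedule
 *		// or a waiter on the lock is pending
 *		cond_resched_lock(&foo_lock);
 *	}
 *	spin_unlock(&foo_lock);
 */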
8208
8209int __cond_resched_rwlock_read(rwlock_t *lock)
8210{
8211        int resched = should_resched(PREEMPT_LOCK_OFFSET);
8212        int ret = 0;
8213
8214        lockdep_assert_held_read(lock);
8215
8216        if (rwlock_needbreak(lock) || resched) {
8217                read_unlock(lock);
8218                if (resched)
8219                        preempt_schedule_common();
8220                else
8221                        cpu_relax();
8222                ret = 1;
8223                read_lock(lock);
8224        }
8225        return ret;
8226}
8227EXPORT_SYMBOL(__cond_resched_rwlock_read);
8228
8229int __cond_resched_rwlock_write(rwlock_t *lock)
8230{
8231        int resched = should_resched(PREEMPT_LOCK_OFFSET);
8232        int ret = 0;
8233
8234        lockdep_assert_held_write(lock);
8235
8236        if (rwlock_needbreak(lock) || resched) {
8237                write_unlock(lock);
8238                if (resched)
8239                        preempt_schedule_common();
8240                else
8241                        cpu_relax();
8242                ret = 1;
8243                write_lock(lock);
8244        }
8245        return ret;
8246}
8247EXPORT_SYMBOL(__cond_resched_rwlock_write);
8248
8249/**
8250 * yield - yield the current processor to other threads.
8251 *
8252 * Do not ever use this function; there's a 99% chance you're doing it wrong.
8253 *
8254 * The scheduler is at all times free to pick the calling task as the most
8255 * eligible task to run; if removing the yield() call from your code breaks
8256 * it, it was already broken.
8257 *
8258 * Typical broken usage is:
8259 *
8260 * while (!event)
8261 *      yield();
8262 *
8263 * where one assumes that yield() will let 'the other' process run that will
8264 * make event true. If the current task is a SCHED_FIFO task that will never
8265 * happen. Never use yield() as a progress guarantee!!
8266 *
8267 * If you want to use yield() to wait for something, use wait_event().
8268 * If you want to use yield() to be 'nice' for others, use cond_resched().
8269 * If you still want to use yield(), do not!
8270 */
8271void __sched yield(void)
8272{
8273        set_current_state(TASK_RUNNING);
8274        do_sched_yield();
8275}
8276EXPORT_SYMBOL(yield);
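
/*
 * Illustrative sketch of the wait_event() alternative recommended above
 * (ev_wq and event are made-up names): rather than spinning on yield(),
 * the waiter sleeps until the producer makes the condition true and
 * wakes it up.
 *
 *	static DECLARE_WAIT_QUEUE_HEAD(ev_wq);
 *	static bool event;
 *
 *	// waiter: sleeps until the condition becomes true
 *	wait_event(ev_wq, event);
 *
 *	// producer: make the condition true, then wake the waiter
 *	event = true;
 *	wake_up(&ev_wq);
 */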
8277
8278/**
8279 * yield_to - yield the current processor to another thread in
8280 * your thread group, or accelerate that thread toward the
8281 * processor it's on.
8282 * @p: target task
8283 * @preempt: whether task preemption is allowed or not
8284 *
8285 * It's the caller's job to ensure that the target task struct
8286 * can't go away on us before we can do any checks.
8287 *
8288 * Return:
8289 *      true (>0) if we indeed boosted the target task.
8290 *      false (0) if we failed to boost the target.
8291 *      -ESRCH if there's no task to yield to.
8292 */
8293int __sched yield_to(struct task_struct *p, bool preempt)
8294{
8295        struct task_struct *curr = current;
8296        struct rq *rq, *p_rq;
8297        unsigned long flags;
8298        int yielded = 0;
8299
8300        local_irq_save(flags);
8301        rq = this_rq();
8302
8303again:
8304        p_rq = task_rq(p);
8305        /*
8306         * If we're the only runnable task on the rq and target rq also
8307         * has only one task, there's absolutely no point in yielding.
8308         */
8309        if (rq->nr_running == 1 && p_rq->nr_running == 1) {
8310                yielded = -ESRCH;
8311                goto out_irq;
8312        }
8313
8314        double_rq_lock(rq, p_rq);
8315        if (task_rq(p) != p_rq) {
8316                double_rq_unlock(rq, p_rq);
8317                goto again;
8318        }
8319
8320        if (!curr->sched_class->yield_to_task)
8321                goto out_unlock;
8322
8323        if (curr->sched_class != p->sched_class)
8324                goto out_unlock;
8325
8326        if (task_running(p_rq, p) || !task_is_running(p))
8327                goto out_unlock;
8328
8329        yielded = curr->sched_class->yield_to_task(rq, p);
8330        if (yielded) {
8331                schedstat_inc(rq->yld_count);
8332                /*
8333                 * Make p's CPU reschedule; pick_next_entity takes care of
8334                 * fairness.
8335                 */
8336                if (preempt && rq != p_rq)
8337                        resched_curr(p_rq);
8338        }
8339
8340out_unlock:
8341        double_rq_unlock(rq, p_rq);
8342out_irq:
8343        local_irq_restore(flags);
8344
8345        if (yielded > 0)
8346                schedule();
8347
8348        return yielded;
8349}
8350EXPORT_SYMBOL_GPL(yield_to);
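
/*
 * Illustrative caller sketch (boost_task() is a made-up helper; directed
 * yield in virtualization code, e.g. KVM's pause-loop handling, is the
 * typical real user): the caller must guarantee that @target cannot go
 * away, e.g. by holding a reference.
 *
 *	static void boost_task(struct task_struct *target)
 *	{
 *		int ret = yield_to(target, false);
 *
 *		if (ret > 0)
 *			;	// we yielded and the target was boosted
 *		else if (ret == 0)
 *			;	// boost failed, e.g. different sched class
 *		else
 *			;	// -ESRCH: nothing to yield to on either rq
 *	}
 */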
8351
8352int io_schedule_prepare(void)
8353{
8354        int old_iowait = current->in_iowait;
8355
8356        current->in_iowait = 1;
8357        blk_schedule_flush_plug(current);
8358
8359        return old_iowait;
8360}
8361
8362void io_schedule_finish(int token)
8363{
8364        current->in_iowait = token;
8365}
8366
8367/*
8368 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
8369 * that process accounting knows that this is a task in IO wait state.
8370 */
8371long __sched io_schedule_timeout(long timeout)
8372{
8373        int token;
8374        long ret;
8375
8376        token = io_schedule_prepare();
8377        ret = schedule_timeout(timeout);
8378        io_schedule_finish(token);
8379
8380        return ret;
8381}
8382EXPORT_SYMBOL(io_schedule_timeout);
8383
8384void __sched io_schedule(void)
8385{
8386        int token;
8387
8388        token = io_schedule_prepare();
8389        schedule();
8390        io_schedule_finish(token);
8391}
8392EXPORT_SYMBOL(io_schedule);
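
/*
 * Illustrative caller sketch (dev->done is a made-up struct completion):
 * waiting for device I/O while being accounted as iowait;
 * wait_for_completion_io_timeout() is built on io_schedule_timeout() above.
 *
 *	if (!wait_for_completion_io_timeout(&dev->done, msecs_to_jiffies(100)))
 *		pr_err("timed out waiting for the device\n");
 */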
8393
8394/**
8395 * sys_sched_get_priority_max - return maximum RT priority.
8396 * @policy: scheduling policy.
8397 *
8398 * Return: On success, this syscall returns the maximum
8399 * rt_priority that can be used by a given scheduling class.
8400 * On failure, a negative error code is returned.
8401 */
8402SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
8403{
8404        int ret = -EINVAL;
8405
8406        switch (policy) {
8407        case SCHED_FIFO:
8408        case SCHED_RR:
8409                ret = MAX_RT_PRIO-1;
8410                break;
8411        case SCHED_DEADLINE:
8412        case SCHED_NORMAL:
8413        case SCHED_BATCH:
8414        case SCHED_IDLE:
8415                ret = 0;
8416                break;
8417        }
8418        return ret;
8419}
8420
8421/**
8422 * sys_sched_get_priority_min - return minimum RT priority.
8423 * @policy: scheduling policy.
8424 *
8425 * Return: On success, this syscall returns the minimum
8426 * rt_priority that can be used by a given scheduling class.
8427 * On failure, a negative error code is returned.
8428 */
8429SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
8430{
8431        int ret = -EINVAL;
8432
8433        switch (policy) {
8434        case SCHED_FIFO:
8435        case SCHED_RR:
8436                ret = 1;
8437                break;
8438        case SCHED_DEADLINE:
8439        case SCHED_NORMAL:
8440        case SCHED_BATCH:
8441        case SCHED_IDLE:
8442                ret = 0;
8443        }
8444        return ret;
8445}
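
/*
 * Illustrative user-space sketch (not part of this file): the reported
 * range (1..99 for SCHED_FIFO/SCHED_RR on Linux) is then used to pick a
 * valid static priority. Switching to SCHED_FIFO needs CAP_SYS_NICE or a
 * suitable RLIMIT_RTPRIO.
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		int max = sched_get_priority_max(SCHED_FIFO);
 *		int min = sched_get_priority_min(SCHED_FIFO);
 *		struct sched_param sp = { .sched_priority = min + (max - min) / 2 };
 *
 *		printf("SCHED_FIFO priorities: %d..%d\n", min, max);
 *		return sched_setscheduler(0, SCHED_FIFO, &sp) ? 1 : 0;
 *	}
 */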
8446
8447static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
8448{
8449        struct task_struct *p;
8450        unsigned int time_slice;
8451        struct rq_flags rf;
8452        struct rq *rq;
8453        int retval;
8454
8455        if (pid < 0)
8456                return -EINVAL;
8457
8458        retval = -ESRCH;
8459        rcu_read_lock();
8460        p = find_process_by_pid(pid);
8461        if (!p)
8462                goto out_unlock;
8463
8464        retval = security_task_getscheduler(p);
8465        if (retval)
8466                goto out_unlock;
8467
8468        rq = task_rq_lock(p, &rf);
8469        time_slice = 0;
8470        if (p->sched_class->get_rr_interval)
8471                time_slice = p->sched_class->get_rr_interval(rq, p);
8472        task_rq_unlock(rq, p, &rf);
8473
8474        rcu_read_unlock();
8475        jiffies_to_timespec64(time_slice, t);
8476        return 0;
8477
8478out_unlock:
8479        rcu_read_unlock();
8480        return retval;
8481}
8482
8483/**
8484 * sys_sched_rr_get_interval - return the default timeslice of a process.
8485 * @pid: pid of the process.
8486 * @interval: userspace pointer to the timeslice value.
8487 *
8488 * This syscall writes the default timeslice value of a given process
8489 * into the user-space timespec buffer. A value of '0' means infinity.
8490 *
8491 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
8492 * an error code.
8493 */
8494SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
8495                struct __kernel_timespec __user *, interval)
8496{
8497        struct timespec64 t;
8498        int retval = sched_rr_get_interval(pid, &t);
8499
8500        if (retval == 0)
8501                retval = put_timespec64(&t, interval);
8502
8503        return retval;
8504}
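
/*
 * Illustrative user-space sketch (not part of this file): reading the
 * round-robin timeslice of the calling thread via the glibc wrapper; as
 * documented above, a value of 0 means infinity.
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *	#include <time.h>
 *
 *	int main(void)
 *	{
 *		struct timespec ts;
 *
 *		if (sched_rr_get_interval(0, &ts))
 *			return 1;
 *		printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
 *		return 0;
 *	}
 */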
8505
8506#ifdef CONFIG_COMPAT_32BIT_TIME
8507SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
8508                struct old_timespec32 __user *, interval)
8509{
8510        struct timespec64 t;
8511        int retval = sched_rr_get_interval(pid, &t);
8512
8513        if (retval == 0)
8514                retval = put_old_timespec32(&t, interval);
8515        return retval;
8516}
8517#endif
8518
8519void sched_show_task(struct task_struct *p)
8520{
8521        unsigned long free = 0;
8522        int ppid;
8523
8524        if (!try_get_task_stack(p))
8525                return;
8526
8527        pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
8528
8529        if (task_is_running(p))
8530                pr_cont("  running task    ");
8531#ifdef CONFIG_DEBUG_STACK_USAGE
8532        free = stack_not_used(p);
8533#endif
8534        ppid = 0;
8535        rcu_read_lock();
8536        if (pid_alive(p))
8537                ppid = task_pid_nr(rcu_dereference(p->real_parent));
8538        rcu_read_unlock();
8539        pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
8540                free, task_pid_nr(p), ppid,
8541                (unsigned long)task_thread_info(p)->flags);
8542
8543        print_worker_info(KERN_INFO, p);
8544        print_stop_info(KERN_INFO, p);
8545        show_stack(p, NULL, KERN_INFO);
8546        put_task_stack(p);
8547}
8548EXPORT_SYMBOL_GPL(sched_show_task);
8549
8550static inline bool
8551state_filter_match(unsigned long state_filter, struct task_struct *p)
8552{
8553        unsigned int state = READ_ONCE(p->__state);
8554
8555        /* no filter, everything matches */
8556        if (!state_filter)
8557                return true;
8558
8559        /* filter, but doesn't match */
8560        if (!(state & state_filter))
8561                return false;
8562
8563        /*
8564         * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
8565         * TASK_KILLABLE).
8566         */
8567        if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
8568                return false;
8569
8570        return true;
8571}
8572
8573
8574void show_state_filter(unsigned int state_filter)
8575{
8576        struct task_struct *g, *p;
8577
8578        rcu_read_lock();
8579        for_each_process_thread(g, p) {
8580                 * Reset the NMI watchdog timeout; listing all tasks on a slow
8581                 * console might take a lot of time.
8582                 * console might take a lot of time:
8583                 * Also, reset softlockup watchdogs on all CPUs, because
8584                 * another CPU might be blocked waiting for us to process
8585                 * an IPI.
8586                 */
8587                touch_nmi_watchdog();
8588                touch_all_softlockup_watchdogs();
8589                if (state_filter_match(state_filter, p))
8590                        sched_show_task(p);
8591        }
8592
8593#ifdef CONFIG_SCHED_DEBUG
8594        if (!state_filter)
8595                sysrq_sched_debug_show();
8596#endif
8597        rcu_read_unlock();
8598        /*
8599         * Only show locks if all tasks are dumped:
8600         */
8601        if (!state_filter)
8602                debug_show_all_locks();
8603}
8604
8605/**
8606 * init_idle - set up an idle thread for a given CPU
8607 * @idle: task in question
8608 * @cpu: CPU the idle task belongs to
8609 *
8610 * NOTE: this function does not set the idle thread's NEED_RESCHED
8611 * flag, to make booting more robust.
8612 */
8613void __init init_idle(struct task_struct *idle, int cpu)
8614{
8615        struct rq *rq = cpu_rq(cpu);
8616        unsigned long flags;
8617
8618        __sched_fork(0, idle);
8619
8620        /*
8621         * The idle task doesn't need the kthread struct to function, but it
8622         * is dressed up as a per-CPU kthread and thus needs to play the part
8623         * if we want to avoid special-casing it in code that deals with per-CPU
8624         * kthreads.
8625         */
8626        set_kthread_struct(idle);
8627
8628        raw_spin_lock_irqsave(&idle->pi_lock, flags);
8629        raw_spin_rq_lock(rq);
8630
8631        idle->__state = TASK_RUNNING;
8632        idle->se.exec_start = sched_clock();
8633        /*
8634         * PF_KTHREAD should already be set at this point; regardless, make it
8635         * look like a proper per-CPU kthread.
8636         */
8637        idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
8638        kthread_set_per_cpu(idle, cpu);
8639
8640        scs_task_reset(idle);
8641        kasan_unpoison_task_stack(idle);
8642
8643#ifdef CONFIG_SMP
8644         * It's possible that init_idle() gets called multiple times on a task;
8645         * It's possible that init_idle() gets called multiple times on a task,
8646         * in that case do_set_cpus_allowed() will not do the right thing.
8647         *
8648         * And since this is boot we can forgo the serialization.
8649         */
8650        set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
8651#endif
8652        /*
8653         * We're having a chicken-and-egg problem: even though we are
8654         * holding rq->lock, the CPU isn't yet set to this CPU so the
8655         * lockdep check in task_group() will fail.
8656         *
8657         * This is a similar case to sched_fork(); alternatively we could
8658         * use task_rq_lock() here and obtain the other rq->lock.
8659         *
8660         * Silence PROVE_RCU
8661         */
8662        rcu_read_lock();
8663        __set_task_cpu(idle, cpu);
8664        rcu_read_unlock();
8665
8666        rq->idle = idle;
8667        rcu_assign_pointer(rq->curr, idle);
8668        idle->on_rq = TASK_ON_RQ_QUEUED;
8669#ifdef CONFIG_SMP
8670        idle->on_cpu = 1;
8671#endif
8672        raw_spin_rq_unlock(rq);
8673        raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
8674
8675        /* Set the preempt count _outside_ the spinlocks! */
8676        init_idle_preempt_count(idle, cpu);
8677
8678        /*
8679         * The idle tasks have their own, simple scheduling class:
8680         */
8681        idle->sched_class = &idle_sched_class;
8682        ftrace_graph_init_idle_task(idle, cpu);
8683        vtime_init_idle(idle, cpu);
8684#ifdef CONFIG_SMP
8685        sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
8686#endif
8687}
8688
8689#ifdef CONFIG_SMP
8690
8691int cpuset_cpumask_can_shrink(const struct cpumask *cur,
8692                              const struct cpumask *trial)
8693{
8694        int ret = 1;
8695
8696        if (!cpumask_weight(cur))
8697                return ret;
8698
8699        ret = dl_cpuset_cpumask_can_shrink(cur, trial);
8700
8701        return ret;
8702}
8703
8704int task_can_attach(struct task_struct *p,
8705                    const struct cpumask *cs_cpus_allowed)
8706{
8707        int ret = 0;
8708
8709        /*
8710         * Kthreads which disallow setaffinity shouldn't be moved
8711         * to a new cpuset; we don't want to change their CPU
8712         * affinity and isolating such threads by their set of
8713         * allowed nodes is unnecessary.  Thus, cpusets are not
8714         * applicable for such threads.  This prevents checking for
8715         * success of set_cpus_allowed_ptr() on all attached tasks
8716         * before cpus_mask may be changed.
8717         */
8718        if (p->flags & PF_NO_SETAFFINITY) {
8719                ret = -EINVAL;
8720                goto out;
8721        }
8722
8723        if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
8724                                              cs_cpus_allowed))
8725                ret = dl_task_can_attach(p, cs_cpus_allowed);
8726
8727out:
8728        return ret;
8729}
8730
8731bool sched_smp_initialized __read_mostly;
8732
8733#ifdef CONFIG_NUMA_BALANCING
8734/* Migrate current task p to target_cpu */
8735int migrate_task_to(struct task_struct *p, int target_cpu)
8736{
8737        struct migration_arg arg = { p, target_cpu };
8738        int curr_cpu = task_cpu(p);
8739
8740        if (curr_cpu == target_cpu)
8741                return 0;
8742
8743        if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
8744                return -EINVAL;
8745
8746        /* TODO: This is not properly updating schedstats */
8747
8748        trace_sched_move_numa(p, curr_cpu, target_cpu);
8749        return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
8750}
8751
8752/*
8753 * Requeue a task on a given node and accurately track the number of NUMA
8754 * tasks on the runqueues
8755 */
8756void sched_setnuma(struct task_struct *p, int nid)
8757{
8758        bool queued, running;
8759        struct rq_flags rf;
8760        struct rq *rq;
8761
8762        rq = task_rq_lock(p, &rf);
8763        queued = task_on_rq_queued(p);
8764        running = task_current(rq, p);
8765
8766        if (queued)
8767                dequeue_task(rq, p, DEQUEUE_SAVE);
8768        if (running)
8769                put_prev_task(rq, p);
8770
8771        p->numa_preferred_nid = nid;
8772
8773        if (queued)
8774                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
8775        if (running)
8776                set_next_task(rq, p);
8777        task_rq_unlock(rq, p, &rf);
8778}
8779#endif /* CONFIG_NUMA_BALANCING */
8780
8781#ifdef CONFIG_HOTPLUG_CPU
8782/*
8783 * Ensure that the idle task is using init_mm right before its CPU goes
8784 * offline.
8785 */
8786void idle_task_exit(void)
8787{
8788        struct mm_struct *mm = current->active_mm;
8789
8790        BUG_ON(cpu_online(smp_processor_id()));
8791        BUG_ON(current != this_rq()->idle);
8792
8793        if (mm != &init_mm) {
8794                switch_mm(mm, &init_mm, current);
8795                finish_arch_post_lock_switch();
8796        }
8797
8798        scs_task_reset(current);
8799        /* finish_cpu(), as run on the BP, will clean up the active_mm state */
8800}
8801
8802static int __balance_push_cpu_stop(void *arg)
8803{
8804        struct task_struct *p = arg;
8805        struct rq *rq = this_rq();
8806        struct rq_flags rf;
8807        int cpu;
8808
8809        raw_spin_lock_irq(&p->pi_lock);
8810        rq_lock(rq, &rf);
8811
8812        update_rq_clock(rq);
8813
8814        if (task_rq(p) == rq && task_on_rq_queued(p)) {
8815                cpu = select_fallback_rq(rq->cpu, p);
8816                rq = __migrate_task(rq, &rf, p, cpu);
8817        }
8818
8819        rq_unlock(rq, &rf);
8820        raw_spin_unlock_irq(&p->pi_lock);
8821
8822        put_task_struct(p);
8823
8824        return 0;
8825}
8826
8827static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
8828
8829/*
8830 * Ensure we only run per-cpu kthreads once the CPU goes !active.
8831 *
8832 * This is enabled below SCHED_AP_ACTIVE, i.e. when !cpu_active(), but it is
8833 * only effective while the CPU is going down.
8834 */
8835static void balance_push(struct rq *rq)
8836{
8837        struct task_struct *push_task = rq->curr;
8838
8839        lockdep_assert_rq_held(rq);
8840
8841        /*
8842         * Ensure the callback stays installed until balance_push_set(.on = false);
8843         */
8844        rq->balance_callback = &balance_push_callback;
8845
8846        /*
8847         * Only active while going offline and when invoked on the outgoing
8848         * CPU.
8849         */
8850        if (!cpu_dying(rq->cpu) || rq != this_rq())
8851                return;
8852
8853        /*
8854         * Both the cpu-hotplug and stop task are in this case and are
8855         * required to complete the hotplug process.
8856         */
8857        if (kthread_is_per_cpu(push_task) ||
8858            is_migration_disabled(push_task)) {
8859
8860                /*
8861                 * If this is the idle task on the outgoing CPU try to wake
8862                 * up the hotplug control thread which might wait for the
8863                 * last task to vanish. The rcuwait_active() check is
8864                 * accurate here because the waiter is pinned on this CPU
8865                 * and obviously can't be running in parallel.
8866                 *
8867                 * On RT kernels this also has to check whether there are
8868                 * pinned and scheduled out tasks on the runqueue. They
8869                 * need to leave the migrate disabled section first.
8870                 */
8871                if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
8872                    rcuwait_active(&rq->hotplug_wait)) {
8873                        raw_spin_rq_unlock(rq);
8874                        rcuwait_wake_up(&rq->hotplug_wait);
8875                        raw_spin_rq_lock(rq);
8876                }
8877                return;
8878        }
8879
8880        get_task_struct(push_task);
8881        /*
8882         * Temporarily drop rq->lock such that we can wake-up the stop task.
8883         * Both preemption and IRQs are still disabled.
8884         */
8885        raw_spin_rq_unlock(rq);
8886        stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
8887                            this_cpu_ptr(&push_work));
8888        /*
8889         * At this point need_resched() is true and we'll take the loop in
8890         * schedule(). The next pick is obviously going to be the stop task
8891         * which kthread_is_per_cpu() and will push this task away.
8892         */
8893        raw_spin_rq_lock(rq);
8894}
8895
8896static void balance_push_set(int cpu, bool on)
8897{
8898        struct rq *rq = cpu_rq(cpu);
8899        struct rq_flags rf;
8900
8901        rq_lock_irqsave(rq, &rf);
8902        if (on) {
8903                WARN_ON_ONCE(rq->balance_callback);
8904                rq->balance_callback = &balance_push_callback;
8905        } else if (rq->balance_callback == &balance_push_callback) {
8906                rq->balance_callback = NULL;
8907        }
8908        rq_unlock_irqrestore(rq, &rf);
8909}
8910
8911/*
8912 * Invoked from a CPU's hotplug control thread after the CPU has been marked
8913 * inactive. All tasks which are not per CPU kernel threads are either
8914 * pushed off this CPU now via balance_push() or placed on a different CPU
8915 * during wakeup. Wait until the CPU is quiescent.
8916 */
8917static void balance_hotplug_wait(void)
8918{
8919        struct rq *rq = this_rq();
8920
8921        rcuwait_wait_event(&rq->hotplug_wait,
8922                           rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
8923                           TASK_UNINTERRUPTIBLE);
8924}
8925
8926#else
8927
8928static inline void balance_push(struct rq *rq)
8929{
8930}
8931
8932static inline void balance_push_set(int cpu, bool on)
8933{
8934}
8935
8936static inline void balance_hotplug_wait(void)
8937{
8938}
8939
8940#endif /* CONFIG_HOTPLUG_CPU */
8941
8942void set_rq_online(struct rq *rq)
8943{
8944        if (!rq->online) {
8945                const struct sched_class *class;
8946
8947                cpumask_set_cpu(rq->cpu, rq->rd->online);
8948                rq->online = 1;
8949
8950                for_each_class(class) {
8951                        if (class->rq_online)
8952                                class->rq_online(rq);
8953                }
8954        }
8955}
8956
8957void set_rq_offline(struct rq *rq)
8958{
8959        if (rq->online) {
8960                const struct sched_class *class;
8961
8962                for_each_class(class) {
8963                        if (class->rq_offline)
8964                                class->rq_offline(rq);
8965                }
8966
8967                cpumask_clear_cpu(rq->cpu, rq->rd->online);
8968                rq->online = 0;
8969        }
8970}
8971
8972/*
8973 * used to mark begin/end of suspend/resume:
8974 */
8975static int num_cpus_frozen;
8976
8977/*
8978 * Update cpusets according to cpu_active mask.  If cpusets are
8979 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
8980 * around partition_sched_domains().
8981 *
8982 * If we come here as part of a suspend/resume, don't touch cpusets because we
8983 * want to restore it back to its original state upon resume anyway.
8984 */
8985static void cpuset_cpu_active(void)
8986{
8987        if (cpuhp_tasks_frozen) {
8988                /*
8989                 * num_cpus_frozen tracks how many CPUs are involved in the
8990                 * suspend/resume sequence. As long as this is not the last online
8991                 * operation in the resume sequence, just build a single sched
8992                 * domain, ignoring cpusets.
8993                 */
8994                partition_sched_domains(1, NULL, NULL);
8995                if (--num_cpus_frozen)
8996                        return;
8997                /*
8998                 * This is the last CPU online operation. So fall through and
8999                 * restore the original sched domains by considering the
9000                 * cpuset configurations.
9001                 */
9002                cpuset_force_rebuild();
9003        }
9004        cpuset_update_active_cpus();
9005}
9006
9007static int cpuset_cpu_inactive(unsigned int cpu)
9008{
9009        if (!cpuhp_tasks_frozen) {
9010                if (dl_cpu_busy(cpu))
9011                        return -EBUSY;
9012                cpuset_update_active_cpus();
9013        } else {
9014                num_cpus_frozen++;
9015                partition_sched_domains(1, NULL, NULL);
9016        }
9017        return 0;
9018}
9019
9020int sched_cpu_activate(unsigned int cpu)
9021{
9022        struct rq *rq = cpu_rq(cpu);
9023        struct rq_flags rf;
9024
9025        /*
9026         * Clear the balance_push callback and prepare to schedule
9027         * regular tasks.
9028         */
9029        balance_push_set(cpu, false);
9030
9031#ifdef CONFIG_SCHED_SMT
9032        /*
9033         * When going up, increment the number of cores with SMT present.
9034         */
9035        if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
9036                static_branch_inc_cpuslocked(&sched_smt_present);
9037#endif
9038        set_cpu_active(cpu, true);
9039
9040        if (sched_smp_initialized) {
9041                sched_domains_numa_masks_set(cpu);
9042                cpuset_cpu_active();
9043        }
9044
9045        /*
9046         * Put the rq online, if not already. This happens:
9047         *
9048         * 1) In the early boot process, because we build the real domains
9049         *    after all CPUs have been brought up.
9050         *
9051         * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
9052         *    domains.
9053         */
9054        rq_lock_irqsave(rq, &rf);
9055        if (rq->rd) {
9056                BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
9057                set_rq_online(rq);
9058        }
9059        rq_unlock_irqrestore(rq, &rf);
9060
9061        return 0;
9062}
9063
9064int sched_cpu_deactivate(unsigned int cpu)
9065{
9066        struct rq *rq = cpu_rq(cpu);
9067        struct rq_flags rf;
9068        int ret;
9069
9070        /*
9071         * Remove the CPU from nohz.idle_cpus_mask to prevent it from participating
9072         * in load balancing while it is not active.
9073         */
9074        nohz_balance_exit_idle(rq);
9075
9076        set_cpu_active(cpu, false);
9077
9078        /*
9079         * From this point forward, this CPU will refuse to run any task that
9080         * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
9081         * push those tasks away until this gets cleared, see
9082         * sched_cpu_dying().
9083         */
9084        balance_push_set(cpu, true);
9085
9086        /*
9087         * We've cleared cpu_active_mask / set balance_push, wait for all
9088         * preempt-disabled and RCU users of this state to go away such that
9089         * all new such users will observe it.
9090         *
9091         * Specifically, we rely on ttwu to no longer target this CPU, see
9092         * ttwu_queue_cond() and is_cpu_allowed().
9093         *
9094         * Do the sync before parking the smpboot threads to take care of the RCU boost case.
9095         */
9096        synchronize_rcu();
9097
9098        rq_lock_irqsave(rq, &rf);
9099        if (rq->rd) {
9100                update_rq_clock(rq);
9101                BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
9102                set_rq_offline(rq);
9103        }
9104        rq_unlock_irqrestore(rq, &rf);
9105
9106#ifdef CONFIG_SCHED_SMT
9107        /*
9108         * When going down, decrement the number of cores with SMT present.
9109         */
9110        if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
9111                static_branch_dec_cpuslocked(&sched_smt_present);
9112
9113        sched_core_cpu_deactivate(cpu);
9114#endif
9115
9116        if (!sched_smp_initialized)
9117                return 0;
9118
9119        ret = cpuset_cpu_inactive(cpu);
9120        if (ret) {
9121                balance_push_set(cpu, false);
9122                set_cpu_active(cpu, true);
9123                return ret;
9124        }
9125        sched_domains_numa_masks_clear(cpu);
9126        return 0;
9127}
9128
9129static void sched_rq_cpu_starting(unsigned int cpu)
9130{
9131        struct rq *rq = cpu_rq(cpu);
9132
9133        rq->calc_load_update = calc_load_update;
9134        update_max_interval();
9135}
9136
9137int sched_cpu_starting(unsigned int cpu)
9138{
9139        sched_core_cpu_starting(cpu);
9140        sched_rq_cpu_starting(cpu);
9141        sched_tick_start(cpu);
9142        return 0;
9143}
9144
9145#ifdef CONFIG_HOTPLUG_CPU
9146
9147/*
9148 * Invoked immediately before the stopper thread is invoked to bring the
9149 * CPU down completely. At this point all per CPU kthreads except the
9150 * hotplug thread (current) and the stopper thread (inactive) have been
9151 * either parked or have been unbound from the outgoing CPU. Ensure that
9152 * any of those which might be on the way out are gone.
9153 *
9154 * If after this point a bound task is being woken on this CPU then the
9155 * responsible hotplug callback has failed to do its job.
9156 * sched_cpu_dying() will catch it with the appropriate fireworks.
9157 */
9158int sched_cpu_wait_empty(unsigned int cpu)
9159{
9160        balance_hotplug_wait();
9161        return 0;
9162}
9163
9164/*
9165 * Since this CPU is going 'away' for a while, fold any nr_active delta we
9166 * might have. Called from the CPU stopper task after ensuring that the
9167 * stopper is the last running task on the CPU, so nr_active count is
9168 * stable. We need to take into account the teardown thread which is calling
9169 * this, so we hand in adjust = 1 to the load calculation.
9170 *
9171 * Also see the comment "Global load-average calculations".
9172 */
9173static void calc_load_migrate(struct rq *rq)
9174{
9175        long delta = calc_load_fold_active(rq, 1);
9176
9177        if (delta)
9178                atomic_long_add(delta, &calc_load_tasks);
9179}
9180
9181static void dump_rq_tasks(struct rq *rq, const char *loglvl)
9182{
9183        struct task_struct *g, *p;
9184        int cpu = cpu_of(rq);
9185
9186        lockdep_assert_rq_held(rq);
9187
9188        printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
9189        for_each_process_thread(g, p) {
9190                if (task_cpu(p) != cpu)
9191                        continue;
9192
9193                if (!task_on_rq_queued(p))
9194                        continue;
9195
9196                printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
9197        }
9198}
9199
9200int sched_cpu_dying(unsigned int cpu)
9201{
9202        struct rq *rq = cpu_rq(cpu);
9203        struct rq_flags rf;
9204
9205        /* Handle pending wakeups and then migrate everything off */
9206        sched_tick_stop(cpu);
9207
9208        rq_lock_irqsave(rq, &rf);
9209        if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
9210                WARN(true, "Dying CPU not properly vacated!");
9211                dump_rq_tasks(rq, KERN_WARNING);
9212        }
9213        rq_unlock_irqrestore(rq, &rf);
9214
9215        calc_load_migrate(rq);
9216        update_max_interval();
9217        hrtick_clear(rq);
9218        sched_core_cpu_dying(cpu);
9219        return 0;
9220}
9221#endif
9222
9223void __init sched_init_smp(void)
9224{
9225        sched_init_numa();
9226
9227        /*
9228         * There's no userspace yet to cause hotplug operations; hence all the
9229         * CPU masks are stable and all blatant races in the below code cannot
9230         * happen.
9231         */
9232        mutex_lock(&sched_domains_mutex);
9233        sched_init_domains(cpu_active_mask);
9234        mutex_unlock(&sched_domains_mutex);
9235
9236        /* Move init over to a non-isolated CPU */
9237        if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
9238                BUG();
9239        current->flags &= ~PF_NO_SETAFFINITY;
9240        sched_init_granularity();
9241
9242        init_sched_rt_class();
9243        init_sched_dl_class();
9244
9245        sched_smp_initialized = true;
9246}
9247
9248static int __init migration_init(void)
9249{
9250        sched_cpu_starting(smp_processor_id());
9251        return 0;
9252}
9253early_initcall(migration_init);
9254
9255#else
9256void __init sched_init_smp(void)
9257{
9258        sched_init_granularity();
9259}
9260#endif /* CONFIG_SMP */
9261
9262int in_sched_functions(unsigned long addr)
9263{
9264        return in_lock_functions(addr) ||
9265                (addr >= (unsigned long)__sched_text_start
9266                && addr < (unsigned long)__sched_text_end);
9267}
9268
9269#ifdef CONFIG_CGROUP_SCHED
9270/*
9271 * Default task group.
9272 * Every task in the system belongs to this group at bootup.
9273 */
9274struct task_group root_task_group;
9275LIST_HEAD(task_groups);
9276
9277/* Cacheline aligned slab cache for task_group */
9278static struct kmem_cache *task_group_cache __read_mostly;
9279#endif
9280
9281DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
9282DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
9283
9284void __init sched_init(void)
9285{
9286        unsigned long ptr = 0;
9287        int i;
9288
9289        /* Make sure the linker didn't screw up */
9290        BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
9291               &fair_sched_class + 1 != &rt_sched_class ||
9292               &rt_sched_class + 1   != &dl_sched_class);
9293#ifdef CONFIG_SMP
9294        BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
9295#endif
9296
9297        wait_bit_init();
9298
9299#ifdef CONFIG_FAIR_GROUP_SCHED
9300        ptr += 2 * nr_cpu_ids * sizeof(void **);
9301#endif
9302#ifdef CONFIG_RT_GROUP_SCHED
9303        ptr += 2 * nr_cpu_ids * sizeof(void **);
9304#endif
9305        if (ptr) {
9306                ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
9307
9308#ifdef CONFIG_FAIR_GROUP_SCHED
9309                root_task_group.se = (struct sched_entity **)ptr;
9310                ptr += nr_cpu_ids * sizeof(void **);
9311
9312                root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9313                ptr += nr_cpu_ids * sizeof(void **);
9314
9315                root_task_group.shares = ROOT_TASK_GROUP_LOAD;
9316                init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
9317#endif /* CONFIG_FAIR_GROUP_SCHED */
9318#ifdef CONFIG_RT_GROUP_SCHED
9319                root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9320                ptr += nr_cpu_ids * sizeof(void **);
9321
9322                root_task_group.rt_rq = (struct rt_rq **)ptr;
9323                ptr += nr_cpu_ids * sizeof(void **);
9324
9325#endif /* CONFIG_RT_GROUP_SCHED */
9326        }
9327#ifdef CONFIG_CPUMASK_OFFSTACK
9328        for_each_possible_cpu(i) {
9329                per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
9330                        cpumask_size(), GFP_KERNEL, cpu_to_node(i));
9331                per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
9332                        cpumask_size(), GFP_KERNEL, cpu_to_node(i));
9333        }
9334#endif /* CONFIG_CPUMASK_OFFSTACK */
9335
9336        init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
9337        init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
9338
9339#ifdef CONFIG_SMP
9340        init_defrootdomain();
9341#endif
9342
9343#ifdef CONFIG_RT_GROUP_SCHED
9344        init_rt_bandwidth(&root_task_group.rt_bandwidth,
9345                        global_rt_period(), global_rt_runtime());
9346#endif /* CONFIG_RT_GROUP_SCHED */
9347
9348#ifdef CONFIG_CGROUP_SCHED
9349        task_group_cache = KMEM_CACHE(task_group, 0);
9350
9351        list_add(&root_task_group.list, &task_groups);
9352        INIT_LIST_HEAD(&root_task_group.children);
9353        INIT_LIST_HEAD(&root_task_group.siblings);
9354        autogroup_init(&init_task);
9355#endif /* CONFIG_CGROUP_SCHED */
9356
9357        for_each_possible_cpu(i) {
9358                struct rq *rq;
9359
9360                rq = cpu_rq(i);
9361                raw_spin_lock_init(&rq->__lock);
9362                rq->nr_running = 0;
9363                rq->calc_load_active = 0;
9364                rq->calc_load_update = jiffies + LOAD_FREQ;
9365                init_cfs_rq(&rq->cfs);
9366                init_rt_rq(&rq->rt);
9367                init_dl_rq(&rq->dl);
9368#ifdef CONFIG_FAIR_GROUP_SCHED
9369                INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
9370                rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
9371                /*
9372                 * How much CPU bandwidth does root_task_group get?
9373                 *
9374                 * In case of task-groups formed through the cgroup filesystem, it
9375                 * gets 100% of the CPU resources in the system. This overall
9376                 * system CPU resource is divided among the tasks of
9377                 * root_task_group and its child task-groups in a fair manner,
9378                 * based on each entity's (task or task-group's) weight
9379                 * (se->load.weight).
9380                 *
9381                 * In other words, if root_task_group has 10 tasks (each of weight
9382                 * 1024) and two child groups A0 and A1 (of weight 1024 each),
9383                 * then A0's share of the CPU resource is:
9384                 *
9385                 *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
9386                 *
9387                 * We achieve this by letting root_task_group's tasks sit
9388                 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
9389                 */
9390                init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
9391#endif /* CONFIG_FAIR_GROUP_SCHED */
9392
9393                rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
9394#ifdef CONFIG_RT_GROUP_SCHED
9395                init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
9396#endif
9397#ifdef CONFIG_SMP
9398                rq->sd = NULL;
9399                rq->rd = NULL;
9400                rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
9401                rq->balance_callback = &balance_push_callback;
9402                rq->active_balance = 0;
9403                rq->next_balance = jiffies;
9404                rq->push_cpu = 0;
9405                rq->cpu = i;
9406                rq->online = 0;
9407                rq->idle_stamp = 0;
9408                rq->avg_idle = 2*sysctl_sched_migration_cost;
9409                rq->wake_stamp = jiffies;
9410                rq->wake_avg_idle = rq->avg_idle;
9411                rq->max_idle_balance_cost = sysctl_sched_migration_cost;
9412
9413                INIT_LIST_HEAD(&rq->cfs_tasks);
9414
9415                rq_attach_root(rq, &def_root_domain);
9416#ifdef CONFIG_NO_HZ_COMMON
9417                rq->last_blocked_load_update_tick = jiffies;
9418                atomic_set(&rq->nohz_flags, 0);
9419
9420                INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
9421#endif
9422#ifdef CONFIG_HOTPLUG_CPU
9423                rcuwait_init(&rq->hotplug_wait);
9424#endif
9425#endif /* CONFIG_SMP */
9426                hrtick_rq_init(rq);
9427                atomic_set(&rq->nr_iowait, 0);
9428
9429#ifdef CONFIG_SCHED_CORE
9430                rq->core = rq;
9431                rq->core_pick = NULL;
9432                rq->core_enabled = 0;
9433                rq->core_tree = RB_ROOT;
9434                rq->core_forceidle = false;
9435
9436                rq->core_cookie = 0UL;
9437#endif
9438        }
9439
9440        set_load_weight(&init_task, false);
9441
9442        /*
9443         * The boot idle thread does lazy MMU switching as well:
9444         */
9445        mmgrab(&init_mm);
9446        enter_lazy_tlb(&init_mm, current);
9447
9448        /*
9449         * Make us the idle thread. Technically, schedule() should not be
9450         * called from this thread; however, somewhere below it might be,
9451         * but because we are the idle thread, we just pick up running again
9452         * when this runqueue becomes "idle".
9453         */
9454        init_idle(current, smp_processor_id());
9455
9456        calc_load_update = jiffies + LOAD_FREQ;
9457
9458#ifdef CONFIG_SMP
9459        idle_thread_set_boot_cpu();
9460        balance_push_set(smp_processor_id(), false);
9461#endif
9462        init_sched_fair_class();
9463
9464        psi_init();
9465
9466        init_uclamp();
9467
9468        scheduler_running = 1;
9469}
9470
9471#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
9472static inline int preempt_count_equals(int preempt_offset)
9473{
9474        int nested = preempt_count() + rcu_preempt_depth();
9475
9476        return (nested == preempt_offset);
9477}
9478
9479void __might_sleep(const char *file, int line, int preempt_offset)
9480{
9481        unsigned int state = get_current_state();
9482        /*
9483         * Blocking primitives will set (and therefore destroy) current->state.
9484         * Since we will exit with TASK_RUNNING, make sure we enter with it;
9485         * otherwise we will destroy that state.
9486         */
9487        WARN_ONCE(state != TASK_RUNNING && current->task_state_change,
9488                        "do not call blocking ops when !TASK_RUNNING; "
9489                        "state=%x set at [<%p>] %pS\n", state,
9490                        (void *)current->task_state_change,
9491                        (void *)current->task_state_change);
9492
9493        ___might_sleep(file, line, preempt_offset);
9494}
9495EXPORT_SYMBOL(__might_sleep);
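
/*
 * Illustrative sketch of the bug class this warning catches (buf and size
 * are made-up names): sleeping in a blocking primitive after setting a
 * sleeping task state destroys that state, so the subsequent schedule()
 * may not actually sleep.
 *
 *	set_current_state(TASK_UNINTERRUPTIBLE);
 *	// a GFP_KERNEL allocation may block: this trips the WARN_ONCE() above
 *	// and can return with the task state reset to TASK_RUNNING
 *	buf = kmalloc(size, GFP_KERNEL);
 *	schedule();
 */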
9496
9497void ___might_sleep(const char *file, int line, int preempt_offset)
9498{
9499        /* Ratelimiting timestamp: */
9500        static unsigned long prev_jiffy;
9501
9502        unsigned long preempt_disable_ip;
9503
9504        /* WARN_ON_ONCE() by default, no rate limit required: */
9505        rcu_sleep_check();
9506
9507        if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
9508             !is_idle_task(current) && !current->non_block_count) ||
9509            system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
9510            oops_in_progress)
9511                return;
9512
9513        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9514                return;
9515        prev_jiffy = jiffies;
9516
9517        /* Save this before calling printk(), since that will clobber it: */
9518        preempt_disable_ip = get_preempt_disable_ip(current);
9519
9520        printk(KERN_ERR
9521                "BUG: sleeping function called from invalid context at %s:%d\n",
9522                        file, line);
9523        printk(KERN_ERR
9524                "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
9525                        in_atomic(), irqs_disabled(), current->non_block_count,
9526                        current->pid, current->comm);
9527
9528        if (task_stack_end_corrupted(current))
9529                printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
9530
9531        debug_show_held_locks(current);
9532        if (irqs_disabled())
9533                print_irqtrace_events(current);
9534        if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
9535            && !preempt_count_equals(preempt_offset)) {
9536                pr_err("Preemption disabled at:");
9537                print_ip_sym(KERN_ERR, preempt_disable_ip);
9538        }
9539        dump_stack();
9540        add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
9541}
9542EXPORT_SYMBOL(___might_sleep);
9543
9544void __cant_sleep(const char *file, int line, int preempt_offset)
9545{
9546        static unsigned long prev_jiffy;
9547
9548        if (irqs_disabled())
9549                return;
9550
9551        if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
9552                return;
9553
9554        if (preempt_count() > preempt_offset)
9555                return;
9556
9557        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9558                return;
9559        prev_jiffy = jiffies;
9560
9561        printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
9562        printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
9563                        in_atomic(), irqs_disabled(),
9564                        current->pid, current->comm);
9565
9566        debug_show_held_locks(current);
9567        dump_stack();
9568        add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
9569}
9570EXPORT_SYMBOL_GPL(__cant_sleep);
9571
9572#ifdef CONFIG_SMP
9573void __cant_migrate(const char *file, int line)
9574{
9575        static unsigned long prev_jiffy;
9576
9577        if (irqs_disabled())
9578                return;
9579
9580        if (is_migration_disabled(current))
9581                return;
9582
9583        if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
9584                return;
9585
9586        if (preempt_count() > 0)
9587                return;
9588
9589        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9590                return;
9591        prev_jiffy = jiffies;
9592
9593        pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
9594        pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
9595               in_atomic(), irqs_disabled(), is_migration_disabled(current),
9596               current->pid, current->comm);
9597
9598        debug_show_held_locks(current);
9599        dump_stack();
9600        add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
9601}
9602EXPORT_SYMBOL_GPL(__cant_migrate);
9603#endif
9604#endif
9605
9606#ifdef CONFIG_MAGIC_SYSRQ
9607void normalize_rt_tasks(void)
9608{
9609        struct task_struct *g, *p;
9610        struct sched_attr attr = {
9611                .sched_policy = SCHED_NORMAL,
9612        };
9613
9614        read_lock(&tasklist_lock);
9615        for_each_process_thread(g, p) {
9616                /*
9617                 * Only normalize user tasks:
9618                 */
9619                if (p->flags & PF_KTHREAD)
9620                        continue;
9621
9622                p->se.exec_start = 0;
9623                schedstat_set(p->se.statistics.wait_start,  0);
9624                schedstat_set(p->se.statistics.sleep_start, 0);
9625                schedstat_set(p->se.statistics.block_start, 0);
9626
9627                if (!dl_task(p) && !rt_task(p)) {
9628                        /*
9629                         * Renice negative nice level userspace
9630                         * tasks back to 0:
9631                         */
9632                        if (task_nice(p) < 0)
9633                                set_user_nice(p, 0);
9634                        continue;
9635                }
9636
9637                __sched_setscheduler(p, &attr, false, false);
9638        }
9639        read_unlock(&tasklist_lock);
9640}
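/*
 * Usage note (illustrative): this is normally reached via the SysRq 'n'
 * ("nice all RT tasks") handler, e.g.
 *
 *	# echo n > /proc/sysrq-trigger
 *
 * which demotes every user-space SCHED_FIFO/SCHED_RR/SCHED_DEADLINE task
 * back to SCHED_NORMAL and resets negative nice values to 0, as done above.
 */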
9641
9642#endif /* CONFIG_MAGIC_SYSRQ */
9643
9644#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
9645/*
9646 * These functions are only useful for the IA64 MCA handling, or kdb.
9647 *
9648 * They can only be called when the whole system has been
9649 * stopped - every CPU needs to be quiescent, and no scheduling
9650 * activity can take place. Using them for anything else would
9651 * be a serious bug, and as a result, they aren't even visible
9652 * under any other configuration.
9653 */
9654
9655/**
9656 * curr_task - return the current task for a given CPU.
9657 * @cpu: the processor in question.
9658 *
9659 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
9660 *
9661 * Return: The current task for @cpu.
9662 */
9663struct task_struct *curr_task(int cpu)
9664{
9665        return cpu_curr(cpu);
9666}
9667
9668#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
9669
9670#ifdef CONFIG_IA64
9671/**
9672 * ia64_set_curr_task - set the current task for a given CPU.
9673 * @cpu: the processor in question.
9674 * @p: the task pointer to set.
9675 *
9676 * Description: This function must only be used when non-maskable interrupts
9677 * are serviced on a separate stack. It allows the architecture to switch the
9678 * notion of the current task on a CPU in a non-blocking manner. This function
9679 * must be called with all CPUs synchronized and interrupts disabled. The
9680 * caller must save the original value of the current task (see
9681 * curr_task() above) and restore that value before re-enabling interrupts
9682 * and restarting the system.
9683 *
9684 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
9685 */
9686void ia64_set_curr_task(int cpu, struct task_struct *p)
9687{
9688        cpu_curr(cpu) = p;
9689}
9690
9691#endif
9692
9693#ifdef CONFIG_CGROUP_SCHED
9694/* task_group_lock serializes the addition/removal of task groups */
9695static DEFINE_SPINLOCK(task_group_lock);
9696
9697static inline void alloc_uclamp_sched_group(struct task_group *tg,
9698                                            struct task_group *parent)
9699{
9700#ifdef CONFIG_UCLAMP_TASK_GROUP
9701        enum uclamp_id clamp_id;
9702
9703        for_each_clamp_id(clamp_id) {
9704                uclamp_se_set(&tg->uclamp_req[clamp_id],
9705                              uclamp_none(clamp_id), false);
9706                tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
9707        }
9708#endif
9709}
9710
9711static void sched_free_group(struct task_group *tg)
9712{
9713        free_fair_sched_group(tg);
9714        free_rt_sched_group(tg);
9715        autogroup_free(tg);
9716        kmem_cache_free(task_group_cache, tg);
9717}
9718
9719/* allocate runqueue etc for a new task group */
9720struct task_group *sched_create_group(struct task_group *parent)
9721{
9722        struct task_group *tg;
9723
9724        tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
9725        if (!tg)
9726                return ERR_PTR(-ENOMEM);
9727
9728        if (!alloc_fair_sched_group(tg, parent))
9729                goto err;
9730
9731        if (!alloc_rt_sched_group(tg, parent))
9732                goto err;
9733
9734        alloc_uclamp_sched_group(tg, parent);
9735
9736        return tg;
9737
9738err:
9739        sched_free_group(tg);
9740        return ERR_PTR(-ENOMEM);
9741}
9742
9743void sched_online_group(struct task_group *tg, struct task_group *parent)
9744{
9745        unsigned long flags;
9746
9747        spin_lock_irqsave(&task_group_lock, flags);
9748        list_add_rcu(&tg->list, &task_groups);
9749
9750        /* Root should already exist: */
9751        WARN_ON(!parent);
9752
9753        tg->parent = parent;
9754        INIT_LIST_HEAD(&tg->children);
9755        list_add_rcu(&tg->siblings, &parent->children);
9756        spin_unlock_irqrestore(&task_group_lock, flags);
9757
9758        online_fair_sched_group(tg);
9759}
9760
9761/* rcu callback to free various structures associated with a task group */
9762static void sched_free_group_rcu(struct rcu_head *rhp)
9763{
9764        /* Now it should be safe to free those cfs_rqs: */
9765        sched_free_group(container_of(rhp, struct task_group, rcu));
9766}
9767
9768void sched_destroy_group(struct task_group *tg)
9769{
9770        /* Wait for possible concurrent references to cfs_rqs to complete: */
9771        call_rcu(&tg->rcu, sched_free_group_rcu);
9772}
9773
9774void sched_offline_group(struct task_group *tg)
9775{
9776        unsigned long flags;
9777
9778        /* End participation in shares distribution: */
9779        unregister_fair_sched_group(tg);
9780
9781        spin_lock_irqsave(&task_group_lock, flags);
9782        list_del_rcu(&tg->list);
9783        list_del_rcu(&tg->siblings);
9784        spin_unlock_irqrestore(&task_group_lock, flags);
9785}
9786
9787static void sched_change_group(struct task_struct *tsk, int type)
9788{
9789        struct task_group *tg;
9790
9791        /*
9792         * All callers are synchronized by task_rq_lock(); we do not use RCU
9793         * locking, which would be pointless here. Thus, we pass "true" to
9794         * task_css_check() to prevent lockdep warnings.
9795         */
9796        tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
9797                          struct task_group, css);
9798        tg = autogroup_task_group(tsk, tg);
9799        tsk->sched_task_group = tg;
9800
9801#ifdef CONFIG_FAIR_GROUP_SCHED
9802        if (tsk->sched_class->task_change_group)
9803                tsk->sched_class->task_change_group(tsk, type);
9804        else
9805#endif
9806                set_task_rq(tsk, task_cpu(tsk));
9807}
9808
9809/*
9810 * Change task's runqueue when it moves between groups.
9811 *
9812 * The caller of this function should have put the task in its new group by
9813 * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
9814 * its new group.
9815 */
9816void sched_move_task(struct task_struct *tsk)
9817{
9818        int queued, running, queue_flags =
9819                DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
9820        struct rq_flags rf;
9821        struct rq *rq;
9822
9823        rq = task_rq_lock(tsk, &rf);
9824        update_rq_clock(rq);
9825
9826        running = task_current(rq, tsk);
9827        queued = task_on_rq_queued(tsk);
9828
9829        if (queued)
9830                dequeue_task(rq, tsk, queue_flags);
9831        if (running)
9832                put_prev_task(rq, tsk);
9833
9834        sched_change_group(tsk, TASK_MOVE_GROUP);
9835
9836        if (queued)
9837                enqueue_task(rq, tsk, queue_flags);
9838        if (running) {
9839                set_next_task(rq, tsk);
9840                /*
9841                 * After changing group, the running task may have joined a
9842                 * throttled one but it's still the running task. Trigger a
9843                 * resched to make sure that task can still run.
9844                 */
9845                resched_curr(rq);
9846        }
9847
9848        task_rq_unlock(rq, tsk, &rf);
9849}
9850
9851static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
9852{
9853        return css ? container_of(css, struct task_group, css) : NULL;
9854}
9855
9856static struct cgroup_subsys_state *
9857cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
9858{
9859        struct task_group *parent = css_tg(parent_css);
9860        struct task_group *tg;
9861
9862        if (!parent) {
9863                /* This is early initialization for the top cgroup */
9864                return &root_task_group.css;
9865        }
9866
9867        tg = sched_create_group(parent);
9868        if (IS_ERR(tg))
9869                return ERR_PTR(-ENOMEM);
9870
9871        return &tg->css;
9872}
9873
9874/* Expose task group only after completing cgroup initialization */
9875static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
9876{
9877        struct task_group *tg = css_tg(css);
9878        struct task_group *parent = css_tg(css->parent);
9879
9880        if (parent)
9881                sched_online_group(tg, parent);
9882
9883#ifdef CONFIG_UCLAMP_TASK_GROUP
9884        /* Propagate the effective uclamp value for the new group */
9885        mutex_lock(&uclamp_mutex);
9886        rcu_read_lock();
9887        cpu_util_update_eff(css);
9888        rcu_read_unlock();
9889        mutex_unlock(&uclamp_mutex);
9890#endif
9891
9892        return 0;
9893}
9894
9895static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
9896{
9897        struct task_group *tg = css_tg(css);
9898
9899        sched_offline_group(tg);
9900}
9901
9902static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
9903{
9904        struct task_group *tg = css_tg(css);
9905
9906        /*
9907         * Relies on the RCU grace period between css_released() and this.
9908         */
9909        sched_free_group(tg);
9910}
9911
9912/*
9913 * This is called before wake_up_new_task(); therefore we really only
9914 * have to set its group bits, as none of the other state applies yet.
9915 */
9916static void cpu_cgroup_fork(struct task_struct *task)
9917{
9918        struct rq_flags rf;
9919        struct rq *rq;
9920
9921        rq = task_rq_lock(task, &rf);
9922
9923        update_rq_clock(rq);
9924        sched_change_group(task, TASK_SET_GROUP);
9925
9926        task_rq_unlock(rq, task, &rf);
9927}
9928
9929static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
9930{
9931        struct task_struct *task;
9932        struct cgroup_subsys_state *css;
9933        int ret = 0;
9934
9935        cgroup_taskset_for_each(task, css, tset) {
9936#ifdef CONFIG_RT_GROUP_SCHED
9937                if (!sched_rt_can_attach(css_tg(css), task))
9938                        return -EINVAL;
9939#endif
9940                /*
9941                 * Serialize against wake_up_new_task() such that if it's
9942                 * running, we're sure to observe its full state.
9943                 */
9944                raw_spin_lock_irq(&task->pi_lock);
9945                /*
9946                 * Avoid calling sched_move_task() before wake_up_new_task()
9947                 * has happened. This would lead to problems with PELT, due to
9948                 * move wanting to detach+attach while we're not attached yet.
9949                 */
9950                if (READ_ONCE(task->__state) == TASK_NEW)
9951                        ret = -EINVAL;
9952                raw_spin_unlock_irq(&task->pi_lock);
9953
9954                if (ret)
9955                        break;
9956        }
9957        return ret;
9958}
9959
9960static void cpu_cgroup_attach(struct cgroup_taskset *tset)
9961{
9962        struct task_struct *task;
9963        struct cgroup_subsys_state *css;
9964
9965        cgroup_taskset_for_each(task, css, tset)
9966                sched_move_task(task);
9967}
9968
9969#ifdef CONFIG_UCLAMP_TASK_GROUP
9970static void cpu_util_update_eff(struct cgroup_subsys_state *css)
9971{
9972        struct cgroup_subsys_state *top_css = css;
9973        struct uclamp_se *uc_parent = NULL;
9974        struct uclamp_se *uc_se = NULL;
9975        unsigned int eff[UCLAMP_CNT];
9976        enum uclamp_id clamp_id;
9977        unsigned int clamps;
9978
9979        lockdep_assert_held(&uclamp_mutex);
9980        SCHED_WARN_ON(!rcu_read_lock_held());
9981
9982        css_for_each_descendant_pre(css, top_css) {
9983                uc_parent = css_tg(css)->parent
9984                        ? css_tg(css)->parent->uclamp : NULL;
9985
9986                for_each_clamp_id(clamp_id) {
9987                        /* Assume effective clamps match requested clamps */
9988                        eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
9989                        /* Cap effective clamps with parent's effective clamps */
9990                        if (uc_parent &&
9991                            eff[clamp_id] > uc_parent[clamp_id].value) {
9992                                eff[clamp_id] = uc_parent[clamp_id].value;
9993                        }
9994                }
9995                /* Ensure protection is always capped by limit */
9996                eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
9997
9998                /* Propagate most restrictive effective clamps */
9999                clamps = 0x0;
10000                uc_se = css_tg(css)->uclamp;
10001                for_each_clamp_id(clamp_id) {
10002                        if (eff[clamp_id] == uc_se[clamp_id].value)
10003                                continue;
10004                        uc_se[clamp_id].value = eff[clamp_id];
10005                        uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
10006                        clamps |= (0x1 << clamp_id);
10007                }
10008                if (!clamps) {
10009                        css = css_rightmost_descendant(css);
10010                        continue;
10011                }
10012
10013                /* Immediately update descendants' RUNNABLE tasks */
10014                uclamp_update_active_tasks(css);
10015        }
10016}
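/*
 * Illustrative example (not from the original source): if a parent group's
 * effective clamps are uclamp.min = 200 and uclamp.max = 512 and a child
 * requests uclamp.min = 600 and uclamp.max = 768, the walk above caps each
 * of the child's values by the parent's effective value (min -> 200,
 * max -> 512) and then makes sure the protection never exceeds the limit
 * (min(200, 512) = 200). Subtrees whose effective values did not change
 * are skipped via css_rightmost_descendant(); otherwise the group's
 * RUNNABLE tasks are refreshed immediately.
 */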
10017
10018/*
10019 * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
10020 * C expression. Since there is no way to convert a macro argument (N) into a
10021 * character constant, use two levels of macros.
10022 */
10023#define _POW10(exp) ((unsigned int)1e##exp)
10024#define POW10(exp) _POW10(exp)
10025
10026struct uclamp_request {
10027#define UCLAMP_PERCENT_SHIFT    2
10028#define UCLAMP_PERCENT_SCALE    (100 * POW10(UCLAMP_PERCENT_SHIFT))
10029        s64 percent;
10030        u64 util;
10031        int ret;
10032};
10033
10034static inline struct uclamp_request
10035capacity_from_percent(char *buf)
10036{
10037        struct uclamp_request req = {
10038                .percent = UCLAMP_PERCENT_SCALE,
10039                .util = SCHED_CAPACITY_SCALE,
10040                .ret = 0,
10041        };
10042
10043        buf = strim(buf);
10044        if (strcmp(buf, "max")) {
10045                req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
10046                                             &req.percent);
10047                if (req.ret)
10048                        return req;
10049                if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
10050                        req.ret = -ERANGE;
10051                        return req;
10052                }
10053
10054                req.util = req.percent << SCHED_CAPACITY_SHIFT;
10055                req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
10056        }
10057
10058        return req;
10059}
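/*
 * Worked example (illustrative): writing "50" to cpu.uclamp.min arrives
 * here as buf = "50". cgroup_parse_float() with UCLAMP_PERCENT_SHIFT = 2
 * yields req.percent = 5000 (i.e. 50.00%), and the utilization becomes
 *
 *	req.util = (5000 << SCHED_CAPACITY_SHIFT) / UCLAMP_PERCENT_SCALE
 *		 = (5000 << 10) / 10000
 *		 = 512
 *
 * i.e. half of SCHED_CAPACITY_SCALE (1024), rounded to the nearest integer
 * by DIV_ROUND_CLOSEST_ULL(). The literal "max" keeps the defaults of 100%
 * and SCHED_CAPACITY_SCALE.
 */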
10060
10061static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
10062                                size_t nbytes, loff_t off,
10063                                enum uclamp_id clamp_id)
10064{
10065        struct uclamp_request req;
10066        struct task_group *tg;
10067
10068        req = capacity_from_percent(buf);
10069        if (req.ret)
10070                return req.ret;
10071
10072        static_branch_enable(&sched_uclamp_used);
10073
10074        mutex_lock(&uclamp_mutex);
10075        rcu_read_lock();
10076
10077        tg = css_tg(of_css(of));
10078        if (tg->uclamp_req[clamp_id].value != req.util)
10079                uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
10080
10081        /*
10082         * Because the conversion rounding is not recoverable, we keep track
10083         * of the exact requested value.
10084         */
10085        tg->uclamp_pct[clamp_id] = req.percent;
10086
10087        /* Update effective clamps to track the most restrictive value */
10088        cpu_util_update_eff(of_css(of));
10089
10090        rcu_read_unlock();
10091        mutex_unlock(&uclamp_mutex);
10092
10093        return nbytes;
10094}
10095
10096static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
10097                                    char *buf, size_t nbytes,
10098                                    loff_t off)
10099{
10100        return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
10101}
10102
10103static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
10104                                    char *buf, size_t nbytes,
10105                                    loff_t off)
10106{
10107        return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
10108}
10109
10110static inline void cpu_uclamp_print(struct seq_file *sf,
10111                                    enum uclamp_id clamp_id)
10112{
10113        struct task_group *tg;
10114        u64 util_clamp;
10115        u64 percent;
10116        u32 rem;
10117
10118        rcu_read_lock();
10119        tg = css_tg(seq_css(sf));
10120        util_clamp = tg->uclamp_req[clamp_id].value;
10121        rcu_read_unlock();
10122
10123        if (util_clamp == SCHED_CAPACITY_SCALE) {
10124                seq_puts(sf, "max\n");
10125                return;
10126        }
10127
10128        percent = tg->uclamp_pct[clamp_id];
10129        percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
10130        seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
10131}
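/*
 * Illustrative example: if tg->uclamp_pct[clamp_id] is 5000 (50.00% scaled
 * by POW10(UCLAMP_PERCENT_SHIFT) = 100), div_u64_rem(5000, 100, &rem)
 * returns 50 with rem = 0 and the file reads back as "50.00". A request of
 * exactly SCHED_CAPACITY_SCALE is reported as "max" instead.
 */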
10132
10133static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
10134{
10135        cpu_uclamp_print(sf, UCLAMP_MIN);
10136        return 0;
10137}
10138
10139static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
10140{
10141        cpu_uclamp_print(sf, UCLAMP_MAX);
10142        return 0;
10143}
10144#endif /* CONFIG_UCLAMP_TASK_GROUP */
10145
10146#ifdef CONFIG_FAIR_GROUP_SCHED
10147static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
10148                                struct cftype *cftype, u64 shareval)
10149{
10150        if (shareval > scale_load_down(ULONG_MAX))
10151                shareval = MAX_SHARES;
10152        return sched_group_set_shares(css_tg(css), scale_load(shareval));
10153}
10154
10155static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
10156                               struct cftype *cft)
10157{
10158        struct task_group *tg = css_tg(css);
10159
10160        return (u64) scale_load_down(tg->shares);
10161}
10162
10163#ifdef CONFIG_CFS_BANDWIDTH
10164static DEFINE_MUTEX(cfs_constraints_mutex);
10165
10166const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
10167static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
10168/* More than 203 days if BW_SHIFT equals 20. */
10169static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
10170
10171static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
10172
10173static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
10174                                u64 burst)
10175{
10176        int i, ret = 0, runtime_enabled, runtime_was_enabled;
10177        struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
10178
10179        if (tg == &root_task_group)
10180                return -EINVAL;
10181
10182        /*
10183         * Ensure we have at least some amount of bandwidth every period.  This
10184         * is to prevent reaching a state of large arrears when throttled via
10185         * entity_tick(), resulting in prolonged exit starvation.
10186         */
10187        if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
10188                return -EINVAL;
10189
10190        /*
10191         * Likewise, bound things on the other side by preventing insane quota
10192         * periods.  This also allows us to normalize in computing quota
10193         * feasibility.
10194         */
10195        if (period > max_cfs_quota_period)
10196                return -EINVAL;
10197
10198        /*
10199         * Bound the quota to defend against overflow during the bandwidth shift.
10200         */
10201        if (quota != RUNTIME_INF && quota > max_cfs_runtime)
10202                return -EINVAL;
10203
10204        if (quota != RUNTIME_INF && (burst > quota ||
10205                                     burst + quota > max_cfs_runtime))
10206                return -EINVAL;
10207
10208        /*
10209         * Prevent race between setting of cfs_rq->runtime_enabled and
10210         * unthrottle_offline_cfs_rqs().
10211         */
10212        cpus_read_lock();
10213        mutex_lock(&cfs_constraints_mutex);
10214        ret = __cfs_schedulable(tg, period, quota);
10215        if (ret)
10216                goto out_unlock;
10217
10218        runtime_enabled = quota != RUNTIME_INF;
10219        runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
10220        /*
10221         * If we need to toggle cfs_bandwidth_used, off->on must occur
10222         * before making related changes, and on->off must occur afterwards
10223         */
10224        if (runtime_enabled && !runtime_was_enabled)
10225                cfs_bandwidth_usage_inc();
10226        raw_spin_lock_irq(&cfs_b->lock);
10227        cfs_b->period = ns_to_ktime(period);
10228        cfs_b->quota = quota;
10229        cfs_b->burst = burst;
10230
10231        __refill_cfs_bandwidth_runtime(cfs_b);
10232
10233        /* Restart the period timer (if active) to handle new period expiry: */
10234        if (runtime_enabled)
10235                start_cfs_bandwidth(cfs_b);
10236
10237        raw_spin_unlock_irq(&cfs_b->lock);
10238
10239        for_each_online_cpu(i) {
10240                struct cfs_rq *cfs_rq = tg->cfs_rq[i];
10241                struct rq *rq = cfs_rq->rq;
10242                struct rq_flags rf;
10243
10244                rq_lock_irq(rq, &rf);
10245                cfs_rq->runtime_enabled = runtime_enabled;
10246                cfs_rq->runtime_remaining = 0;
10247
10248                if (cfs_rq->throttled)
10249                        unthrottle_cfs_rq(cfs_rq);
10250                rq_unlock_irq(rq, &rf);
10251        }
10252        if (runtime_was_enabled && !runtime_enabled)
10253                cfs_bandwidth_usage_dec();
10254out_unlock:
10255        mutex_unlock(&cfs_constraints_mutex);
10256        cpus_read_unlock();
10257
10258        return ret;
10259}
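/*
 * Illustrative example of the validation above: quota = 50 ms,
 * period = 100 ms, burst = 0 is accepted (both values are at least the
 * 1 ms minimum and the period is within the 1 s maximum), whereas a 500 us
 * period, a quota beyond max_cfs_runtime (~203 days), or a burst larger
 * than the quota is rejected with -EINVAL.
 */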
10260
10261static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
10262{
10263        u64 quota, period, burst;
10264
10265        period = ktime_to_ns(tg->cfs_bandwidth.period);
10266        burst = tg->cfs_bandwidth.burst;
10267        if (cfs_quota_us < 0)
10268                quota = RUNTIME_INF;
10269        else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
10270                quota = (u64)cfs_quota_us * NSEC_PER_USEC;
10271        else
10272                return -EINVAL;
10273
10274        return tg_set_cfs_bandwidth(tg, period, quota, burst);
10275}
10276
10277static long tg_get_cfs_quota(struct task_group *tg)
10278{
10279        u64 quota_us;
10280
10281        if (tg->cfs_bandwidth.quota == RUNTIME_INF)
10282                return -1;
10283
10284        quota_us = tg->cfs_bandwidth.quota;
10285        do_div(quota_us, NSEC_PER_USEC);
10286
10287        return quota_us;
10288}
10289
10290static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
10291{
10292        u64 quota, period, burst;
10293
10294        if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
10295                return -EINVAL;
10296
10297        period = (u64)cfs_period_us * NSEC_PER_USEC;
10298        quota = tg->cfs_bandwidth.quota;
10299        burst = tg->cfs_bandwidth.burst;
10300
10301        return tg_set_cfs_bandwidth(tg, period, quota, burst);
10302}
10303
10304static long tg_get_cfs_period(struct task_group *tg)
10305{
10306        u64 cfs_period_us;
10307
10308        cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
10309        do_div(cfs_period_us, NSEC_PER_USEC);
10310
10311        return cfs_period_us;
10312}
10313
10314static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us)
10315{
10316        u64 quota, period, burst;
10317
10318        if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC)
10319                return -EINVAL;
10320
10321        burst = (u64)cfs_burst_us * NSEC_PER_USEC;
10322        period = ktime_to_ns(tg->cfs_bandwidth.period);
10323        quota = tg->cfs_bandwidth.quota;
10324
10325        return tg_set_cfs_bandwidth(tg, period, quota, burst);
10326}
10327
10328static long tg_get_cfs_burst(struct task_group *tg)
10329{
10330        u64 burst_us;
10331
10332        burst_us = tg->cfs_bandwidth.burst;
10333        do_div(burst_us, NSEC_PER_USEC);
10334
10335        return burst_us;
10336}
10337
10338static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
10339                                  struct cftype *cft)
10340{
10341        return tg_get_cfs_quota(css_tg(css));
10342}
10343
10344static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
10345                                   struct cftype *cftype, s64 cfs_quota_us)
10346{
10347        return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
10348}
10349
10350static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
10351                                   struct cftype *cft)
10352{
10353        return tg_get_cfs_period(css_tg(css));
10354}
10355
10356static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
10357                                    struct cftype *cftype, u64 cfs_period_us)
10358{
10359        return tg_set_cfs_period(css_tg(css), cfs_period_us);
10360}
10361
10362static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
10363                                  struct cftype *cft)
10364{
10365        return tg_get_cfs_burst(css_tg(css));
10366}
10367
10368static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
10369                                   struct cftype *cftype, u64 cfs_burst_us)
10370{
10371        return tg_set_cfs_burst(css_tg(css), cfs_burst_us);
10372}
10373
10374struct cfs_schedulable_data {
10375        struct task_group *tg;
10376        u64 period, quota;
10377};
10378
10379/*
10380 * normalize group quota/period to be quota/max_period
10381 * note: units are usecs
10382 */
10383static u64 normalize_cfs_quota(struct task_group *tg,
10384                               struct cfs_schedulable_data *d)
10385{
10386        u64 quota, period;
10387
10388        if (tg == d->tg) {
10389                period = d->period;
10390                quota = d->quota;
10391        } else {
10392                period = tg_get_cfs_period(tg);
10393                quota = tg_get_cfs_quota(tg);
10394        }
10395
10396        /* note: these should typically be equivalent */
10397        if (quota == RUNTIME_INF || quota == -1)
10398                return RUNTIME_INF;
10399
10400        return to_ratio(period, quota);
10401}
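/*
 * Worked example (illustrative; assumes to_ratio() computes
 * (quota << BW_SHIFT) / period with BW_SHIFT = 20, as hinted at by the
 * max_cfs_runtime comment above): period = 100000 us and quota = 50000 us
 * give (50000 << 20) / 100000 = 524288, i.e. half of 2^20, meaning half a
 * CPU's worth of bandwidth in fixed point. This is the value that
 * tg_cfs_schedulable_down() compares against the parent's
 * hierarchical_quota.
 */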
10402
10403static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
10404{
10405        struct cfs_schedulable_data *d = data;
10406        struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
10407        s64 quota = 0, parent_quota = -1;
10408
10409        if (!tg->parent) {
10410                quota = RUNTIME_INF;
10411        } else {
10412                struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
10413
10414                quota = normalize_cfs_quota(tg, d);
10415                parent_quota = parent_b->hierarchical_quota;
10416
10417                /*
10418                 * Ensure max(child_quota) <= parent_quota.  On cgroup2,
10419                 * always take the min.  On cgroup1, only inherit when no
10420                 * limit is set:
10421                 */
10422                if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
10423                        quota = min(quota, parent_quota);
10424                } else {
10425                        if (quota == RUNTIME_INF)
10426                                quota = parent_quota;
10427                        else if (parent_quota != RUNTIME_INF && quota > parent_quota)
10428                                return -EINVAL;
10429                }
10430        }
10431        cfs_b->hierarchical_quota = quota;
10432
10433        return 0;
10434}
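/*
 * Illustrative example: on cgroup2 a child asking for 0.75 CPUs under a
 * parent limited to 0.5 CPUs simply has its hierarchical_quota clamped to
 * the parent's 0.5; on cgroup1 the same configuration is rejected with
 * -EINVAL, and an unlimited (RUNTIME_INF) child instead inherits the
 * parent's quota.
 */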
10435
10436static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
10437{
10438        int ret;
10439        struct cfs_schedulable_data data = {
10440                .tg = tg,
10441                .period = period,
10442                .quota = quota,
10443        };
10444
10445        if (quota != RUNTIME_INF) {
10446                do_div(data.period, NSEC_PER_USEC);
10447                do_div(data.quota, NSEC_PER_USEC);
10448        }
10449
10450        rcu_read_lock();
10451        ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
10452        rcu_read_unlock();
10453
10454        return ret;
10455}
10456
10457static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
10458{
10459        struct task_group *tg = css_tg(seq_css(sf));
10460        struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
10461
10462        seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
10463        seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
10464        seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
10465
10466        if (schedstat_enabled() && tg != &root_task_group) {
10467                u64 ws = 0;
10468                int i;
10469
10470                for_each_possible_cpu(i)
10471                        ws += schedstat_val(tg->se[i]->statistics.wait_sum);
10472
10473                seq_printf(sf, "wait_sum %llu\n", ws);
10474        }
10475
10476        return 0;
10477}
10478#endif /* CONFIG_CFS_BANDWIDTH */
10479#endif /* CONFIG_FAIR_GROUP_SCHED */
10480
10481#ifdef CONFIG_RT_GROUP_SCHED
10482static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
10483                                struct cftype *cft, s64 val)
10484{
10485        return sched_group_set_rt_runtime(css_tg(css), val);
10486}
10487
10488static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
10489                               struct cftype *cft)
10490{
10491        return sched_group_rt_runtime(css_tg(css));
10492}
10493
10494static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
10495                                    struct cftype *cftype, u64 rt_period_us)
10496{
10497        return sched_group_set_rt_period(css_tg(css), rt_period_us);
10498}
10499
10500static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
10501                                   struct cftype *cft)
10502{
10503        return sched_group_rt_period(css_tg(css));
10504}
10505#endif /* CONFIG_RT_GROUP_SCHED */
10506
10507#ifdef CONFIG_FAIR_GROUP_SCHED
10508static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
10509                               struct cftype *cft)
10510{
10511        return css_tg(css)->idle;
10512}
10513
10514static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
10515                                struct cftype *cft, s64 idle)
10516{
10517        return sched_group_set_idle(css_tg(css), idle);
10518}
10519#endif
10520
10521static struct cftype cpu_legacy_files[] = {
10522#ifdef CONFIG_FAIR_GROUP_SCHED
10523        {
10524                .name = "shares",
10525                .read_u64 = cpu_shares_read_u64,
10526                .write_u64 = cpu_shares_write_u64,
10527        },
10528        {
10529                .name = "idle",
10530                .read_s64 = cpu_idle_read_s64,
10531                .write_s64 = cpu_idle_write_s64,
10532        },
10533#endif
10534#ifdef CONFIG_CFS_BANDWIDTH
10535        {
10536                .name = "cfs_quota_us",
10537                .read_s64 = cpu_cfs_quota_read_s64,
10538                .write_s64 = cpu_cfs_quota_write_s64,
10539        },
10540        {
10541                .name = "cfs_period_us",
10542                .read_u64 = cpu_cfs_period_read_u64,
10543                .write_u64 = cpu_cfs_period_write_u64,
10544        },
10545        {
10546                .name = "cfs_burst_us",
10547                .read_u64 = cpu_cfs_burst_read_u64,
10548                .write_u64 = cpu_cfs_burst_write_u64,
10549        },
10550        {
10551                .name = "stat",
10552                .seq_show = cpu_cfs_stat_show,
10553        },
10554#endif
10555#ifdef CONFIG_RT_GROUP_SCHED
10556        {
10557                .name = "rt_runtime_us",
10558                .read_s64 = cpu_rt_runtime_read,
10559                .write_s64 = cpu_rt_runtime_write,
10560        },
10561        {
10562                .name = "rt_period_us",
10563                .read_u64 = cpu_rt_period_read_uint,
10564                .write_u64 = cpu_rt_period_write_uint,
10565        },
10566#endif
10567#ifdef CONFIG_UCLAMP_TASK_GROUP
10568        {
10569                .name = "uclamp.min",
10570                .flags = CFTYPE_NOT_ON_ROOT,
10571                .seq_show = cpu_uclamp_min_show,
10572                .write = cpu_uclamp_min_write,
10573        },
10574        {
10575                .name = "uclamp.max",
10576                .flags = CFTYPE_NOT_ON_ROOT,
10577                .seq_show = cpu_uclamp_max_show,
10578                .write = cpu_uclamp_max_write,
10579        },
10580#endif
10581        { }     /* Terminate */
10582};
10583
10584static int cpu_extra_stat_show(struct seq_file *sf,
10585                               struct cgroup_subsys_state *css)
10586{
10587#ifdef CONFIG_CFS_BANDWIDTH
10588        {
10589                struct task_group *tg = css_tg(css);
10590                struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
10591                u64 throttled_usec;
10592
10593                throttled_usec = cfs_b->throttled_time;
10594                do_div(throttled_usec, NSEC_PER_USEC);
10595
10596                seq_printf(sf, "nr_periods %d\n"
10597                           "nr_throttled %d\n"
10598                           "throttled_usec %llu\n",
10599                           cfs_b->nr_periods, cfs_b->nr_throttled,
10600                           throttled_usec);
10601        }
10602#endif
10603        return 0;
10604}
10605
10606#ifdef CONFIG_FAIR_GROUP_SCHED
10607static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
10608                               struct cftype *cft)
10609{
10610        struct task_group *tg = css_tg(css);
10611        u64 weight = scale_load_down(tg->shares);
10612
10613        return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
10614}
10615
10616static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
10617                                struct cftype *cft, u64 weight)
10618{
10619        /*
10620         * cgroup weight knobs should use the common MIN, DFL and MAX
10621         * values which are 1, 100 and 10000 respectively.  While it loses
10622         * a bit of range on both ends, it maps pretty well onto the shares
10623         * value used by the scheduler, and the round-trip conversions preserve
10624         * the original value over the entire range.
10625         */
10626        if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
10627                return -ERANGE;
10628
10629        weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
10630
10631        return sched_group_set_shares(css_tg(css), scale_load(weight));
10632}
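/*
 * Worked example (illustrative): cpu.weight uses CGROUP_WEIGHT_MIN = 1,
 * CGROUP_WEIGHT_DFL = 100 and CGROUP_WEIGHT_MAX = 10000. Writing 100 maps
 * to DIV_ROUND_CLOSEST_ULL(100 * 1024, 100) = 1024 shares (the default),
 * writing 200 maps to 2048, and reading 1024 shares back gives
 * DIV_ROUND_CLOSEST_ULL(1024 * 100, 1024) = 100, so the round trip
 * preserves the value.
 */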
10633
10634static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
10635                                    struct cftype *cft)
10636{
10637        unsigned long weight = scale_load_down(css_tg(css)->shares);
10638        int last_delta = INT_MAX;
10639        int prio, delta;
10640
10641        /* find the closest nice value to the current weight */
10642        for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
10643                delta = abs(sched_prio_to_weight[prio] - weight);
10644                if (delta >= last_delta)
10645                        break;
10646                last_delta = delta;
10647        }
10648
10649        return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
10650}
10651
10652static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
10653                                     struct cftype *cft, s64 nice)
10654{
10655        unsigned long weight;
10656        int idx;
10657
10658        if (nice < MIN_NICE || nice > MAX_NICE)
10659                return -ERANGE;
10660
10661        idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
10662        idx = array_index_nospec(idx, 40);
10663        weight = sched_prio_to_weight[idx];
10664
10665        return sched_group_set_shares(css_tg(css), scale_load(weight));
10666}
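/*
 * Worked example (illustrative): writing nice = 0 gives
 * idx = NICE_TO_PRIO(0) - MAX_RT_PRIO = 120 - 100 = 20 and therefore
 * sched_prio_to_weight[20] = 1024 shares; nice = -10 maps to index 10 and
 * a weight of 9548. On the read side, cpu_weight_nice_read_s64() picks the
 * table entry closest to the current weight, so 1024 shares reads back as
 * nice 0.
 */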
10667#endif
10668
10669static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
10670                                                  long period, long quota)
10671{
10672        if (quota < 0)
10673                seq_puts(sf, "max");
10674        else
10675                seq_printf(sf, "%ld", quota);
10676
10677        seq_printf(sf, " %ld\n", period);
10678}
10679
10680/* caller should put the current value in *@periodp before calling */
10681static int __maybe_unused cpu_period_quota_parse(char *buf,
10682                                                 u64 *periodp, u64 *quotap)
10683{
10684        char tok[21];   /* U64_MAX */
10685
10686        if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
10687                return -EINVAL;
10688
10689        *periodp *= NSEC_PER_USEC;
10690
10691        if (sscanf(tok, "%llu", quotap))
10692                *quotap *= NSEC_PER_USEC;
10693        else if (!strcmp(tok, "max"))
10694                *quotap = RUNTIME_INF;
10695        else
10696                return -EINVAL;
10697
10698        return 0;
10699}
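/*
 * Illustrative example: writing "50000 100000" to cpu.max parses as
 * quota = 50000 us and period = 100000 us, both converted to nanoseconds
 * here; writing just "max" sets the quota to RUNTIME_INF and keeps the
 * period the caller preloaded into *periodp (likewise converted to
 * nanoseconds).
 */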
10700
10701#ifdef CONFIG_CFS_BANDWIDTH
10702static int cpu_max_show(struct seq_file *sf, void *v)
10703{
10704        struct task_group *tg = css_tg(seq_css(sf));
10705
10706        cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
10707        return 0;
10708}
10709
10710static ssize_t cpu_max_write(struct kernfs_open_file *of,
10711                             char *buf, size_t nbytes, loff_t off)
10712{
10713        struct task_group *tg = css_tg(of_css(of));
10714        u64 period = tg_get_cfs_period(tg);
10715        u64 burst = tg_get_cfs_burst(tg);
10716        u64 quota;
10717        int ret;
10718
10719        ret = cpu_period_quota_parse(buf, &period, &quota);
10720        if (!ret)
10721                ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
10722        return ret ?: nbytes;
10723}
10724#endif
10725
10726static struct cftype cpu_files[] = {
10727#ifdef CONFIG_FAIR_GROUP_SCHED
10728        {
10729                .name = "weight",
10730                .flags = CFTYPE_NOT_ON_ROOT,
10731                .read_u64 = cpu_weight_read_u64,
10732                .write_u64 = cpu_weight_write_u64,
10733        },
10734        {
10735                .name = "weight.nice",
10736                .flags = CFTYPE_NOT_ON_ROOT,
10737                .read_s64 = cpu_weight_nice_read_s64,
10738                .write_s64 = cpu_weight_nice_write_s64,
10739        },
10740        {
10741                .name = "idle",
10742                .flags = CFTYPE_NOT_ON_ROOT,
10743                .read_s64 = cpu_idle_read_s64,
10744                .write_s64 = cpu_idle_write_s64,
10745        },
10746#endif
10747#ifdef CONFIG_CFS_BANDWIDTH
10748        {
10749                .name = "max",
10750                .flags = CFTYPE_NOT_ON_ROOT,
10751                .seq_show = cpu_max_show,
10752                .write = cpu_max_write,
10753        },
10754        {
10755                .name = "max.burst",
10756                .flags = CFTYPE_NOT_ON_ROOT,
10757                .read_u64 = cpu_cfs_burst_read_u64,
10758                .write_u64 = cpu_cfs_burst_write_u64,
10759        },
10760#endif
10761#ifdef CONFIG_UCLAMP_TASK_GROUP
10762        {
10763                .name = "uclamp.min",
10764                .flags = CFTYPE_NOT_ON_ROOT,
10765                .seq_show = cpu_uclamp_min_show,
10766                .write = cpu_uclamp_min_write,
10767        },
10768        {
10769                .name = "uclamp.max",
10770                .flags = CFTYPE_NOT_ON_ROOT,
10771                .seq_show = cpu_uclamp_max_show,
10772                .write = cpu_uclamp_max_write,
10773        },
10774#endif
10775        { }     /* terminate */
10776};
10777
10778struct cgroup_subsys cpu_cgrp_subsys = {
10779        .css_alloc      = cpu_cgroup_css_alloc,
10780        .css_online     = cpu_cgroup_css_online,
10781        .css_released   = cpu_cgroup_css_released,
10782        .css_free       = cpu_cgroup_css_free,
10783        .css_extra_stat_show = cpu_extra_stat_show,
10784        .fork           = cpu_cgroup_fork,
10785        .can_attach     = cpu_cgroup_can_attach,
10786        .attach         = cpu_cgroup_attach,
10787        .legacy_cftypes = cpu_legacy_files,
10788        .dfl_cftypes    = cpu_files,
10789        .early_init     = true,
10790        .threaded       = true,
10791};
10792
10793#endif  /* CONFIG_CGROUP_SCHED */
10794
10795void dump_cpu_task(int cpu)
10796{
10797        pr_info("Task dump for CPU %d:\n", cpu);
10798        sched_show_task(cpu_curr(cpu));
10799}
10800
10801/*
10802 * Nice levels are multiplicative, with a gentle 10% change for every
10803 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
10804 * nice 1, it will get ~10% less CPU time than another CPU-bound task
10805 * that remained on nice 0.
10806 *
10807 * The "10% effect" is relative and cumulative: from _any_ nice level,
10808 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
10809 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
10810 * If a task goes up by ~10% and another task goes down by ~10% then
10811 * the relative distance between them is ~25%.)
10812 */
10813const int sched_prio_to_weight[40] = {
10814 /* -20 */     88761,     71755,     56483,     46273,     36291,
10815 /* -15 */     29154,     23254,     18705,     14949,     11916,
10816 /* -10 */      9548,      7620,      6100,      4904,      3906,
10817 /*  -5 */      3121,      2501,      1991,      1586,      1277,
10818 /*   0 */      1024,       820,       655,       526,       423,
10819 /*   5 */       335,       272,       215,       172,       137,
10820 /*  10 */       110,        87,        70,        56,        45,
10821 /*  15 */        36,        29,        23,        18,        15,
10822};
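/*
 * Illustrative arithmetic for the "10% effect": two nice-0 tasks each get
 * 1024 / (1024 + 1024) = 50% of the CPU; renicing one of them to +1 drops
 * it to 820 / (1024 + 820) ~= 44.5% while the other rises to ~55.5%. The
 * ~25% relative gap comes from the ~1.25 step between adjacent entries
 * (1024 / 820 ~= 1.25).
 */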
10823
10824/*
10825 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
10826 *
10827 * In cases where the weight does not change often, we can use the
10828 * precalculated inverse to speed up arithmetics by turning divisions
10829 * into multiplications:
10830 */
10831const u32 sched_prio_to_wmult[40] = {
10832 /* -20 */     48388,     59856,     76040,     92818,    118348,
10833 /* -15 */    147320,    184698,    229616,    287308,    360437,
10834 /* -10 */    449829,    563644,    704093,    875809,   1099582,
10835 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
10836 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
10837 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
10838 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
10839 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
10840};
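/*
 * Illustrative example: sched_prio_to_wmult[20] = 4194304 = 2^32 / 1024,
 * the inverse of the nice-0 weight above. With the inverse precomputed, a
 * division such as delta * 1024 / weight can be evaluated as
 * (delta * 1024 * wmult) >> 32, using only multiplies and a shift; this is
 * essentially how the fair class scales runtime deltas (see __calc_delta()
 * in fair.c).
 */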
10841
10842void call_trace_sched_update_nr_running(struct rq *rq, int count)
10843{
10844        trace_sched_update_nr_running_tp(rq, count);
10845}
10846