linux/kernel/sched/pelt.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Per Entity Load Tracking
   4 *
   5 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   6 *
   7 *  Interactivity improvements by Mike Galbraith
   8 *  (C) 2007 Mike Galbraith <efault@gmx.de>
   9 *
  10 *  Various enhancements by Dmitry Adamushko.
  11 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  12 *
  13 *  Group scheduling enhancements by Srivatsa Vaddagiri
  14 *  Copyright IBM Corporation, 2007
  15 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  16 *
  17 *  Scaled math optimizations by Thomas Gleixner
  18 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  19 *
  20 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  21 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  22 *
  23 *  Move PELT related code from fair.c into this pelt.c file
  24 *  Author: Vincent Guittot <vincent.guittot@linaro.org>
  25 */
  26
  27/*
  28 * Approximate:
  29 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
  30 */
  31static u64 decay_load(u64 val, u64 n)
  32{
  33        unsigned int local_n;
  34
  35        if (unlikely(n > LOAD_AVG_PERIOD * 63))
  36                return 0;
  37
  38        /* after bounds checking we can collapse to 32-bit */
  39        local_n = n;
  40
  41        /*
  42         * As y^PERIOD = 1/2, we can combine
  43         *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
  44         * with a look-up table that covers y^n (n < PERIOD)
  45         *
  46         * to achieve constant-time decay_load().
  47         */
  48        if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
  49                val >>= local_n / LOAD_AVG_PERIOD;
  50                local_n %= LOAD_AVG_PERIOD;
  51        }
  52
  53        val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
  54        return val;
  55}
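/*
 * Illustrative user-space sketch (not part of pelt.c): how the shift plus
 * table-lookup decomposition above approximates val * y^n with y^32 = 0.5.
 * The kernel reads the precomputed runnable_avg_yN_inv[] table; this sketch
 * derives the per-step factor at run time instead and assumes
 * LOAD_AVG_PERIOD == 32.  Compile stand-alone (with -lm).
 */
#include <stdint.h>
#include <stdio.h>
#include <math.h>

#define EX_LOAD_AVG_PERIOD 32

static uint64_t ex_decay_load(uint64_t val, uint64_t n)
{
        unsigned int local_n;
        uint32_t inv;

        if (n > EX_LOAD_AVG_PERIOD * 63)
                return 0;

        local_n = n;
        /* y^n = (1/2)^(n/PERIOD) * y^(n%PERIOD): the power-of-two part is a shift */
        if (local_n >= EX_LOAD_AVG_PERIOD) {
                val >>= local_n / EX_LOAD_AVG_PERIOD;
                local_n %= EX_LOAD_AVG_PERIOD;
        }

        /* stand-in for runnable_avg_yN_inv[local_n] ~= 2^32 * y^local_n */
        inv = (uint32_t)(pow(0.5, (double)local_n / EX_LOAD_AVG_PERIOD) * 4294967295.0);

        /* like mul_u64_u32_shr(val, inv, 32); fine for the small values used here */
        return (val * (uint64_t)inv) >> 32;
}

int main(void)
{
        /* after 32 periods (~32ms) a contribution of 1024 decays to about half */
        printf("%llu\n", (unsigned long long)ex_decay_load(1024, 32));
        /* after 5 periods: roughly 1024 * 0.5^(5/32), i.e. ~90% remains */
        printf("%llu\n", (unsigned long long)ex_decay_load(1024, 5));
        return 0;
}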
  56
  57static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
  58{
  59        u32 c1, c2, c3 = d3; /* y^0 == 1 */
  60
  61        /*
  62         * c1 = d1 y^p
  63         */
  64        c1 = decay_load((u64)d1, periods);
  65
  66        /*
  67         *            p-1
  68         * c2 = 1024 \Sum y^n
  69         *            n=1
  70         *
  71         *              inf        inf
  72         *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
  73         *              n=0        n=p
  74         */
  75        c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
  76
  77        return c1 + c2 + c3;
  78}
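/*
 * Illustrative user-space sketch (not part of pelt.c): a numerical check of
 * the closed form used for c2 above.  With y^32 = 0.5, LOAD_AVG_MAX is the
 * infinite series 1024 * \Sum_{n>=0} y^n (about 47742 in the kernel's integer
 * arithmetic; the ideal real-valued limit is used here), so subtracting the
 * decayed tail and the y^0 term leaves exactly 1024 * \Sum_{n=1..p-1} y^n.
 */
#include <stdio.h>
#include <math.h>

int main(void)
{
        const double y = pow(0.5, 1.0 / 32.0);
        const double load_avg_max = 1024.0 / (1.0 - y);  /* infinite series */
        const int p = 7;                                  /* arbitrary example */
        double direct = 0.0, closed;
        int n;

        for (n = 1; n <= p - 1; n++)
                direct += 1024.0 * pow(y, n);

        /* c2 = LOAD_AVG_MAX - LOAD_AVG_MAX * y^p - 1024 */
        closed = load_avg_max - load_avg_max * pow(y, p) - 1024.0;

        printf("direct=%.3f closed=%.3f\n", direct, closed);
        return 0;
}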
  79
  80/*
  81 * Accumulate the three separate parts of the sum; d1 the remainder
  82 * of the last (incomplete) period, d2 the span of full periods and d3
  83 * the remainder of the (incomplete) current period.
  84 *
  85 *           d1          d2           d3
  86 *           ^           ^            ^
  87 *           |           |            |
  88 *         |<->|<----------------->|<--->|
  89 * ... |---x---|------| ... |------|-----x (now)
  90 *
  91 *                           p-1
  92 * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
  93 *                           n=1
  94 *
  95 *    = u y^p +                                 (Step 1)
  96 *
  97 *                     p-1
  98 *      d1 y^p + 1024 \Sum y^n + d3 y^0         (Step 2)
  99 *                     n=1
 100 */
 101static __always_inline u32
 102accumulate_sum(u64 delta, struct sched_avg *sa,
 103               unsigned long load, unsigned long runnable, int running)
 104{
 105        u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
 106        u64 periods;
 107
 108        delta += sa->period_contrib;
 109        periods = delta / 1024; /* A period is 1024us (~1ms) */
 110
 111        /*
 112         * Step 1: decay old *_sum if we crossed period boundaries.
 113         */
 114        if (periods) {
 115                sa->load_sum = decay_load(sa->load_sum, periods);
 116                sa->runnable_sum =
 117                        decay_load(sa->runnable_sum, periods);
 118                sa->util_sum = decay_load((u64)(sa->util_sum), periods);
 119
 120                /*
 121                 * Step 2
 122                 */
 123                delta %= 1024;
 124                if (load) {
 125                        /*
 126                         * This relies on the:
 127                         *
 128                         * if (!load)
 129                         *      runnable = running = 0;
 130                         *
 131                         * clause from ___update_load_sum(); this makes
 132                         * the use of @contrib below disappear entirely,
 133                         * so there is no point in calculating it.
 134                         */
 135                        contrib = __accumulate_pelt_segments(periods,
 136                                        1024 - sa->period_contrib, delta);
 137                }
 138        }
 139        sa->period_contrib = delta;
 140
 141        if (load)
 142                sa->load_sum += load * contrib;
 143        if (runnable)
 144                sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT;
 145        if (running)
 146                sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
 147
 148        return periods;
 149}
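/*
 * Illustrative user-space sketch (not part of pelt.c): a worked example of
 * the d1/d2/d3 split performed by accumulate_sum() above.  It assumes
 * y^32 = 0.5 and uses real-valued math rather than the kernel's fixed-point
 * helpers, so the result only approximates what the integer code produces.
 */
#include <stdio.h>
#include <math.h>

int main(void)
{
        const double y = pow(0.5, 1.0 / 32.0);
        unsigned int period_contrib = 300;  /* us already consumed of the old p0 */
        unsigned int delta = 3000;          /* new us elapsed since the last update */
        unsigned int periods, d1, d3;
        double c1, c2, c3, contrib;
        unsigned int n;

        delta += period_contrib;            /* 3300 */
        periods = delta / 1024;             /* 3 period boundaries crossed */
        d1 = 1024 - period_contrib;         /* 724 us finishing the old period */
        d3 = delta % 1024;                  /* 228 us into the current period */

        c1 = d1 * pow(y, periods);          /* old remainder, decayed p times */
        c2 = 0.0;                           /* the full periods in between */
        for (n = 1; n <= periods - 1; n++)
                c2 += 1024.0 * pow(y, n);
        c3 = d3;                            /* current remainder, undecayed (y^0) */

        contrib = c1 + c2 + c3;
        printf("periods=%u d1=%u d3=%u contrib=%.1f\n", periods, d1, d3, contrib);
        return 0;
}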
 150
 151/*
 152 * We can represent the historical contribution to runnable average as the
 153 * coefficients of a geometric series.  To do this we sub-divide our runnable
 154 * history into segments of approximately 1ms (1024us); label the segment that
 155 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 156 *
 157 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 158 *      p0            p1           p2
 159 *     (now)       (~1ms ago)  (~2ms ago)
 160 *
 161 * Let u_i denote the fraction of p_i that the entity was runnable.
 162 *
 163 * We then designate the fractions u_i as our coefficients, yielding the
 164 * following representation of historical load:
 165 *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 166 *
 167 * We choose y based on the width of a reasonable scheduling period, fixing:
 168 *   y^32 = 0.5
 169 *
 170 * This means that the contribution to load ~32ms ago (u_32) will be weighted
 171 * approximately half as much as the contribution to load within the last ms
 172 * (u_0).
 173 *
 174 * When a period "rolls over" and we have new u_0`, multiplying the previous
 175 * sum again by y is sufficient to update:
 176 *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 177 *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
 178 */
 179static __always_inline int
 180___update_load_sum(u64 now, struct sched_avg *sa,
 181                  unsigned long load, unsigned long runnable, int running)
 182{
 183        u64 delta;
 184
 185        delta = now - sa->last_update_time;
 186        /*
 187         * This should only happen when time goes backwards, which it
 188         * unfortunately does during sched clock init when we swap over to TSC.
 189         */
 190        if ((s64)delta < 0) {
 191                sa->last_update_time = now;
 192                return 0;
 193        }
 194
 195        /*
 196         * Use 1024ns as the unit of measurement since it's a reasonable
 197         * approximation of 1us and fast to compute.
 198         */
 199        delta >>= 10;
 200        if (!delta)
 201                return 0;
 202
 203        sa->last_update_time += delta << 10;
 204
 205        /*
 206         * running is a subset of runnable (weight) so running can't be set if
 207         * runnable is clear. But there are some corner cases where the current
 208         * se has been already dequeued but cfs_rq->curr still points to it.
 209         * This means that weight can be 0 while running is still set, both for
 210         * a sched_entity and for a cfs_rq if the latter becomes idle. For example,
 211         * this happens during idle_balance() which calls
 212         * update_blocked_averages().
 213         *
 214         * Also see the comment in accumulate_sum().
 215         */
 216        if (!load)
 217                runnable = running = 0;
 218
 219        /*
 220         * Now we know we crossed measurement unit boundaries. The *_avg
 221         * accrues by two steps:
 222         *
 223         * Step 1: accumulate *_sum since last_update_time. If we haven't
 224         * crossed period boundaries, finish.
 225         */
 226        if (!accumulate_sum(delta, sa, load, runnable, running))
 227                return 0;
 228
 229        return 1;
 230}
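/*
 * Illustrative user-space sketch (not part of pelt.c): the rolling update
 * described in the geometric-series comment above.  Each time a 1024us
 * period rolls over, multiplying the accumulated sum by y and adding the new
 * u_0 re-labels every old u_i as u_{i+1}.  For an entity that is runnable
 * 100% of the time (u_0 = 1024 every period) the sum converges to the series
 * limit 1024 / (1 - y), which the kernel's integer arithmetic approximates
 * as LOAD_AVG_MAX.
 */
#include <stdio.h>
#include <math.h>

int main(void)
{
        const double y = pow(0.5, 1.0 / 32.0);
        double sum = 0.0;
        int period;

        for (period = 1; period <= 320; period++) {
                sum = 1024.0 + y * sum;         /* u_0 + y * (old history) */
                if (period % 32 == 0)
                        printf("after %3d periods (~%3dms): %.0f\n",
                               period, period, sum);
        }
        return 0;
}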
 231
 232/*
 233 * When syncing *_avg with *_sum, we must take into account the current
 234 * position in the PELT segment otherwise the remaining part of the segment
 235 * will be considered as idle time whereas it's not yet elapsed and this will
 236 * generate unwanted oscillation in the range [1002..1024[.
 237 *
 238 * The max value of *_sum varies with the position in the time segment and is
 239 * equal to:
 240 *
 241 *   LOAD_AVG_MAX*y + sa->period_contrib
 242 *
 243 * which can be simplified into:
 244 *
 245 *   LOAD_AVG_MAX - 1024 + sa->period_contrib
 246 *
 247 * because LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024
 248 *
 249 * The same care must be taken when a sched entity is added, updated or
 250 * removed from a cfs_rq and we need to update sched_avg. Scheduler entities
 251 * and the cfs rq, to which they are attached, have the same position in the
 252 * time segment because they use the same clock. This means that we can use
 253 * the period_contrib of cfs_rq when updating the sched_avg of a sched_entity
 254 * if it's more convenient.
 255 */
 256static __always_inline void
 257___update_load_avg(struct sched_avg *sa, unsigned long load)
 258{
 259        u32 divider = get_pelt_divider(sa);
 260
 261        /*
 262         * Step 2: update *_avg.
 263         */
 264        sa->load_avg = div_u64(load * sa->load_sum, divider);
 265        sa->runnable_avg = div_u64(sa->runnable_sum, divider);
 266        WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
 267}
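/*
 * Illustrative user-space sketch (not part of pelt.c): what the
 * position-aware divider buys us.  get_pelt_divider() is assumed here to
 * return "LOAD_AVG_MAX - 1024 + sa->period_contrib"; with that divider an
 * always-running entity reports util_avg ~= 1024 regardless of where in the
 * current 1024us segment the update lands, instead of oscillating within
 * [1002..1024[ as described above.  Real-valued math, capacity scale of 1024.
 */
#include <stdio.h>
#include <math.h>

int main(void)
{
        const double y = pow(0.5, 1.0 / 32.0);
        const double load_avg_max = 1024.0 / (1.0 - y);
        int d;

        for (d = 0; d < 1024; d += 256) {
                /* always running, observed with d us of the current period elapsed */
                double util_sum = d + (load_avg_max - 1024.0);
                double divider  = load_avg_max - 1024.0 + d;  /* position-aware max */

                printf("period_contrib=%4d  naive=%6.1f  position-aware=%6.1f\n",
                       d, 1024.0 * util_sum / load_avg_max,
                       1024.0 * util_sum / divider);
        }
        return 0;
}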
 268
 269/*
 270 * sched_entity:
 271 *
 272 *   task:
 273 *     se_weight()   = se->load.weight
 274 *     se_runnable() = !!on_rq
 275 *
 276 *   group: [ see update_cfs_group() ]
 277 *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
 278 *     se_runnable() = grq->h_nr_running
 279 *
 280 *   runnable_sum = se_runnable() * runnable = grq->runnable_sum
 281 *   runnable_avg = runnable_sum
 282 *
 283 *   load_sum := runnable
 284 *   load_avg = se_weight(se) * load_sum
 285 *
 286 * cfs_rq:
 287 *
 288 *   runnable_sum = \Sum se->avg.runnable_sum
 289 *   runnable_avg = \Sum se->avg.runnable_avg
 290 *
 291 *   load_sum = \Sum se_weight(se) * se->avg.load_sum
 292 *   load_avg = \Sum se->avg.load_avg
 293 */
 294
 295int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
 296{
 297        if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
 298                ___update_load_avg(&se->avg, se_weight(se));
 299                trace_pelt_se_tp(se);
 300                return 1;
 301        }
 302
 303        return 0;
 304}
 305
 306int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
 307{
 308        if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
 309                                cfs_rq->curr == se)) {
 310
 311                ___update_load_avg(&se->avg, se_weight(se));
 312                cfs_se_util_change(&se->avg);
 313                trace_pelt_se_tp(se);
 314                return 1;
 315        }
 316
 317        return 0;
 318}
 319
 320int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
 321{
 322        if (___update_load_sum(now, &cfs_rq->avg,
 323                                scale_load_down(cfs_rq->load.weight),
 324                                cfs_rq->h_nr_running,
 325                                cfs_rq->curr != NULL)) {
 326
 327                ___update_load_avg(&cfs_rq->avg, 1);
 328                trace_pelt_cfs_tp(cfs_rq);
 329                return 1;
 330        }
 331
 332        return 0;
 333}
 334
 335/*
 336 * rt_rq:
 337 *
 338 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 339 *   util_sum = cpu_scale * load_sum
 340 *   runnable_sum = util_sum
 341 *
 342 *   load_avg and runnable_avg are not supported and meaningless.
 343 *
 344 */
 345
 346int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
 347{
 348        if (___update_load_sum(now, &rq->avg_rt,
 349                                running,
 350                                running,
 351                                running)) {
 352
 353                ___update_load_avg(&rq->avg_rt, 1);
 354                trace_pelt_rt_tp(rq);
 355                return 1;
 356        }
 357
 358        return 0;
 359}
 360
 361/*
 362 * dl_rq:
 363 *
 364 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 365 *   util_sum = cpu_scale * load_sum
 366 *   runnable_sum = util_sum
 367 *
 368 *   load_avg and runnable_avg are not supported and meaningless.
 369 *
 370 */
 371
 372int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
 373{
 374        if (___update_load_sum(now, &rq->avg_dl,
 375                                running,
 376                                running,
 377                                running)) {
 378
 379                ___update_load_avg(&rq->avg_dl, 1);
 380                trace_pelt_dl_tp(rq);
 381                return 1;
 382        }
 383
 384        return 0;
 385}
 386
 387#ifdef CONFIG_SCHED_THERMAL_PRESSURE
 388/*
 389 * thermal:
 390 *
 391 *   load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
 392 *
 393 *   util_avg and runnable_load_avg are not supported and meaningless.
 394 *
 395 * Unlike rt/dl utilization tracking, which tracks the time spent by a cpu
 396 * running a rt/dl task through util_avg, the average thermal pressure is
 397 * tracked through load_avg. This is because the thermal pressure signal is
 398 * a time-weighted "delta" capacity, unlike util_avg which is binary.
 399 * "delta capacity" =  actual capacity  -
 400 *                      capped capacity of a cpu due to a thermal event.
 401 */
 402
 403int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
 404{
 405        if (___update_load_sum(now, &rq->avg_thermal,
 406                               capacity,
 407                               capacity,
 408                               capacity)) {
 409                ___update_load_avg(&rq->avg_thermal, 1);
 410                trace_pelt_thermal_tp(rq);
 411                return 1;
 412        }
 413
 414        return 0;
 415}
 416#endif
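/*
 * Illustrative user-space sketch (not part of pelt.c): why thermal pressure
 * goes through load_avg rather than util_avg.  The value fed into the "load"
 * slot is the capacity lost to capping ("delta capacity"), not a 0/1 running
 * flag, so the resulting average weighs each capping level by how long it was
 * in effect.  Real-valued math over whole 1024us periods; the 300-unit cap
 * and the 1024 maximum capacity are made-up example numbers.
 */
#include <stdio.h>
#include <math.h>

int main(void)
{
        const double y = pow(0.5, 1.0 / 32.0);
        const double divider = 1024.0 / (1.0 - y);      /* steady-state divider */
        double load_sum = 0.0, load_avg = 0.0;
        int period;

        for (period = 0; period < 512; period++) {
                /* capacity capped by 300 for the first 256 periods, then uncapped */
                double delta_capacity = (period < 256) ? 300.0 : 0.0;

                load_sum = delta_capacity * 1024.0 + y * load_sum;
                load_avg = load_sum / divider;
        }
        /* pressure rose toward 300 while capped and has decayed back toward 0 */
        printf("thermal pressure ~%.1f\n", load_avg);
        return 0;
}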
 417
 418#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 419/*
 420 * irq:
 421 *
 422 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 423 *   util_sum = cpu_scale * load_sum
 424 *   runnable_sum = util_sum
 425 *
 426 *   load_avg and runnable_avg are not supported and meaningless.
 427 *
 428 */
 429
 430int update_irq_load_avg(struct rq *rq, u64 running)
 431{
 432        int ret = 0;
 433
 434        /*
 435         * We can't use clock_pelt because irq time is not accounted in
 436         * clock_task. Instead we directly scale the running time to
 437         * reflect the real amount of computation.
 438         */
 439        running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
 440        running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq)));
 441
 442        /*
 443         * We know how much time has been used by interrupts since the last
 444         * update but we don't know when. Let's be pessimistic and assume the
 445         * interrupts happened just before the update. This is not far from
 446         * reality because an interrupt will most probably wake up a task and
 447         * trigger an update of the rq clock, during which the metric is updated.
 448         * We start to decay with normal context time and then we add the
 449         * interrupt context time.
 450         * We can safely remove running from rq->clock because
 451         * rq->clock += delta with delta >= running
 452         */
 453        ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,
 454                                0,
 455                                0,
 456                                0);
 457        ret += ___update_load_sum(rq->clock, &rq->avg_irq,
 458                                1,
 459                                1,
 460                                1);
 461
 462        if (ret) {
 463                ___update_load_avg(&rq->avg_irq, 1);
 464                trace_pelt_irq_tp(rq);
 465        }
 466
 467        return ret;
 468}
 469#endif
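/*
 * Illustrative user-space sketch (not part of pelt.c): the two-phase update
 * performed by update_irq_load_avg() above.  The irq time is assumed to have
 * happened at the very end of the elapsed window, so phase 1 only decays the
 * signal (no contribution) and phase 2 accumulates the irq time at full
 * weight.  Real-valued math over whole 1024us periods for brevity.
 */
#include <stdio.h>
#include <math.h>

int main(void)
{
        const double y = pow(0.5, 1.0 / 32.0);
        double util_sum = 20000.0;      /* some prior history */
        int idle_periods = 6;           /* window without irq contribution */
        int irq_periods = 2;            /* irq time, accounted at the end */
        int n;

        /* phase 1: ___update_load_sum(rq->clock - running, ..., 0, 0, 0) */
        for (n = 0; n < idle_periods; n++)
                util_sum = 0.0 + y * util_sum;

        /* phase 2: ___update_load_sum(rq->clock, ..., 1, 1, 1) */
        for (n = 0; n < irq_periods; n++)
                util_sum = 1024.0 + y * util_sum;

        printf("util_sum ~%.0f\n", util_sum);
        return 0;
}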
 470