LXR linux/kernel/sched/pelt.c

   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Per Entity Load Tracking
   4 *
   5 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   6 *
   7 *  Interactivity improvements by Mike Galbraith
   8 *  (C) 2007 Mike Galbraith <efault@gmx.de>
   9 *
  10 *  Various enhancements by Dmitry Adamushko.
  11 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  12 *
  13 *  Group scheduling enhancements by Srivatsa Vaddagiri
  14 *  Copyright IBM Corporation, 2007
  15 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  16 *
  17 *  Scaled math optimizations by Thomas Gleixner
  18 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  19 *
  20 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  21 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  22 *
  23 *  Move PELT related code from fair.c into this pelt.c file
  24 *  Author: Vincent Guittot <vincent.guittot@linaro.org>
  25 */
  26
  27#include <linux/sched.h>
  28#include "sched.h"
  29#include "sched-pelt.h"
  30#include "pelt.h"
  31
  32/*
  33 * Approximate:
  34 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
  35 */
  36static u64 decay_load(u64 val, u64 n)
  37{
  38        unsigned int local_n;
  39
  40        if (unlikely(n > LOAD_AVG_PERIOD * 63))
  41                return 0;
  42
  43        /* after bounds checking we can collapse to 32-bit */
  44        local_n = n;
  45
  46        /*
  47         * As y^PERIOD = 1/2, we can combine
  48         *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
  49         * With a look-up table which covers y^n (n<PERIOD)
  50         *
  51         * To achieve constant time decay_load.
  52         */
  53        if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
  54                val >>= local_n / LOAD_AVG_PERIOD;
  55                local_n %= LOAD_AVG_PERIOD;
  56        }
  57
  58        val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
  59        return val;
  60}
  61
  62static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
  63{
  64        u32 c1, c2, c3 = d3; /* y^0 == 1 */
  65
  66        /*
  67         * c1 = d1 y^p
  68         */
  69        c1 = decay_load((u64)d1, periods);
  70
  71        /*
  72         *            p-1
  73         * c2 = 1024 \Sum y^n
  74         *            n=1
  75         *
  76         *              inf        inf
  77         *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
  78         *              n=0        n=p
  79         */
  80        c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
  81
  82        return c1 + c2 + c3;
  83}
  84
  85#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
  86
  87/*
  88 * Accumulate the three separate parts of the sum; d1 the remainder
  89 * of the last (incomplete) period, d2 the span of full periods and d3
  90 * the remainder of the (incomplete) current period.
  91 *
  92 *           d1          d2           d3
  93 *           ^           ^            ^
  94 *           |           |            |
  95 *         |<->|<----------------->|<--->|
  96 * ... |---x---|------| ... |------|-----x (now)
  97 *
  98 *                           p-1
  99 * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
 100 *                           n=1
 101 *
 102 *    = u y^p +                                 (Step 1)
 103 *
 104 *                     p-1
 105 *      d1 y^p + 1024 \Sum y^n + d3 y^0         (Step 2)
 106 *                     n=1
 107 */
 108static __always_inline u32
 109accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
 110               unsigned long load, unsigned long runnable, int running)
 111{
 112        unsigned long scale_freq, scale_cpu;
 113        u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
 114        u64 periods;
 115
 116        scale_freq = arch_scale_freq_capacity(cpu);
 117        scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
 118
 119        delta += sa->period_contrib;
 120        periods = delta / 1024; /* A period is 1024us (~1ms) */
 121
 122        /*
 123         * Step 1: decay old *_sum if we crossed period boundaries.
 124         */
 125        if (periods) {
 126                sa->load_sum = decay_load(sa->load_sum, periods);
 127                sa->runnable_load_sum =
 128                        decay_load(sa->runnable_load_sum, periods);
 129                sa->util_sum = decay_load((u64)(sa->util_sum), periods);
 130
 131                /*
 132                 * Step 2
 133                 */
 134                delta %= 1024;
 135                contrib = __accumulate_pelt_segments(periods,
 136                                1024 - sa->period_contrib, delta);
 137        }
 138        sa->period_contrib = delta;
 139
 140        contrib = cap_scale(contrib, scale_freq);
 141        if (load)
 142                sa->load_sum += load * contrib;
 143        if (runnable)
 144                sa->runnable_load_sum += runnable * contrib;
 145        if (running)
 146                sa->util_sum += contrib * scale_cpu;
 147
 148        return periods;
 149}
 150
 151/*
 152 * We can represent the historical contribution to runnable average as the
 153 * coefficients of a geometric series.  To do this we sub-divide our runnable
 154 * history into segments of approximately 1ms (1024us); label the segment that
 155 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 156 *
 157 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 158 *      p0            p1           p2
 159 *     (now)       (~1ms ago)  (~2ms ago)
 160 *
 161 * Let u_i denote the fraction of p_i that the entity was runnable.
 162 *
 163 * We then designate the fractions u_i as our co-efficients, yielding the
 164 * following representation of historical load:
 165 *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 166 *
 167 * We choose y based on the with of a reasonably scheduling period, fixing:
 168 *   y^32 = 0.5
 169 *
 170 * This means that the contribution to load ~32ms ago (u_32) will be weighted
 171 * approximately half as much as the contribution to load within the last ms
 172 * (u_0).
 173 *
 174 * When a period "rolls over" and we have new u_0`, multiplying the previous
 175 * sum again by y is sufficient to update:
 176 *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 177 *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
 178 */
 179static __always_inline int
 180___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
 181                  unsigned long load, unsigned long runnable, int running)
 182{
 183        u64 delta;
 184
 185        delta = now - sa->last_update_time;
 186        /*
 187         * This should only happen when time goes backwards, which it
 188         * unfortunately does during sched clock init when we swap over to TSC.
 189         */
 190        if ((s64)delta < 0) {
 191                sa->last_update_time = now;
 192                return 0;
 193        }
 194
 195        /*
 196         * Use 1024ns as the unit of measurement since it's a reasonable
 197         * approximation of 1us and fast to compute.
 198         */
 199        delta >>= 10;
 200        if (!delta)
 201                return 0;
 202
 203        sa->last_update_time += delta << 10;
 204
 205        /*
 206         * running is a subset of runnable (weight) so running can't be set if
 207         * runnable is clear. But there are some corner cases where the current
 208         * se has been already dequeued but cfs_rq->curr still points to it.
 209         * This means that weight will be 0 but not running for a sched_entity
 210         * but also for a cfs_rq if the latter becomes idle. As an example,
 211         * this happens during idle_balance() which calls
 212         * update_blocked_averages()
 213         */
 214        if (!load)
 215                runnable = running = 0;
 216
 217        /*
 218         * Now we know we crossed measurement unit boundaries. The *_avg
 219         * accrues by two steps:
 220         *
 221         * Step 1: accumulate *_sum since last_update_time. If we haven't
 222         * crossed period boundaries, finish.
 223         */
 224        if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
 225                return 0;
 226
 227        return 1;
 228}
 229
 230static __always_inline void
 231___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
 232{
 233        u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
 234
 235        /*
 236         * Step 2: update *_avg.
 237         */
 238        sa->load_avg = div_u64(load * sa->load_sum, divider);
 239        sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
 240        WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
 241}
 242
 243/*
 244 * sched_entity:
 245 *
 246 *   task:
 247 *     se_runnable() == se_weight()
 248 *
 249 *   group: [ see update_cfs_group() ]
 250 *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
 251 *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
 252 *
 253 *   load_sum := runnable_sum
 254 *   load_avg = se_weight(se) * runnable_avg
 255 *
 256 *   runnable_load_sum := runnable_sum
 257 *   runnable_load_avg = se_runnable(se) * runnable_avg
 258 *
 259 * XXX collapse load_sum and runnable_load_sum
 260 *
 261 * cfq_rq:
 262 *
 263 *   load_sum = \Sum se_weight(se) * se->avg.load_sum
 264 *   load_avg = \Sum se->avg.load_avg
 265 *
 266 *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
 267 *   runnable_load_avg = \Sum se->avg.runable_load_avg
 268 */
 269
 270int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
 271{
 272        if (entity_is_task(se))
 273                se->runnable_weight = se->load.weight;
 274
 275        if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
 276                ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
 277                return 1;
 278        }
 279
 280        return 0;
 281}
 282
 283int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
 284{
 285        if (entity_is_task(se))
 286                se->runnable_weight = se->load.weight;
 287
 288        if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
 289                                cfs_rq->curr == se)) {
 290
 291                ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
 292                cfs_se_util_change(&se->avg);
 293                return 1;
 294        }
 295
 296        return 0;
 297}
 298
 299int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
 300{
 301        if (___update_load_sum(now, cpu, &cfs_rq->avg,
 302                                scale_load_down(cfs_rq->load.weight),
 303                                scale_load_down(cfs_rq->runnable_weight),
 304                                cfs_rq->curr != NULL)) {
 305
 306                ___update_load_avg(&cfs_rq->avg, 1, 1);
 307                return 1;
 308        }
 309
 310        return 0;
 311}
 312
 313/*
 314 * rt_rq:
 315 *
 316 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 317 *   util_sum = cpu_scale * load_sum
 318 *   runnable_load_sum = load_sum
 319 *
 320 *   load_avg and runnable_load_avg are not supported and meaningless.
 321 *
 322 */
 323
 324int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
 325{
 326        if (___update_load_sum(now, rq->cpu, &rq->avg_rt,
 327                                running,
 328                                running,
 329                                running)) {
 330
 331                ___update_load_avg(&rq->avg_rt, 1, 1);
 332                return 1;
 333        }
 334
 335        return 0;
 336}
 337
 338/*
 339 * dl_rq:
 340 *
 341 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 342 *   util_sum = cpu_scale * load_sum
 343 *   runnable_load_sum = load_sum
 344 *
 345 */
 346
 347int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
 348{
 349        if (___update_load_sum(now, rq->cpu, &rq->avg_dl,
 350                                running,
 351                                running,
 352                                running)) {
 353
 354                ___update_load_avg(&rq->avg_dl, 1, 1);
 355                return 1;
 356        }
 357
 358        return 0;
 359}
 360
 361#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
 362/*
 363 * irq:
 364 *
 365 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 366 *   util_sum = cpu_scale * load_sum
 367 *   runnable_load_sum = load_sum
 368 *
 369 */
 370
 371int update_irq_load_avg(struct rq *rq, u64 running)
 372{
 373        int ret = 0;
 374        /*
 375         * We know the time that has been used by interrupt since last update
 376         * but we don't when. Let be pessimistic and assume that interrupt has
 377         * happened just before the update. This is not so far from reality
 378         * because interrupt will most probably wake up task and trig an update
 379         * of rq clock during which the metric si updated.
 380         * We start to decay with normal context time and then we add the
 381         * interrupt context time.
 382         * We can safely remove running from rq->clock because
 383         * rq->clock += delta with delta >= running
 384         */
 385        ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq,
 386                                0,
 387                                0,
 388                                0);
 389        ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq,
 390                                1,
 391                                1,
 392                                1);
 393
 394        if (ret)
 395                ___update_load_avg(&rq->avg_irq, 1, 1);
 396
 397        return ret;
 398}
 399#endif
 400