linux/samples/bpf/cpustat_kern.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include <linux/version.h>
   4#include <linux/ptrace.h>
   5#include <uapi/linux/bpf.h>
   6#include "bpf_helpers.h"
   7
/*
 * The CPU number, cstate number and pstate number are based
 * on the 96boards Hikey with octa CA53 CPUs.
 *
 * Every CPU has three idle states (cstates):
 *   WFI, CPU_OFF, CLUSTER_OFF
 *
 * Every CPU has 5 operating points:
 *   208MHz, 432MHz, 729MHz, 960MHz, 1200MHz
 *
 * This code is based on these assumptions; other platforms
 * need to adjust these definitions.
 */
  21#define MAX_CPU                 8
  22#define MAX_PSTATE_ENTRIES      5
  23#define MAX_CSTATE_ENTRIES      3
  24
  25static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 };
  26
/*
 * The my_map structure records the cstate and pstate index together
 * with a timestamp (Idx, Ts); when a new event comes in, the pair is
 * updated with the new state index and timestamp (Idx`, Ts`).
 *
 * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time
 * interval for the previous state: Duration(Idx) = Ts` - Ts.
 *
 * Every CPU has one array, laid out as below, recording the state
 * index and timestamp; cstate and pstate are recorded separately:
 *
 * +--------------------------+
 * | cstate timestamp         |
 * +--------------------------+
 * | cstate index             |
 * +--------------------------+
 * | pstate timestamp         |
 * +--------------------------+
 * | pstate index             |
 * +--------------------------+
 */
  48#define MAP_OFF_CSTATE_TIME     0
  49#define MAP_OFF_CSTATE_IDX      1
  50#define MAP_OFF_PSTATE_TIME     2
  51#define MAP_OFF_PSTATE_IDX      3
  52#define MAP_OFF_NUM             4
  53
  54struct bpf_map_def SEC("maps") my_map = {
  55        .type = BPF_MAP_TYPE_ARRAY,
  56        .key_size = sizeof(u32),
  57        .value_size = sizeof(u64),
  58        .max_entries = MAX_CPU * MAP_OFF_NUM,
  59};
  60
  61/* cstate_duration records duration time for every idle state per CPU */
  62struct bpf_map_def SEC("maps") cstate_duration = {
  63        .type = BPF_MAP_TYPE_ARRAY,
  64        .key_size = sizeof(u32),
  65        .value_size = sizeof(u64),
  66        .max_entries = MAX_CPU * MAX_CSTATE_ENTRIES,
  67};
  68
  69/* pstate_duration records duration time for every operating point per CPU */
  70struct bpf_map_def SEC("maps") pstate_duration = {
  71        .type = BPF_MAP_TYPE_ARRAY,
  72        .key_size = sizeof(u32),
  73        .value_size = sizeof(u64),
  74        .max_entries = MAX_CPU * MAX_PSTATE_ENTRIES,
  75};
  76
/*
 * The trace event layouts for cpu_idle and cpu_frequency are taken from:
 * /sys/kernel/debug/tracing/events/power/cpu_idle/format
 * /sys/kernel/debug/tracing/events/power/cpu_frequency/format
 *
 * These two events have the same format, so one common structure is
 * defined for both.
 */
  84struct cpu_args {
  85        u64 pad;
  86        u32 state;
  87        u32 cpu_id;
  88};
  89
  90/* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */
  91static u32 find_cpu_pstate_idx(u32 frequency)
  92{
  93        u32 i;
  94
  95        for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) {
  96                if (frequency == cpu_opps[i])
  97                        return i;
  98        }
  99
 100        return i;
 101}
 102
 103SEC("tracepoint/power/cpu_idle")
 104int bpf_prog1(struct cpu_args *ctx)
 105{
 106        u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta;
 107        u32 key, cpu, pstate_idx;
 108        u64 *val;
 109
 110        if (ctx->cpu_id > MAX_CPU)
 111                return 0;
 112
 113        cpu = ctx->cpu_id;
 114
 115        key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME;
 116        cts = bpf_map_lookup_elem(&my_map, &key);
 117        if (!cts)
 118                return 0;
 119
 120        key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
 121        cstate = bpf_map_lookup_elem(&my_map, &key);
 122        if (!cstate)
 123                return 0;
 124
 125        key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
 126        pts = bpf_map_lookup_elem(&my_map, &key);
 127        if (!pts)
 128                return 0;
 129
 130        key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
 131        pstate = bpf_map_lookup_elem(&my_map, &key);
 132        if (!pstate)
 133                return 0;
 134
 135        prev_state = *cstate;
 136        *cstate = ctx->state;
 137
 138        if (!*cts) {
 139                *cts = bpf_ktime_get_ns();
 140                return 0;
 141        }
 142
 143        cur_ts = bpf_ktime_get_ns();
 144        delta = cur_ts - *cts;
 145        *cts = cur_ts;
 146
 147        /*
 148         * When state doesn't equal to (u32)-1, the cpu will enter
 149         * one idle state; for this case we need to record interval
 150         * for the pstate.
 151         *
 152         *                 OPP2
 153         *            +---------------------+
 154         *     OPP1   |                     |
 155         *   ---------+                     |
 156         *                                  |  Idle state
 157         *                                  +---------------
 158         *
 159         *            |<- pstate duration ->|
 160         *            ^                     ^
 161         *           pts                  cur_ts
 162         */
 163        if (ctx->state != (u32)-1) {
 164
 165                /* record pstate after have first cpu_frequency event */
 166                if (!*pts)
 167                        return 0;
 168
 169                delta = cur_ts - *pts;
 170
 171                pstate_idx = find_cpu_pstate_idx(*pstate);
 172                if (pstate_idx >= MAX_PSTATE_ENTRIES)
 173                        return 0;
 174
 175                key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
 176                val = bpf_map_lookup_elem(&pstate_duration, &key);
 177                if (val)
 178                        __sync_fetch_and_add((long *)val, delta);
 179
 180        /*
 181         * When state equal to (u32)-1, the cpu just exits from one
 182         * specific idle state; for this case we need to record
 183         * interval for the pstate.
 184         *
 185         *       OPP2
 186         *   -----------+
 187         *              |                          OPP1
 188         *              |                     +-----------
 189         *              |     Idle state      |
 190         *              +---------------------+
 191         *
 192         *              |<- cstate duration ->|
 193         *              ^                     ^
 194         *             cts                  cur_ts
 195         */
 196        } else {
 197
 198                key = cpu * MAX_CSTATE_ENTRIES + prev_state;
 199                val = bpf_map_lookup_elem(&cstate_duration, &key);
 200                if (val)
 201                        __sync_fetch_and_add((long *)val, delta);
 202        }
 203
 204        /* Update timestamp for pstate as new start time */
 205        if (*pts)
 206                *pts = cur_ts;
 207
 208        return 0;
 209}
 210
 211SEC("tracepoint/power/cpu_frequency")
 212int bpf_prog2(struct cpu_args *ctx)
 213{
 214        u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta;
 215        u32 key, cpu, pstate_idx;
 216        u64 *val;
 217
 218        cpu = ctx->cpu_id;
 219
 220        key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
 221        pts = bpf_map_lookup_elem(&my_map, &key);
 222        if (!pts)
 223                return 0;
 224
 225        key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
 226        pstate = bpf_map_lookup_elem(&my_map, &key);
 227        if (!pstate)
 228                return 0;
 229
 230        key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
 231        cstate = bpf_map_lookup_elem(&my_map, &key);
 232        if (!cstate)
 233                return 0;
 234
 235        prev_state = *pstate;
 236        *pstate = ctx->state;
 237
 238        if (!*pts) {
 239                *pts = bpf_ktime_get_ns();
 240                return 0;
 241        }
 242
 243        cur_ts = bpf_ktime_get_ns();
 244        delta = cur_ts - *pts;
 245        *pts = cur_ts;
 246
 247        /* When CPU is in idle, bail out to skip pstate statistics */
 248        if (*cstate != (u32)(-1))
 249                return 0;
 250
 251        /*
 252         * The cpu changes to another different OPP (in below diagram
 253         * change frequency from OPP3 to OPP1), need recording interval
 254         * for previous frequency OPP3 and update timestamp as start
 255         * time for new frequency OPP1.
 256         *
 257         *                 OPP3
 258         *            +---------------------+
 259         *     OPP2   |                     |
 260         *   ---------+                     |
 261         *                                  |    OPP1
 262         *                                  +---------------
 263         *
 264         *            |<- pstate duration ->|
 265         *            ^                     ^
 266         *           pts                  cur_ts
 267         */
 268        pstate_idx = find_cpu_pstate_idx(*pstate);
 269        if (pstate_idx >= MAX_PSTATE_ENTRIES)
 270                return 0;
 271
 272        key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
 273        val = bpf_map_lookup_elem(&pstate_duration, &key);
 274        if (val)
 275                __sync_fetch_and_add((long *)val, delta);
 276
 277        return 0;
 278}
 279
 280char _license[] SEC("license") = "GPL";
 281u32 _version SEC("version") = LINUX_VERSION_CODE;
 282