linux/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook
// Copyright (c) 2021 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#define MAX_LEVELS  10  // max cgroup hierarchy level: arbitrary
#define MAX_EVENTS  32  // max events per cgroup: arbitrary

// NOTE: many of the maps and global variables here are resized or filled in
//       from userspace (the perf tool) via the skeleton helpers before the
//       program is loaded.
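//
//       For illustration only, a hedged sketch of that userspace setup;
//       names like 'skel' and 'num_cgroups' are assumptions, not part of
//       this file:
//
//         bpf_map__set_max_entries(skel->maps.events, num_events * num_cpus);
//         bpf_map__set_max_entries(skel->maps.cgrp_idx, num_cgroups);
//         bpf_map__set_max_entries(skel->maps.cgrp_readings,
//                                  num_cgroups * num_events);
//         skel->rodata->num_events = num_events;
//         skel->rodata->num_cpus = num_cpus;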

// single set of global perf events to measure
struct {
        __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(int));
        __uint(max_entries, 1);
} events SEC(".maps");
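// (flattened 2D array: the slot for event 'idx' on cpu 'cpu' is
//  idx * num_cpus + cpu -- see bperf_cgroup_count below)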

// map from cgroup id to the cgroup's index in cgrp_readings
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u64));
        __uint(value_size, sizeof(__u32));
        __uint(max_entries, 1);
} cgrp_idx SEC(".maps");
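// (populated from userspace with one entry per cgroup being counted)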

// per-cpu event snapshots to calculate delta
struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(struct bpf_perf_event_value));
} prev_readings SEC(".maps");
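// (max_entries is deliberately omitted here; it is set from userspace
//  before load, and the map is keyed by event index)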

// aggregated event values for each cgroup (per-cpu),
// to be read back from userspace
struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(struct bpf_perf_event_value));
} cgrp_readings SEC(".maps");
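// (flattened 2D array: the slot for event 'idx' of cgroup index 'cgrp' is
//  cgrp * num_events + idx -- see bperf_cgroup_count below)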

const volatile __u32 num_events = 1;
const volatile __u32 num_cpus = 1;

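// both are switched from userspace: 'enabled' gates the aggregation below,
// 'use_cgroup_v2' picks which hierarchy walk to use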
int enabled = 0;
int use_cgroup_v2 = 0;

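// Walk the perf_event cgroup (v1) hierarchy of the current task and collect
// the map index of every ancestor cgroup that userspace asked to count.
// Returns the number of indexes written to 'cgrps'.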
static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
        struct task_struct *p = (void *)bpf_get_current_task();
        struct cgroup *cgrp;
        register int i = 0;
        __u32 *elem;
        int level;
        int cnt;

        cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_event_cgrp_id], cgroup);
        level = BPF_CORE_READ(cgrp, level);

        for (cnt = 0; i < MAX_LEVELS; i++) {
                __u64 cgrp_id;

                if (i > level)
                        break;

                // convert cgroup-id to a map index
                cgrp_id = BPF_CORE_READ(cgrp, ancestor_ids[i]);
                elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
                if (!elem)
                        continue;

                cgrps[cnt++] = *elem;
                if (cnt == size)
                        break;
        }

        return cnt;
}

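// Same as above, but for cgroup v2: ancestor ids come straight from
// bpf_get_current_ancestor_cgroup_id() instead of the task's css.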
static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
        register int i = 0;
        __u32 *elem;
        int cnt;

        for (cnt = 0; i < MAX_LEVELS; i++) {
                __u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);

                if (cgrp_id == 0)
                        break;

                // convert cgroup-id to a map index
                elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
                if (!elem)
                        continue;

                cgrps[cnt++] = *elem;
                if (cnt == size)
                        break;
        }

        return cnt;
}

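// Read the current value of each perf event on this cpu, compute the delta
// against the per-cpu snapshot in prev_readings, and charge that delta to
// every tracked ancestor cgroup of the current task in cgrp_readings.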
static int bperf_cgroup_count(void)
{
        register __u32 idx = 0;  // to have it in a register to pass BPF verifier
        register int c = 0;
        struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
        __u32 cpu = bpf_get_smp_processor_id();
        __u32 cgrp_idx[MAX_LEVELS];
        int cgrp_cnt;
        __u32 key, cgrp;
        long err;

        if (use_cgroup_v2)
                cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
        else
                cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);

        for ( ; idx < MAX_EVENTS; idx++) {
                if (idx == num_events)
                        break;

                // XXX: do not pass idx directly (for verifier)
                key = idx;
                // this is per-cpu array for diff
                prev_val = bpf_map_lookup_elem(&prev_readings, &key);
                if (!prev_val) {
                        val.counter = val.enabled = val.running = 0;
                        bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);

                        prev_val = bpf_map_lookup_elem(&prev_readings, &key);
                        if (!prev_val)
                                continue;
                }

                // read from global perf_event array
                key = idx * num_cpus + cpu;
                err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
                if (err)
                        continue;

                if (enabled) {
                        delta.counter = val.counter - prev_val->counter;
                        delta.enabled = val.enabled - prev_val->enabled;
                        delta.running = val.running - prev_val->running;

                        for (c = 0; c < MAX_LEVELS; c++) {
                                if (c == cgrp_cnt)
                                        break;

                                cgrp = cgrp_idx[c];

                                // aggregate the result by cgroup
                                key = cgrp * num_events + idx;
                                cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
                                if (cgrp_val) {
                                        cgrp_val->counter += delta.counter;
                                        cgrp_val->enabled += delta.enabled;
                                        cgrp_val->running += delta.running;
                                } else {
                                        bpf_map_update_elem(&cgrp_readings, &key,
                                                            &delta, BPF_ANY);
                                }
                        }
                }

                *prev_val = val;
        }
        return 0;
}

// This will be attached to the cgroup-switches event on each cpu
SEC("perf_event")
int BPF_PROG(on_cgrp_switch)
{
        return bperf_cgroup_count();
}

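// Lets userspace force an update: the perf tool can run this program on
// each cpu (e.g. via BPF_PROG_TEST_RUN) right before reading cgrp_readings.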
SEC("raw_tp/sched_switch")
int BPF_PROG(trigger_read)
{
        return bperf_cgroup_count();
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";