linux/kernel/bpf/stackmap.c
/* Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/vmalloc.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include "percpu_freelist.h"

struct stack_map_bucket {
        struct pcpu_freelist_node fnode;
        u32 hash;
        u32 nr;
        u64 ip[];
};

struct bpf_stack_map {
        struct bpf_map map;
        void *elems;
        struct pcpu_freelist freelist;
        u32 n_buckets;
        struct stack_map_bucket *buckets[];
};

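/* All bucket storage is allocated up front as one vzalloc() area and carved
 * into fixed-size elements that are handed out through a per-cpu freelist,
 * so bpf_get_stackid() never has to allocate memory at run time.
 */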
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
        u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
        int err;

        smap->elems = vzalloc(elem_size * smap->map.max_entries);
        if (!smap->elems)
                return -ENOMEM;

        err = pcpu_freelist_init(&smap->freelist);
        if (err)
                goto free_elems;

        pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
                               smap->map.max_entries);
        return 0;

free_elems:
        vfree(smap->elems);
        return err;
}

/* Called from syscall */
static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
        u32 value_size = attr->value_size;
        struct bpf_stack_map *smap;
        u64 cost, n_buckets;
        int err;

        if (!capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);

        if (attr->map_flags)
                return ERR_PTR(-EINVAL);

        /* check sanity of attributes */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
            value_size < 8 || value_size % 8 ||
            value_size / 8 > sysctl_perf_event_max_stack)
                return ERR_PTR(-EINVAL);

        /* hash table size must be power of 2 */
        n_buckets = roundup_pow_of_two(attr->max_entries);

        cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
        if (cost >= U32_MAX - PAGE_SIZE)
                return ERR_PTR(-E2BIG);

        smap = kzalloc(cost, GFP_USER | __GFP_NOWARN);
        if (!smap) {
                smap = vzalloc(cost);
                if (!smap)
                        return ERR_PTR(-ENOMEM);
        }

        err = -E2BIG;
        cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
        if (cost >= U32_MAX - PAGE_SIZE)
                goto free_smap;

        smap->map.map_type = attr->map_type;
        smap->map.key_size = attr->key_size;
        smap->map.value_size = value_size;
        smap->map.max_entries = attr->max_entries;
        smap->n_buckets = n_buckets;
        smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

        err = bpf_map_precharge_memlock(smap->map.pages);
        if (err)
                goto free_smap;

        err = get_callchain_buffers(sysctl_perf_event_max_stack);
        if (err)
                goto free_smap;

        err = prealloc_elems_and_freelist(smap);
        if (err)
                goto put_buffers;

        return &smap->map;

put_buffers:
        put_callchain_buffers();
free_smap:
        kvfree(smap);
        return ERR_PTR(err);
}
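
/* Illustrative userspace sketch (sizes chosen arbitrarily): creating a stack
 * trace map with the raw bpf(2) syscall. The attribute constraints mirror
 * the sanity checks above: key_size must be 4 and value_size a non-zero
 * multiple of 8, at most sysctl_perf_event_max_stack frames deep.
 *
 *      union bpf_attr attr = {
 *              .map_type    = BPF_MAP_TYPE_STACK_TRACE,
 *              .key_size    = sizeof(__u32),
 *              .value_size  = 127 * sizeof(__u64),
 *              .max_entries = 10000,
 *      };
 *      int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 */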

u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
{
        struct pt_regs *regs = (struct pt_regs *) (long) r1;
        struct bpf_map *map = (struct bpf_map *) (long) r2;
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct perf_callchain_entry *trace;
        struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
        u32 max_depth = map->value_size / 8;
        /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
        u32 init_nr = sysctl_perf_event_max_stack - max_depth;
        u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
        u32 hash, id, trace_nr, trace_len;
        bool user = flags & BPF_F_USER_STACK;
        bool kernel = !user;
        u64 *ips;

        if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
                               BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
                return -EINVAL;

        trace = get_perf_callchain(regs, init_nr, kernel, user,
                                   sysctl_perf_event_max_stack, false, false);

        if (unlikely(!trace))
                /* couldn't fetch the stack trace */
                return -EFAULT;

        /* get_perf_callchain() guarantees that trace->nr >= init_nr
         * and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
         */
        trace_nr = trace->nr - init_nr;

        if (trace_nr <= skip)
                /* skipping more than usable stack trace */
                return -EFAULT;

        trace_nr -= skip;
        trace_len = trace_nr * sizeof(u64);
        ips = trace->ip + skip + init_nr;
        hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
        id = hash & (smap->n_buckets - 1);
        bucket = READ_ONCE(smap->buckets[id]);

        if (bucket && bucket->hash == hash) {
                if (flags & BPF_F_FAST_STACK_CMP)
                        return id;
                if (bucket->nr == trace_nr &&
                    memcmp(bucket->ip, ips, trace_len) == 0)
                        return id;
        }

        /* this call stack is not in the map, try to add it */
        if (bucket && !(flags & BPF_F_REUSE_STACKID))
                return -EEXIST;

        new_bucket = (struct stack_map_bucket *)
                pcpu_freelist_pop(&smap->freelist);
        if (unlikely(!new_bucket))
                return -ENOMEM;

        memcpy(new_bucket->ip, ips, trace_len);
        new_bucket->hash = hash;
        new_bucket->nr = trace_nr;

        old_bucket = xchg(&smap->buckets[id], new_bucket);
        if (old_bucket)
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
        return id;
}

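/* Illustrative BPF program sketch (the map name is hypothetical): from a
 * perf_event or kprobe program, a stack is recorded and its id reused as a
 * key into a companion map:
 *
 *      int id = bpf_get_stackid(ctx, &stack_traces, BPF_F_USER_STACK);
 *      if (id >= 0)
 *              ... account the sample against this user stack id ...
 *
 * The BPF_F_SKIP_FIELD_MASK bits of flags select how many leading frames to
 * skip, BPF_F_FAST_STACK_CMP compares buckets by hash only, and
 * BPF_F_REUSE_STACKID overwrites a colliding bucket instead of returning
 * -EEXIST.
 */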
const struct bpf_func_proto bpf_get_stackid_proto = {
        .func           = bpf_get_stackid,
        .gpl_only       = true,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_CONST_MAP_PTR,
        .arg3_type      = ARG_ANYTHING,
};

/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
        return NULL;
}

/* Called from syscall */
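/* The bucket is detached (swapped with NULL) for the duration of the copy so
 * that a concurrent bpf_get_stackid() hitting the same slot cannot recycle
 * its storage through the freelist while it is being read. The bucket is put
 * back afterwards; one installed concurrently in the meantime is released to
 * the freelist instead.
 */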
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct stack_map_bucket *bucket, *old_bucket;
        u32 id = *(u32 *)key, trace_len;

        if (unlikely(id >= smap->n_buckets))
                return -ENOENT;

        bucket = xchg(&smap->buckets[id], NULL);
        if (!bucket)
                return -ENOENT;

        trace_len = bucket->nr * sizeof(u64);
        memcpy(value, bucket->ip, trace_len);
        memset(value + trace_len, 0, map->value_size - trace_len);

        old_bucket = xchg(&smap->buckets[id], bucket);
        if (old_bucket)
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
        return 0;
}

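/* New stack traces can only be inserted by bpf_get_stackid() from a BPF
 * program; key iteration and element updates through the syscall interface
 * are not supported, so the two callbacks below reject with -EINVAL.
 */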
static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
        return -EINVAL;
}

static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
                                 u64 map_flags)
{
        return -EINVAL;
}

/* Called from syscall or from eBPF program */
static int stack_map_delete_elem(struct bpf_map *map, void *key)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct stack_map_bucket *old_bucket;
        u32 id = *(u32 *)key;

        if (unlikely(id >= smap->n_buckets))
                return -E2BIG;

        old_bucket = xchg(&smap->buckets[id], NULL);
        if (old_bucket) {
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
                return 0;
        } else {
                return -ENOENT;
        }
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void stack_map_free(struct bpf_map *map)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);

        /* wait for bpf programs to complete before freeing stack map */
        synchronize_rcu();

        vfree(smap->elems);
        pcpu_freelist_destroy(&smap->freelist);
        kvfree(smap);
        put_callchain_buffers();
}

static const struct bpf_map_ops stack_map_ops = {
        .map_alloc = stack_map_alloc,
        .map_free = stack_map_free,
        .map_get_next_key = stack_map_get_next_key,
        .map_lookup_elem = stack_map_lookup_elem,
        .map_update_elem = stack_map_update_elem,
        .map_delete_elem = stack_map_delete_elem,
};

static struct bpf_map_type_list stack_map_type __read_mostly = {
        .ops = &stack_map_ops,
        .type = BPF_MAP_TYPE_STACK_TRACE,
};

static int __init register_stack_map(void)
{
        bpf_register_map_type(&stack_map_type);
        return 0;
}
late_initcall(register_stack_map);