linux/kernel/bpf/stackmap.c
/* Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */
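/* BPF_MAP_TYPE_STACK_TRACE maps a stack trace id (u32 key) to an array
 * of instruction pointers (u64[] value).  Entries are inserted by BPF
 * programs through the bpf_get_stackid() helper and read from user
 * space with the BPF_MAP_LOOKUP_ELEM syscall command.
 *
 * Sketch of map creation from user space (illustrative only, not part
 * of this file; assumes a raw bpf(2) syscall invocation and an
 * arbitrary max_entries value):
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_STACK_TRACE,
 *		.key_size    = sizeof(__u32),
 *		.value_size  = PERF_MAX_STACK_DEPTH * sizeof(__u64),
 *		.max_entries = 10000,
 *	};
 *	int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 */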
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/vmalloc.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include "percpu_freelist.h"

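/* One stored stack trace: the jhash of the ips for quick comparison,
 * the number of valid entries in nr, and the instruction pointers
 * themselves in the flexible ip[] array.  fnode links an unused bucket
 * onto the per-cpu freelist.
 */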
struct stack_map_bucket {
        struct pcpu_freelist_node fnode;
        u32 hash;
        u32 nr;
        u64 ip[];
};

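/* elems is a single vzalloc'ed area holding all preallocated buckets,
 * handed out through the per-cpu freelist.  buckets[] is a power-of-2
 * sized hash table of bucket pointers, indexed by stack trace id.
 */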
struct bpf_stack_map {
        struct bpf_map map;
        void *elems;
        struct pcpu_freelist freelist;
        u32 n_buckets;
        struct stack_map_bucket *buckets[];
};

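/* Allocate every bucket up front and park it on the freelist, so that
 * bpf_get_stackid() never has to call an allocator at run time: it can
 * be invoked from contexts (e.g. NMI via perf events) where that is
 * not allowed.
 */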
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
        u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
        int err;

        smap->elems = vzalloc(elem_size * smap->map.max_entries);
        if (!smap->elems)
                return -ENOMEM;

        err = pcpu_freelist_init(&smap->freelist);
        if (err)
                goto free_elems;

        pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
                               smap->map.max_entries);
        return 0;

free_elems:
        vfree(smap->elems);
        return err;
}

/* Called from syscall */
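/* Validate the attributes, size the power-of-2 bucket table, check the
 * map's total footprint against the user's RLIMIT_MEMLOCK, and only
 * then preallocate the buckets.
 */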
static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
        u32 value_size = attr->value_size;
        struct bpf_stack_map *smap;
        u64 cost, n_buckets;
        int err;

        if (!capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);

        if (attr->map_flags)
                return ERR_PTR(-EINVAL);

        /* check sanity of attributes */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
            value_size < 8 || value_size % 8 ||
            value_size / 8 > PERF_MAX_STACK_DEPTH)
                return ERR_PTR(-EINVAL);

        /* hash table size must be power of 2 */
        n_buckets = roundup_pow_of_two(attr->max_entries);

        cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
        if (cost >= U32_MAX - PAGE_SIZE)
                return ERR_PTR(-E2BIG);

        smap = kzalloc(cost, GFP_USER | __GFP_NOWARN);
        if (!smap) {
                smap = vzalloc(cost);
                if (!smap)
                        return ERR_PTR(-ENOMEM);
        }

        err = -E2BIG;
        cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
        if (cost >= U32_MAX - PAGE_SIZE)
                goto free_smap;

        smap->map.map_type = attr->map_type;
        smap->map.key_size = attr->key_size;
        smap->map.value_size = value_size;
        smap->map.max_entries = attr->max_entries;
        smap->n_buckets = n_buckets;
        smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

        err = bpf_map_precharge_memlock(smap->map.pages);
        if (err)
                goto free_smap;

        err = get_callchain_buffers();
        if (err)
                goto free_smap;

        err = prealloc_elems_and_freelist(smap);
        if (err)
                goto put_buffers;

        return &smap->map;

put_buffers:
        put_callchain_buffers();
free_smap:
        kvfree(smap);
        return ERR_PTR(err);
}

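/* Walk the current call chain, hash it and store it in the map.
 * Returns the bucket id (>= 0) on success or a negative error.  The
 * low byte of flags skips that many frames, BPF_F_USER_STACK collects
 * the user stack instead of the kernel one, BPF_F_FAST_STACK_CMP
 * compares buckets by hash only, and BPF_F_REUSE_STACKID overwrites
 * the old trace on a hash collision.
 *
 * Sketch of a call from a BPF program (illustrative only; assumes a
 * stack trace map named stackmap is available to the program):
 *
 *	int id = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK);
 *	if (id >= 0)
 *		... use id as a key into other maps ...
 */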
static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
{
        struct pt_regs *regs = (struct pt_regs *) (long) r1;
        struct bpf_map *map = (struct bpf_map *) (long) r2;
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct perf_callchain_entry *trace;
        struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
        u32 max_depth = map->value_size / 8;
        /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
        u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
        u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
        u32 hash, id, trace_nr, trace_len;
        bool user = flags & BPF_F_USER_STACK;
        bool kernel = !user;
        u64 *ips;

        if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
                               BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
                return -EINVAL;

        trace = get_perf_callchain(regs, init_nr, kernel, user, false, false);

        if (unlikely(!trace))
                /* couldn't fetch the stack trace */
                return -EFAULT;

        /* get_perf_callchain() guarantees that trace->nr >= init_nr
         * and trace->nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
         */
        trace_nr = trace->nr - init_nr;

        if (trace_nr <= skip)
                /* skipping more than usable stack trace */
                return -EFAULT;

        trace_nr -= skip;
        trace_len = trace_nr * sizeof(u64);
        ips = trace->ip + skip + init_nr;
        hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
        id = hash & (smap->n_buckets - 1);
        bucket = READ_ONCE(smap->buckets[id]);

        if (bucket && bucket->hash == hash) {
                if (flags & BPF_F_FAST_STACK_CMP)
                        return id;
                if (bucket->nr == trace_nr &&
                    memcmp(bucket->ip, ips, trace_len) == 0)
                        return id;
        }

        /* this call stack is not in the map, try to add it */
        if (bucket && !(flags & BPF_F_REUSE_STACKID))
                return -EEXIST;

        new_bucket = (struct stack_map_bucket *)
                pcpu_freelist_pop(&smap->freelist);
        if (unlikely(!new_bucket))
                return -ENOMEM;

        memcpy(new_bucket->ip, ips, trace_len);
        new_bucket->hash = hash;
        new_bucket->nr = trace_nr;

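        /* Publish the new bucket with an atomic exchange; if this
         * displaces an old one, recycle it through the freelist rather
         * than freeing it.
         */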
        old_bucket = xchg(&smap->buckets[id], new_bucket);
        if (old_bucket)
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
        return id;
}

const struct bpf_func_proto bpf_get_stackid_proto = {
        .func           = bpf_get_stackid,
        .gpl_only       = true,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_CONST_MAP_PTR,
        .arg3_type      = ARG_ANYTHING,
};

/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
        return NULL;
}

/* Called from syscall */
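/* The bucket is detached with xchg() for the duration of the copy, so
 * a concurrent delete or update cannot recycle it underneath us; it is
 * put back (possibly displacing a newer trace) once the copy is done.
 */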
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct stack_map_bucket *bucket, *old_bucket;
        u32 id = *(u32 *)key, trace_len;

        if (unlikely(id >= smap->n_buckets))
                return -ENOENT;

        bucket = xchg(&smap->buckets[id], NULL);
        if (!bucket)
                return -ENOENT;

        trace_len = bucket->nr * sizeof(u64);
        memcpy(value, bucket->ip, trace_len);
        memset(value + trace_len, 0, map->value_size - trace_len);

        old_bucket = xchg(&smap->buckets[id], bucket);
        if (old_bucket)
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
        return 0;
}

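/* Iteration and direct updates are not supported; stack traces enter
 * the map only through bpf_get_stackid().
 */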
static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
        return -EINVAL;
}

static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
                                 u64 map_flags)
{
        return -EINVAL;
}

/* Called from syscall or from eBPF program */
static int stack_map_delete_elem(struct bpf_map *map, void *key)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct stack_map_bucket *old_bucket;
        u32 id = *(u32 *)key;

        if (unlikely(id >= smap->n_buckets))
                return -E2BIG;

        old_bucket = xchg(&smap->buckets[id], NULL);
        if (old_bucket) {
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
                return 0;
        } else {
                return -ENOENT;
        }
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void stack_map_free(struct bpf_map *map)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);

        /* wait for bpf programs to complete before freeing stack map */
        synchronize_rcu();

        vfree(smap->elems);
        pcpu_freelist_destroy(&smap->freelist);
        kvfree(smap);
        put_callchain_buffers();
}

static const struct bpf_map_ops stack_map_ops = {
        .map_alloc = stack_map_alloc,
        .map_free = stack_map_free,
        .map_get_next_key = stack_map_get_next_key,
        .map_lookup_elem = stack_map_lookup_elem,
        .map_update_elem = stack_map_update_elem,
        .map_delete_elem = stack_map_delete_elem,
};

static struct bpf_map_type_list stack_map_type __read_mostly = {
        .ops = &stack_map_ops,
        .type = BPF_MAP_TYPE_STACK_TRACE,
};

static int __init register_stack_map(void)
{
        bpf_register_map_type(&stack_map_type);
        return 0;
}
late_initcall(register_stack_map);