linux/kernel/bpf/cgroup.c
/*
 * Functions to manage eBPF programs attached to cgroups
 *
 * Copyright (c) 2016 Daniel Mack
 *
 * This file is subject to the terms and conditions of version 2 of the GNU
 * General Public License.  See the file COPYING in the main directory of the
 * Linux distribution for more details.
 */

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <net/sock.h>

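/*
 * cgroup_bpf_enabled_key is a static branch: it stays disabled until the
 * first cgroup-bpf program is attached, so the hooks below are essentially
 * free on systems that never attach cgroup-bpf programs.
 */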
DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);

/**
 * cgroup_bpf_put() - put references of all bpf programs
 * @cgrp: the cgroup to modify
 */
void cgroup_bpf_put(struct cgroup *cgrp)
{
        unsigned int type;

        for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
                struct list_head *progs = &cgrp->bpf.progs[type];
                struct bpf_prog_list *pl, *tmp;

                list_for_each_entry_safe(pl, tmp, progs, node) {
                        list_del(&pl->node);
                        bpf_prog_put(pl->prog);
                        kfree(pl);
                        static_branch_dec(&cgroup_bpf_enabled_key);
                }
                bpf_prog_array_free(cgrp->bpf.effective[type]);
        }
}

/* count number of elements in the list.
 * it's slow but the list cannot be long
 */
static u32 prog_list_length(struct list_head *head)
{
        struct bpf_prog_list *pl;
        u32 cnt = 0;

        list_for_each_entry(pl, head, node) {
                if (!pl->prog)
                        continue;
                cnt++;
        }
        return cnt;
}

/* if parent has non-overridable prog attached,
 * disallow attaching new programs to the descendant cgroup.
 * if parent has overridable or multi-prog, allow attaching
 */
static bool hierarchy_allows_attach(struct cgroup *cgrp,
                                    enum bpf_attach_type type,
                                    u32 new_flags)
{
        struct cgroup *p;

        p = cgroup_parent(cgrp);
        if (!p)
                return true;
        do {
                u32 flags = p->bpf.flags[type];
                u32 cnt;

                if (flags & BPF_F_ALLOW_MULTI)
                        return true;
                cnt = prog_list_length(&p->bpf.progs[type]);
                WARN_ON_ONCE(cnt > 1);
                if (cnt == 1)
                        return !!(flags & BPF_F_ALLOW_OVERRIDE);
                p = cgroup_parent(p);
        } while (p);
        return true;
}
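
/* For example: a program attached to a parent cgroup without
 * BPF_F_ALLOW_OVERRIDE or BPF_F_ALLOW_MULTI blocks attaching any program to
 * its descendants.  With BPF_F_ALLOW_OVERRIDE a descendant may attach its
 * own program, which then runs instead of the parent's; with
 * BPF_F_ALLOW_MULTI descendant programs run in addition to the parent's.
 */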

/* compute a chain of effective programs for a given cgroup:
 * start from the list of programs in this cgroup and add
 * all parent programs.
 * Note that a parent's F_ALLOW_OVERRIDE-type program yields
 * to programs in this cgroup
 */
static int compute_effective_progs(struct cgroup *cgrp,
                                   enum bpf_attach_type type,
                                   struct bpf_prog_array __rcu **array)
{
        struct bpf_prog_array __rcu *progs;
        struct bpf_prog_list *pl;
        struct cgroup *p = cgrp;
        int cnt = 0;

        /* count number of effective programs by walking parents */
        do {
                if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
                        cnt += prog_list_length(&p->bpf.progs[type]);
                p = cgroup_parent(p);
        } while (p);

        progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
        if (!progs)
                return -ENOMEM;

        /* populate the array with effective progs */
        cnt = 0;
        p = cgrp;
        do {
                if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
                        list_for_each_entry(pl,
                                            &p->bpf.progs[type], node) {
                                if (!pl->prog)
                                        continue;
                                rcu_dereference_protected(progs, 1)->
                                        progs[cnt++] = pl->prog;
                        }
                p = cgroup_parent(p);
        } while (p);

        *array = progs;
        return 0;
}

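/* Replace the effective prog array for @type on @cgrp with @array. */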
static void activate_effective_progs(struct cgroup *cgrp,
                                     enum bpf_attach_type type,
                                     struct bpf_prog_array __rcu *array)
{
        struct bpf_prog_array __rcu *old_array;

        old_array = xchg(&cgrp->bpf.effective[type], array);
        /* free prog array after grace period, since __cgroup_bpf_run_*()
         * might be still walking the array
         */
        bpf_prog_array_free(old_array);
}

/**
 * cgroup_bpf_inherit() - inherit effective programs from parent
 * @cgrp: the cgroup to modify
 */
int cgroup_bpf_inherit(struct cgroup *cgrp)
{
/* has to use macro instead of const int, since compiler thinks
 * that array below is variable length
 */
#define NR ARRAY_SIZE(cgrp->bpf.effective)
        struct bpf_prog_array __rcu *arrays[NR] = {};
        int i;

        for (i = 0; i < NR; i++)
                INIT_LIST_HEAD(&cgrp->bpf.progs[i]);

        for (i = 0; i < NR; i++)
                if (compute_effective_progs(cgrp, i, &arrays[i]))
                        goto cleanup;

        for (i = 0; i < NR; i++)
                activate_effective_progs(cgrp, i, arrays[i]);

        return 0;
cleanup:
        for (i = 0; i < NR; i++)
                bpf_prog_array_free(arrays[i]);
        return -ENOMEM;
}

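/* Upper bound on the number of programs attached to one cgroup per attach
 * type; exceeding it makes __cgroup_bpf_attach() return -E2BIG.
 */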
#define BPF_CGROUP_MAX_PROGS 64

/**
 * __cgroup_bpf_attach() - Attach the program to a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup whose descendants to traverse
 * @prog: A program to attach
 * @type: Type of attach operation
 * @flags: Attach flags (BPF_F_ALLOW_OVERRIDE or BPF_F_ALLOW_MULTI)
 *
 * Must be called with cgroup_mutex held.
 */
int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
                        enum bpf_attach_type type, u32 flags)
{
        struct list_head *progs = &cgrp->bpf.progs[type];
        struct bpf_prog *old_prog = NULL;
        struct cgroup_subsys_state *css;
        struct bpf_prog_list *pl;
        bool pl_was_allocated;
        int err;

        if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
                /* invalid combination */
                return -EINVAL;

        if (!hierarchy_allows_attach(cgrp, type, flags))
                return -EPERM;

        if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
                /* Disallow attaching non-overridable on top
                 * of existing overridable in this cgroup.
                 * Disallow attaching multi-prog if overridable or none
                 */
                return -EPERM;

        if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
                return -E2BIG;

        if (flags & BPF_F_ALLOW_MULTI) {
                list_for_each_entry(pl, progs, node)
                        if (pl->prog == prog)
                                /* disallow attaching the same prog twice */
                                return -EINVAL;

                pl = kmalloc(sizeof(*pl), GFP_KERNEL);
                if (!pl)
                        return -ENOMEM;
                pl_was_allocated = true;
                pl->prog = prog;
                list_add_tail(&pl->node, progs);
        } else {
                if (list_empty(progs)) {
                        pl = kmalloc(sizeof(*pl), GFP_KERNEL);
                        if (!pl)
                                return -ENOMEM;
                        pl_was_allocated = true;
                        list_add_tail(&pl->node, progs);
                } else {
                        pl = list_first_entry(progs, typeof(*pl), node);
                        old_prog = pl->prog;
                        pl_was_allocated = false;
                }
                pl->prog = prog;
        }

        cgrp->bpf.flags[type] = flags;

        /* allocate and recompute effective prog arrays */
        css_for_each_descendant_pre(css, &cgrp->self) {
                struct cgroup *desc = container_of(css, struct cgroup, self);

                err = compute_effective_progs(desc, type, &desc->bpf.inactive);
                if (err)
                        goto cleanup;
        }

        /* all allocations were successful. Activate all prog arrays */
        css_for_each_descendant_pre(css, &cgrp->self) {
                struct cgroup *desc = container_of(css, struct cgroup, self);

                activate_effective_progs(desc, type, desc->bpf.inactive);
                desc->bpf.inactive = NULL;
        }

        static_branch_inc(&cgroup_bpf_enabled_key);
        if (old_prog) {
                bpf_prog_put(old_prog);
                static_branch_dec(&cgroup_bpf_enabled_key);
        }
        return 0;

cleanup:
        /* oom while computing effective. Free all computed effective arrays
         * since they were not activated
         */
        css_for_each_descendant_pre(css, &cgrp->self) {
                struct cgroup *desc = container_of(css, struct cgroup, self);

                bpf_prog_array_free(desc->bpf.inactive);
                desc->bpf.inactive = NULL;
        }

        /* and cleanup the prog list */
        pl->prog = old_prog;
        if (pl_was_allocated) {
                list_del(&pl->node);
                kfree(pl);
        }
        return err;
}

/**
 * __cgroup_bpf_detach() - Detach the program from a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup whose descendants to traverse
 * @prog: A program to detach or NULL
 * @type: Type of detach operation
 *
 * Must be called with cgroup_mutex held.
 */
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
                        enum bpf_attach_type type, u32 unused_flags)
{
        struct list_head *progs = &cgrp->bpf.progs[type];
        u32 flags = cgrp->bpf.flags[type];
        struct bpf_prog *old_prog = NULL;
        struct cgroup_subsys_state *css;
        struct bpf_prog_list *pl;
        int err;

        if (flags & BPF_F_ALLOW_MULTI) {
                if (!prog)
                        /* to detach MULTI prog the user has to specify valid FD
                         * of the program to be detached
                         */
                        return -EINVAL;
        } else {
                if (list_empty(progs))
                        /* report error when trying to detach and nothing is attached */
                        return -ENOENT;
        }

        if (flags & BPF_F_ALLOW_MULTI) {
                /* find the prog and detach it */
                list_for_each_entry(pl, progs, node) {
                        if (pl->prog != prog)
                                continue;
                        old_prog = prog;
                        /* mark it deleted, so it's ignored while
                         * recomputing effective
                         */
                        pl->prog = NULL;
                        break;
                }
                if (!old_prog)
                        return -ENOENT;
        } else {
                /* to maintain backward compatibility NONE and OVERRIDE cgroups
                 * allow detaching with invalid FD (prog==NULL)
                 */
                pl = list_first_entry(progs, typeof(*pl), node);
                old_prog = pl->prog;
                pl->prog = NULL;
        }

        /* allocate and recompute effective prog arrays */
        css_for_each_descendant_pre(css, &cgrp->self) {
                struct cgroup *desc = container_of(css, struct cgroup, self);

                err = compute_effective_progs(desc, type, &desc->bpf.inactive);
                if (err)
                        goto cleanup;
        }

        /* all allocations were successful. Activate all prog arrays */
        css_for_each_descendant_pre(css, &cgrp->self) {
                struct cgroup *desc = container_of(css, struct cgroup, self);

                activate_effective_progs(desc, type, desc->bpf.inactive);
                desc->bpf.inactive = NULL;
        }

        /* now can actually delete it from this cgroup list */
        list_del(&pl->node);
        kfree(pl);
        if (list_empty(progs))
                /* last program was detached, reset flags to zero */
                cgrp->bpf.flags[type] = 0;

        bpf_prog_put(old_prog);
        static_branch_dec(&cgroup_bpf_enabled_key);
        return 0;

cleanup:
        /* oom while computing effective. Free all computed effective arrays
         * since they were not activated
         */
        css_for_each_descendant_pre(css, &cgrp->self) {
                struct cgroup *desc = container_of(css, struct cgroup, self);

                bpf_prog_array_free(desc->bpf.inactive);
                desc->bpf.inactive = NULL;
        }

        /* and restore back old_prog */
        pl->prog = old_prog;
        return err;
}

/* Query the attach flags and the ids of the attached programs (or of the
 * effective set when BPF_F_QUERY_EFFECTIVE is passed in query_flags).
 * Must be called with cgroup_mutex held to avoid races.
 */
int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
                       union bpf_attr __user *uattr)
{
        __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
        enum bpf_attach_type type = attr->query.attach_type;
        struct list_head *progs = &cgrp->bpf.progs[type];
        u32 flags = cgrp->bpf.flags[type];
        int cnt, ret = 0, i;

        if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
                cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
        else
                cnt = prog_list_length(progs);

        if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
                return -EFAULT;
        if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
                return -EFAULT;
        if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
                /* return early if user requested only program count + flags */
                return 0;
        if (attr->query.prog_cnt < cnt) {
                cnt = attr->query.prog_cnt;
                ret = -ENOSPC;
        }

        if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
                return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
                                                   prog_ids, cnt);
        } else {
                struct bpf_prog_list *pl;
                u32 id;

                i = 0;
                list_for_each_entry(pl, progs, node) {
                        id = pl->prog->aux->id;
                        if (copy_to_user(prog_ids + i, &id, sizeof(id)))
                                return -EFAULT;
                        if (++i == cnt)
                                break;
                }
        }
        return ret;
}

/**
 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
 * @sk: The socket sending or receiving traffic
 * @skb: The skb that is being sent or received
 * @type: The type of program to be executed
 *
 * If no socket is passed, or the socket is not of type INET or INET6,
 * this function does nothing and returns 0.
 *
 * The program type passed in via @type must be suitable for network
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if an attached program was found and
 * if it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_skb(struct sock *sk,
                                struct sk_buff *skb,
                                enum bpf_attach_type type)
{
        unsigned int offset = skb->data - skb_network_header(skb);
        struct sock *save_sk;
        struct cgroup *cgrp;
        int ret;

        if (!sk || !sk_fullsock(sk))
                return 0;

        if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
                return 0;

        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        save_sk = skb->sk;
        skb->sk = sk;
        __skb_push(skb, offset);
        ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
                                 bpf_prog_run_save_cb);
        __skb_pull(skb, offset);
        skb->sk = save_sk;
        return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);

/**
 * __cgroup_bpf_run_filter_sk() - Run a program on a sock
 * @sk: sock structure to manipulate
 * @type: The type of program to be executed
 *
 * The socket passed is expected to be of type INET or INET6.
 *
 * The program type passed in via @type must be suitable for sock
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if an attached program was found and
 * if it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sk(struct sock *sk,
                               enum bpf_attach_type type)
{
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        int ret;

        ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
        return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);

/**
 * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
 *                                       a sockaddr provided by user space
 * @sk: sock struct that will use sockaddr
 * @uaddr: sockaddr struct provided by user
 * @type: The type of program to be executed
 *
 * The socket is expected to be of type INET or INET6.
 *
 * This function will return %-EPERM if an attached program is found and
 * returned value != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
                                      struct sockaddr *uaddr,
                                      enum bpf_attach_type type)
{
        struct bpf_sock_addr_kern ctx = {
                .sk = sk,
                .uaddr = uaddr,
        };
        struct cgroup *cgrp;
        int ret;

        /* Check socket family since not all sockets represent network
         * endpoint (e.g. AF_UNIX).
         */
        if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
                return 0;

        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);

        return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);

/**
 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
 * @sk: socket to get cgroup from
 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
 * sk with connection information (IP addresses, etc.) May not contain
 * cgroup info if it is a req sock.
 * @type: The type of program to be executed
 *
 * The socket passed is expected to be of type INET or INET6.
 *
 * The program type passed in via @type must be suitable for sock_ops
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if an attached program was found and
 * if it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
                                     struct bpf_sock_ops_kern *sock_ops,
                                     enum bpf_attach_type type)
{
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        int ret;

        ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
                                 BPF_PROG_RUN);
        return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);

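/*
 * __cgroup_bpf_check_dev_permission() - run the device cgroup
 * (BPF_CGROUP_DEVICE) programs of the current task's cgroup for a device
 * access request.  Returns 0 if all programs allowed the access and
 * non-zero otherwise.
 */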
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
                                      short access, enum bpf_attach_type type)
{
        struct cgroup *cgrp;
        struct bpf_cgroup_dev_ctx ctx = {
                .access_type = (access << 16) | dev_type,
                .major = major,
                .minor = minor,
        };
        int allow = 1;

        rcu_read_lock();
        cgrp = task_dfl_cgroup(current);
        allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
                                   BPF_PROG_RUN);
        rcu_read_unlock();

        return !allow;
}
EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);

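/*
 * cgroup_dev_func_proto() - helpers available to device cgroup programs:
 * map lookup/update/delete, current uid/gid, and bpf_trace_printk() for
 * CAP_SYS_ADMIN callers.
 */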
static const struct bpf_func_proto *
cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_map_lookup_elem:
                return &bpf_map_lookup_elem_proto;
        case BPF_FUNC_map_update_elem:
                return &bpf_map_update_elem_proto;
        case BPF_FUNC_map_delete_elem:
                return &bpf_map_delete_elem_proto;
        case BPF_FUNC_get_current_uid_gid:
                return &bpf_get_current_uid_gid_proto;
        case BPF_FUNC_trace_printk:
                if (capable(CAP_SYS_ADMIN))
                        return bpf_get_trace_printk_proto();
        default:
                return NULL;
        }
}

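/*
 * cgroup_dev_is_valid_access() - verifier callback restricting device
 * programs to aligned, read-only loads within struct bpf_cgroup_dev_ctx;
 * narrow (smaller than 4 byte) loads are only allowed on access_type.
 */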
static bool cgroup_dev_is_valid_access(int off, int size,
                                       enum bpf_access_type type,
                                       const struct bpf_prog *prog,
                                       struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        if (type == BPF_WRITE)
                return false;

        if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
                return false;
        /* The verifier guarantees that size > 0. */
        if (off % size != 0)
                return false;

        switch (off) {
        case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
                bpf_ctx_record_field_size(info, size_default);
                if (!bpf_ctx_narrow_access_ok(off, size, size_default))
                        return false;
                break;
        default:
                if (size != size_default)
                        return false;
        }

        return true;
}

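/* Program and verifier ops for BPF_PROG_TYPE_CGROUP_DEVICE programs. */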
const struct bpf_prog_ops cg_dev_prog_ops = {
};

const struct bpf_verifier_ops cg_dev_verifier_ops = {
        .get_func_proto         = cgroup_dev_func_proto,
        .is_valid_access        = cgroup_dev_is_valid_access,
};