linux/kernel/bpf/cgroup.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Functions to manage eBPF programs attached to cgroups
   4 *
   5 * Copyright (c) 2016 Daniel Mack
   6 */
   7
   8#include <linux/kernel.h>
   9#include <linux/atomic.h>
  10#include <linux/cgroup.h>
  11#include <linux/filter.h>
  12#include <linux/slab.h>
  13#include <linux/sysctl.h>
  14#include <linux/string.h>
  15#include <linux/bpf.h>
  16#include <linux/bpf-cgroup.h>
  17#include <net/sock.h>
  18#include <net/bpf_sk_storage.h>
  19
  20#include "../cgroup/cgroup-internal.h"
  21
  22DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
  23EXPORT_SYMBOL(cgroup_bpf_enabled_key);
  24
  25void cgroup_bpf_offline(struct cgroup *cgrp)
  26{
  27        cgroup_get(cgrp);
  28        percpu_ref_kill(&cgrp->bpf.refcnt);
  29}
  30
  31/**
  32 * cgroup_bpf_release() - put references of all bpf programs and
  33 *                        release all cgroup bpf data
  34 * @work: work structure embedded into the cgroup to modify
  35 */
  36static void cgroup_bpf_release(struct work_struct *work)
  37{
  38        struct cgroup *cgrp = container_of(work, struct cgroup,
  39                                           bpf.release_work);
  40        enum bpf_cgroup_storage_type stype;
  41        struct bpf_prog_array *old_array;
  42        unsigned int type;
  43
  44        mutex_lock(&cgroup_mutex);
  45
  46        for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
  47                struct list_head *progs = &cgrp->bpf.progs[type];
  48                struct bpf_prog_list *pl, *tmp;
  49
  50                list_for_each_entry_safe(pl, tmp, progs, node) {
  51                        list_del(&pl->node);
  52                        bpf_prog_put(pl->prog);
  53                        for_each_cgroup_storage_type(stype) {
  54                                bpf_cgroup_storage_unlink(pl->storage[stype]);
  55                                bpf_cgroup_storage_free(pl->storage[stype]);
  56                        }
  57                        kfree(pl);
  58                        static_branch_dec(&cgroup_bpf_enabled_key);
  59                }
  60                old_array = rcu_dereference_protected(
  61                                cgrp->bpf.effective[type],
  62                                lockdep_is_held(&cgroup_mutex));
  63                bpf_prog_array_free(old_array);
  64        }
  65
  66        mutex_unlock(&cgroup_mutex);
  67
  68        percpu_ref_exit(&cgrp->bpf.refcnt);
  69        cgroup_put(cgrp);
  70}
  71
  72/**
  73 * cgroup_bpf_release_fn() - callback used to schedule releasing
  74 *                           of bpf cgroup data
  75 * @ref: percpu ref counter structure
  76 */
  77static void cgroup_bpf_release_fn(struct percpu_ref *ref)
  78{
  79        struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
  80
  81        INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
  82        queue_work(system_wq, &cgrp->bpf.release_work);
  83}
  84
  85/* count number of elements in the list.
  86 * it's slow but the list cannot be long
  87 */
  88static u32 prog_list_length(struct list_head *head)
  89{
  90        struct bpf_prog_list *pl;
  91        u32 cnt = 0;
  92
  93        list_for_each_entry(pl, head, node) {
  94                if (!pl->prog)
  95                        continue;
  96                cnt++;
  97        }
  98        return cnt;
  99}
 100
 101/* if parent has non-overridable prog attached,
  102 * disallow attaching new programs to the descendant cgroup.
 103 * if parent has overridable or multi-prog, allow attaching
 104 */
 105static bool hierarchy_allows_attach(struct cgroup *cgrp,
 106                                    enum bpf_attach_type type,
 107                                    u32 new_flags)
 108{
 109        struct cgroup *p;
 110
 111        p = cgroup_parent(cgrp);
 112        if (!p)
 113                return true;
 114        do {
 115                u32 flags = p->bpf.flags[type];
 116                u32 cnt;
 117
 118                if (flags & BPF_F_ALLOW_MULTI)
 119                        return true;
 120                cnt = prog_list_length(&p->bpf.progs[type]);
 121                WARN_ON_ONCE(cnt > 1);
 122                if (cnt == 1)
 123                        return !!(flags & BPF_F_ALLOW_OVERRIDE);
 124                p = cgroup_parent(p);
 125        } while (p);
 126        return true;
 127}
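
/*
 * Illustrative example of the rules above, assuming a hierarchy A -> B
 * (cgroup names are made up):
 *
 *   - A has a program attached with no flags (non-overridable):
 *     attaching to B is rejected with -EPERM.
 *   - A's program was attached with BPF_F_ALLOW_OVERRIDE:
 *     attaching to B succeeds and B's program overrides A's for B's tasks.
 *   - A's programs were attached with BPF_F_ALLOW_MULTI:
 *     attaching to B succeeds and both A's and B's programs are executed.
 */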
 128
 129/* compute a chain of effective programs for a given cgroup:
 130 * start from the list of programs in this cgroup and add
 131 * all parent programs.
  132 * Note that a parent's F_ALLOW_OVERRIDE-type program yields
  133 * to programs in this cgroup.
 134 */
 135static int compute_effective_progs(struct cgroup *cgrp,
 136                                   enum bpf_attach_type type,
 137                                   struct bpf_prog_array **array)
 138{
 139        enum bpf_cgroup_storage_type stype;
 140        struct bpf_prog_array *progs;
 141        struct bpf_prog_list *pl;
 142        struct cgroup *p = cgrp;
 143        int cnt = 0;
 144
 145        /* count number of effective programs by walking parents */
 146        do {
 147                if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
 148                        cnt += prog_list_length(&p->bpf.progs[type]);
 149                p = cgroup_parent(p);
 150        } while (p);
 151
 152        progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
 153        if (!progs)
 154                return -ENOMEM;
 155
 156        /* populate the array with effective progs */
 157        cnt = 0;
 158        p = cgrp;
 159        do {
 160                if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
 161                        continue;
 162
 163                list_for_each_entry(pl, &p->bpf.progs[type], node) {
 164                        if (!pl->prog)
 165                                continue;
 166
 167                        progs->items[cnt].prog = pl->prog;
 168                        for_each_cgroup_storage_type(stype)
 169                                progs->items[cnt].cgroup_storage[stype] =
 170                                        pl->storage[stype];
 171                        cnt++;
 172                }
 173        } while ((p = cgroup_parent(p)));
 174
 175        *array = progs;
 176        return 0;
 177}
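
/*
 * Illustrative note: for a chain root -> A -> B where every level attached
 * its programs with BPF_F_ALLOW_MULTI, the loop above fills B's effective
 * array as
 *
 *   items[] = { B's programs, A's programs, root's programs }
 *
 * i.e. the cgroup's own programs first, followed by each ancestor's in
 * walk-up order.
 */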
 178
 179static void activate_effective_progs(struct cgroup *cgrp,
 180                                     enum bpf_attach_type type,
 181                                     struct bpf_prog_array *old_array)
 182{
 183        rcu_swap_protected(cgrp->bpf.effective[type], old_array,
 184                           lockdep_is_held(&cgroup_mutex));
 185        /* free prog array after grace period, since __cgroup_bpf_run_*()
 186         * might be still walking the array
 187         */
 188        bpf_prog_array_free(old_array);
 189}
 190
 191/**
 192 * cgroup_bpf_inherit() - inherit effective programs from parent
 193 * @cgrp: the cgroup to modify
 194 */
 195int cgroup_bpf_inherit(struct cgroup *cgrp)
 196{
  197/* has to use a macro instead of a const int, since the compiler thinks
  198 * that the array below is variable length
 199 */
 200#define NR ARRAY_SIZE(cgrp->bpf.effective)
 201        struct bpf_prog_array *arrays[NR] = {};
 202        int ret, i;
 203
 204        ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
 205                              GFP_KERNEL);
 206        if (ret)
 207                return ret;
 208
 209        for (i = 0; i < NR; i++)
 210                INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
 211
 212        for (i = 0; i < NR; i++)
 213                if (compute_effective_progs(cgrp, i, &arrays[i]))
 214                        goto cleanup;
 215
 216        for (i = 0; i < NR; i++)
 217                activate_effective_progs(cgrp, i, arrays[i]);
 218
 219        return 0;
 220cleanup:
 221        for (i = 0; i < NR; i++)
 222                bpf_prog_array_free(arrays[i]);
 223
 224        percpu_ref_exit(&cgrp->bpf.refcnt);
 225
 226        return -ENOMEM;
 227}
 228
 229static int update_effective_progs(struct cgroup *cgrp,
 230                                  enum bpf_attach_type type)
 231{
 232        struct cgroup_subsys_state *css;
 233        int err;
 234
 235        /* allocate and recompute effective prog arrays */
 236        css_for_each_descendant_pre(css, &cgrp->self) {
 237                struct cgroup *desc = container_of(css, struct cgroup, self);
 238
 239                if (percpu_ref_is_zero(&desc->bpf.refcnt))
 240                        continue;
 241
 242                err = compute_effective_progs(desc, type, &desc->bpf.inactive);
 243                if (err)
 244                        goto cleanup;
 245        }
 246
 247        /* all allocations were successful. Activate all prog arrays */
 248        css_for_each_descendant_pre(css, &cgrp->self) {
 249                struct cgroup *desc = container_of(css, struct cgroup, self);
 250
 251                if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
 252                        if (unlikely(desc->bpf.inactive)) {
 253                                bpf_prog_array_free(desc->bpf.inactive);
 254                                desc->bpf.inactive = NULL;
 255                        }
 256                        continue;
 257                }
 258
 259                activate_effective_progs(desc, type, desc->bpf.inactive);
 260                desc->bpf.inactive = NULL;
 261        }
 262
 263        return 0;
 264
 265cleanup:
 266        /* oom while computing effective. Free all computed effective arrays
 267         * since they were not activated
 268         */
 269        css_for_each_descendant_pre(css, &cgrp->self) {
 270                struct cgroup *desc = container_of(css, struct cgroup, self);
 271
 272                bpf_prog_array_free(desc->bpf.inactive);
 273                desc->bpf.inactive = NULL;
 274        }
 275
 276        return err;
 277}
 278
 279#define BPF_CGROUP_MAX_PROGS 64
 280
 281/**
 282 * __cgroup_bpf_attach() - Attach the program to a cgroup, and
 283 *                         propagate the change to descendants
  284 * @cgrp: The cgroup whose descendants to traverse
 285 * @prog: A program to attach
 286 * @type: Type of attach operation
 287 * @flags: Option flags
 288 *
 289 * Must be called with cgroup_mutex held.
 290 */
 291int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 292                        enum bpf_attach_type type, u32 flags)
 293{
 294        struct list_head *progs = &cgrp->bpf.progs[type];
 295        struct bpf_prog *old_prog = NULL;
 296        struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
 297                *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
 298        enum bpf_cgroup_storage_type stype;
 299        struct bpf_prog_list *pl;
 300        bool pl_was_allocated;
 301        int err;
 302
 303        if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
 304                /* invalid combination */
 305                return -EINVAL;
 306
 307        if (!hierarchy_allows_attach(cgrp, type, flags))
 308                return -EPERM;
 309
 310        if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
 311                /* Disallow attaching non-overridable on top
 312                 * of existing overridable in this cgroup.
 313                 * Disallow attaching multi-prog if overridable or none
 314                 */
 315                return -EPERM;
 316
 317        if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
 318                return -E2BIG;
 319
 320        for_each_cgroup_storage_type(stype) {
 321                storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
 322                if (IS_ERR(storage[stype])) {
 323                        storage[stype] = NULL;
 324                        for_each_cgroup_storage_type(stype)
 325                                bpf_cgroup_storage_free(storage[stype]);
 326                        return -ENOMEM;
 327                }
 328        }
 329
 330        if (flags & BPF_F_ALLOW_MULTI) {
 331                list_for_each_entry(pl, progs, node) {
 332                        if (pl->prog == prog) {
 333                                /* disallow attaching the same prog twice */
 334                                for_each_cgroup_storage_type(stype)
 335                                        bpf_cgroup_storage_free(storage[stype]);
 336                                return -EINVAL;
 337                        }
 338                }
 339
 340                pl = kmalloc(sizeof(*pl), GFP_KERNEL);
 341                if (!pl) {
 342                        for_each_cgroup_storage_type(stype)
 343                                bpf_cgroup_storage_free(storage[stype]);
 344                        return -ENOMEM;
 345                }
 346
 347                pl_was_allocated = true;
 348                pl->prog = prog;
 349                for_each_cgroup_storage_type(stype)
 350                        pl->storage[stype] = storage[stype];
 351                list_add_tail(&pl->node, progs);
 352        } else {
 353                if (list_empty(progs)) {
 354                        pl = kmalloc(sizeof(*pl), GFP_KERNEL);
 355                        if (!pl) {
 356                                for_each_cgroup_storage_type(stype)
 357                                        bpf_cgroup_storage_free(storage[stype]);
 358                                return -ENOMEM;
 359                        }
 360                        pl_was_allocated = true;
 361                        list_add_tail(&pl->node, progs);
 362                } else {
 363                        pl = list_first_entry(progs, typeof(*pl), node);
 364                        old_prog = pl->prog;
 365                        for_each_cgroup_storage_type(stype) {
 366                                old_storage[stype] = pl->storage[stype];
 367                                bpf_cgroup_storage_unlink(old_storage[stype]);
 368                        }
 369                        pl_was_allocated = false;
 370                }
 371                pl->prog = prog;
 372                for_each_cgroup_storage_type(stype)
 373                        pl->storage[stype] = storage[stype];
 374        }
 375
 376        cgrp->bpf.flags[type] = flags;
 377
 378        err = update_effective_progs(cgrp, type);
 379        if (err)
 380                goto cleanup;
 381
 382        static_branch_inc(&cgroup_bpf_enabled_key);
 383        for_each_cgroup_storage_type(stype) {
 384                if (!old_storage[stype])
 385                        continue;
 386                bpf_cgroup_storage_free(old_storage[stype]);
 387        }
 388        if (old_prog) {
 389                bpf_prog_put(old_prog);
 390                static_branch_dec(&cgroup_bpf_enabled_key);
 391        }
 392        for_each_cgroup_storage_type(stype)
 393                bpf_cgroup_storage_link(storage[stype], cgrp, type);
 394        return 0;
 395
 396cleanup:
 397        /* and cleanup the prog list */
 398        pl->prog = old_prog;
 399        for_each_cgroup_storage_type(stype) {
 400                bpf_cgroup_storage_free(pl->storage[stype]);
 401                pl->storage[stype] = old_storage[stype];
 402                bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
 403        }
 404        if (pl_was_allocated) {
 405                list_del(&pl->node);
 406                kfree(pl);
 407        }
 408        return err;
 409}
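
/*
 * Usage sketch (illustrative only): from user space this path is reached
 * through the bpf(2) syscall with the BPF_PROG_ATTACH command.  The helper
 * name and the descriptors below are placeholders.
 *
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <linux/bpf.h>
 *
 *	static int attach_to_cgroup(int cgroup_fd, int prog_fd)
 *	{
 *		union bpf_attr attr = {};
 *
 *		attr.target_fd     = cgroup_fd;		// fd of the cgroup directory
 *		attr.attach_bpf_fd = prog_fd;		// fd from BPF_PROG_LOAD
 *		attr.attach_type   = BPF_CGROUP_INET_EGRESS;
 *		attr.attach_flags  = BPF_F_ALLOW_MULTI;	// co-exist with other progs
 *
 *		return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
 *	}
 */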
 410
 411/**
 412 * __cgroup_bpf_detach() - Detach the program from a cgroup, and
 413 *                         propagate the change to descendants
  414 * @cgrp: The cgroup whose descendants to traverse
 415 * @prog: A program to detach or NULL
 416 * @type: Type of detach operation
 417 *
 418 * Must be called with cgroup_mutex held.
 419 */
 420int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 421                        enum bpf_attach_type type)
 422{
 423        struct list_head *progs = &cgrp->bpf.progs[type];
 424        enum bpf_cgroup_storage_type stype;
 425        u32 flags = cgrp->bpf.flags[type];
 426        struct bpf_prog *old_prog = NULL;
 427        struct bpf_prog_list *pl;
 428        int err;
 429
 430        if (flags & BPF_F_ALLOW_MULTI) {
 431                if (!prog)
 432                        /* to detach MULTI prog the user has to specify valid FD
 433                         * of the program to be detached
 434                         */
 435                        return -EINVAL;
 436        } else {
 437                if (list_empty(progs))
 438                        /* report error when trying to detach and nothing is attached */
 439                        return -ENOENT;
 440        }
 441
 442        if (flags & BPF_F_ALLOW_MULTI) {
 443                /* find the prog and detach it */
 444                list_for_each_entry(pl, progs, node) {
 445                        if (pl->prog != prog)
 446                                continue;
 447                        old_prog = prog;
 448                        /* mark it deleted, so it's ignored while
 449                         * recomputing effective
 450                         */
 451                        pl->prog = NULL;
 452                        break;
 453                }
 454                if (!old_prog)
 455                        return -ENOENT;
 456        } else {
 457                /* to maintain backward compatibility NONE and OVERRIDE cgroups
 458                 * allow detaching with invalid FD (prog==NULL)
 459                 */
 460                pl = list_first_entry(progs, typeof(*pl), node);
 461                old_prog = pl->prog;
 462                pl->prog = NULL;
 463        }
 464
 465        err = update_effective_progs(cgrp, type);
 466        if (err)
 467                goto cleanup;
 468
 469        /* now can actually delete it from this cgroup list */
 470        list_del(&pl->node);
 471        for_each_cgroup_storage_type(stype) {
 472                bpf_cgroup_storage_unlink(pl->storage[stype]);
 473                bpf_cgroup_storage_free(pl->storage[stype]);
 474        }
 475        kfree(pl);
 476        if (list_empty(progs))
 477                /* last program was detached, reset flags to zero */
 478                cgrp->bpf.flags[type] = 0;
 479
 480        bpf_prog_put(old_prog);
 481        static_branch_dec(&cgroup_bpf_enabled_key);
 482        return 0;
 483
 484cleanup:
 485        /* and restore back old_prog */
 486        pl->prog = old_prog;
 487        return err;
 488}
 489
 490/* Must be called with cgroup_mutex held to avoid races. */
 491int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 492                       union bpf_attr __user *uattr)
 493{
 494        __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
 495        enum bpf_attach_type type = attr->query.attach_type;
 496        struct list_head *progs = &cgrp->bpf.progs[type];
 497        u32 flags = cgrp->bpf.flags[type];
 498        struct bpf_prog_array *effective;
 499        int cnt, ret = 0, i;
 500
 501        effective = rcu_dereference_protected(cgrp->bpf.effective[type],
 502                                              lockdep_is_held(&cgroup_mutex));
 503
 504        if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
 505                cnt = bpf_prog_array_length(effective);
 506        else
 507                cnt = prog_list_length(progs);
 508
 509        if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
 510                return -EFAULT;
 511        if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
 512                return -EFAULT;
 513        if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
 514                /* return early if user requested only program count + flags */
 515                return 0;
 516        if (attr->query.prog_cnt < cnt) {
 517                cnt = attr->query.prog_cnt;
 518                ret = -ENOSPC;
 519        }
 520
 521        if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
 522                return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
 523        } else {
 524                struct bpf_prog_list *pl;
 525                u32 id;
 526
 527                i = 0;
 528                list_for_each_entry(pl, progs, node) {
 529                        id = pl->prog->aux->id;
 530                        if (copy_to_user(prog_ids + i, &id, sizeof(id)))
 531                                return -EFAULT;
 532                        if (++i == cnt)
 533                                break;
 534                }
 535        }
 536        return ret;
 537}
 538
 539int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 540                           enum bpf_prog_type ptype, struct bpf_prog *prog)
 541{
 542        struct cgroup *cgrp;
 543        int ret;
 544
 545        cgrp = cgroup_get_from_fd(attr->target_fd);
 546        if (IS_ERR(cgrp))
 547                return PTR_ERR(cgrp);
 548
 549        ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
 550                                attr->attach_flags);
 551        cgroup_put(cgrp);
 552        return ret;
 553}
 554
 555int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
 556{
 557        struct bpf_prog *prog;
 558        struct cgroup *cgrp;
 559        int ret;
 560
 561        cgrp = cgroup_get_from_fd(attr->target_fd);
 562        if (IS_ERR(cgrp))
 563                return PTR_ERR(cgrp);
 564
 565        prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
 566        if (IS_ERR(prog))
 567                prog = NULL;
 568
 569        ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
 570        if (prog)
 571                bpf_prog_put(prog);
 572
 573        cgroup_put(cgrp);
 574        return ret;
 575}
 576
 577int cgroup_bpf_prog_query(const union bpf_attr *attr,
 578                          union bpf_attr __user *uattr)
 579{
 580        struct cgroup *cgrp;
 581        int ret;
 582
 583        cgrp = cgroup_get_from_fd(attr->query.target_fd);
 584        if (IS_ERR(cgrp))
 585                return PTR_ERR(cgrp);
 586
 587        ret = cgroup_bpf_query(cgrp, attr, uattr);
 588
 589        cgroup_put(cgrp);
 590        return ret;
 591}
 592
 593/**
 594 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
 595 * @sk: The socket sending or receiving traffic
 596 * @skb: The skb that is being sent or received
  597 * @type: The type of program to be executed
 598 *
 599 * If no socket is passed, or the socket is not of type INET or INET6,
 600 * this function does nothing and returns 0.
 601 *
 602 * The program type passed in via @type must be suitable for network
 603 * filtering. No further check is performed to assert that.
 604 *
 605 * For egress packets, this function can return:
 606 *   NET_XMIT_SUCCESS    (0)    - continue with packet output
 607 *   NET_XMIT_DROP       (1)    - drop packet and notify TCP to call cwr
 608 *   NET_XMIT_CN         (2)    - continue with packet output and notify TCP
 609 *                                to call cwr
 610 *   -EPERM                     - drop packet
 611 *
 612 * For ingress packets, this function will return -EPERM if any
  613 * attached program was found and it returned != 1 during execution.
 614 * Otherwise 0 is returned.
 615 */
 616int __cgroup_bpf_run_filter_skb(struct sock *sk,
 617                                struct sk_buff *skb,
 618                                enum bpf_attach_type type)
 619{
 620        unsigned int offset = skb->data - skb_network_header(skb);
 621        struct sock *save_sk;
 622        void *saved_data_end;
 623        struct cgroup *cgrp;
 624        int ret;
 625
 626        if (!sk || !sk_fullsock(sk))
 627                return 0;
 628
 629        if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
 630                return 0;
 631
 632        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 633        save_sk = skb->sk;
 634        skb->sk = sk;
 635        __skb_push(skb, offset);
 636
 637        /* compute pointers for the bpf prog */
 638        bpf_compute_and_save_data_end(skb, &saved_data_end);
 639
 640        if (type == BPF_CGROUP_INET_EGRESS) {
 641                ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
 642                        cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
 643        } else {
 644                ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
 645                                          __bpf_prog_run_save_cb);
 646                ret = (ret == 1 ? 0 : -EPERM);
 647        }
 648        bpf_restore_data_end(skb, saved_data_end);
 649        __skb_pull(skb, offset);
 650        skb->sk = save_sk;
 651
 652        return ret;
 653}
 654EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
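
/*
 * Illustrative program for the hook above (libbpf-style section name,
 * assumed): returning 1 lets the packet pass, returning 0 makes this
 * function report a drop.
 *
 *	#include <linux/bpf.h>
 *	#include <bpf/bpf_helpers.h>
 *
 *	SEC("cgroup_skb/egress")
 *	int allow_all(struct __sk_buff *skb)
 *	{
 *		// e.g. account skb->len here, then allow the packet
 *		return 1;
 *	}
 *
 *	char _license[] SEC("license") = "GPL";
 */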
 655
 656/**
 657 * __cgroup_bpf_run_filter_sk() - Run a program on a sock
 658 * @sk: sock structure to manipulate
  659 * @type: The type of program to be executed
  660 *
  661 * The socket passed is expected to be of type INET or INET6.
 662 *
 663 * The program type passed in via @type must be suitable for sock
 664 * filtering. No further check is performed to assert that.
 665 *
  666 * This function will return %-EPERM if an attached program was found
  667 * and it returned != 1 during execution. In all other cases, 0 is returned.
 668 */
 669int __cgroup_bpf_run_filter_sk(struct sock *sk,
 670                               enum bpf_attach_type type)
 671{
 672        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 673        int ret;
 674
 675        ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
 676        return ret == 1 ? 0 : -EPERM;
 677}
 678EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
 679
 680/**
 681 * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
 682 *                                       provided by user sockaddr
 683 * @sk: sock struct that will use sockaddr
 684 * @uaddr: sockaddr struct provided by user
  685 * @type: The type of program to be executed
  686 * @t_ctx: Pointer to attach type specific context
  687 *
  688 * The socket is expected to be of type INET or INET6.
 689 *
 690 * This function will return %-EPERM if an attached program is found and
 691 * returned value != 1 during execution. In all other cases, 0 is returned.
 692 */
 693int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 694                                      struct sockaddr *uaddr,
 695                                      enum bpf_attach_type type,
 696                                      void *t_ctx)
 697{
 698        struct bpf_sock_addr_kern ctx = {
 699                .sk = sk,
 700                .uaddr = uaddr,
 701                .t_ctx = t_ctx,
 702        };
 703        struct sockaddr_storage unspec;
 704        struct cgroup *cgrp;
 705        int ret;
 706
 707        /* Check socket family since not all sockets represent network
 708         * endpoint (e.g. AF_UNIX).
 709         */
 710        if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
 711                return 0;
 712
 713        if (!ctx.uaddr) {
 714                memset(&unspec, 0, sizeof(unspec));
 715                ctx.uaddr = (struct sockaddr *)&unspec;
 716        }
 717
 718        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 719        ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
 720
 721        return ret == 1 ? 0 : -EPERM;
 722}
 723EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
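
/*
 * Illustrative sock_addr program (section name and port are assumptions):
 * rejecting connect(2) to TCP port 25 makes this function return -EPERM.
 *
 *	#include <linux/bpf.h>
 *	#include <bpf/bpf_helpers.h>
 *	#include <bpf/bpf_endian.h>
 *
 *	SEC("cgroup/connect4")
 *	int block_smtp(struct bpf_sock_addr *ctx)
 *	{
 *		if (ctx->user_port == bpf_htons(25))
 *			return 0;	// rejected
 *		return 1;		// allowed
 *	}
 *
 *	char _license[] SEC("license") = "GPL";
 */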
 724
 725/**
 726 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
 727 * @sk: socket to get cgroup from
 728 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
  729 * sk with connection information (IP addresses, etc.). May not contain
  730 * cgroup info if it is a req sock.
  731 * @type: The type of program to be executed
  732 *
  733 * The socket passed is expected to be of type INET or INET6.
 734 *
 735 * The program type passed in via @type must be suitable for sock_ops
 736 * filtering. No further check is performed to assert that.
 737 *
  738 * This function will return %-EPERM if an attached program was found
  739 * and it returned != 1 during execution. In all other cases, 0 is returned.
 740 */
 741int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 742                                     struct bpf_sock_ops_kern *sock_ops,
 743                                     enum bpf_attach_type type)
 744{
 745        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 746        int ret;
 747
 748        ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
 749                                 BPF_PROG_RUN);
 750        return ret == 1 ? 0 : -EPERM;
 751}
 752EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
 753
 754int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 755                                      short access, enum bpf_attach_type type)
 756{
 757        struct cgroup *cgrp;
 758        struct bpf_cgroup_dev_ctx ctx = {
 759                .access_type = (access << 16) | dev_type,
 760                .major = major,
 761                .minor = minor,
 762        };
 763        int allow = 1;
 764
 765        rcu_read_lock();
 766        cgrp = task_dfl_cgroup(current);
 767        allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
 768                                   BPF_PROG_RUN);
 769        rcu_read_unlock();
 770
 771        return !allow;
 772}
 773EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
 774
 775static const struct bpf_func_proto *
 776cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 777{
 778        switch (func_id) {
 779        case BPF_FUNC_map_lookup_elem:
 780                return &bpf_map_lookup_elem_proto;
 781        case BPF_FUNC_map_update_elem:
 782                return &bpf_map_update_elem_proto;
 783        case BPF_FUNC_map_delete_elem:
 784                return &bpf_map_delete_elem_proto;
 785        case BPF_FUNC_map_push_elem:
 786                return &bpf_map_push_elem_proto;
 787        case BPF_FUNC_map_pop_elem:
 788                return &bpf_map_pop_elem_proto;
 789        case BPF_FUNC_map_peek_elem:
 790                return &bpf_map_peek_elem_proto;
 791        case BPF_FUNC_get_current_uid_gid:
 792                return &bpf_get_current_uid_gid_proto;
 793        case BPF_FUNC_get_local_storage:
 794                return &bpf_get_local_storage_proto;
 795        case BPF_FUNC_get_current_cgroup_id:
 796                return &bpf_get_current_cgroup_id_proto;
 797        case BPF_FUNC_trace_printk:
 798                if (capable(CAP_SYS_ADMIN))
 799                        return bpf_get_trace_printk_proto();
 800                /* fall through */
 801        default:
 802                return NULL;
 803        }
 804}
 805
 806static const struct bpf_func_proto *
 807cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 808{
 809        return cgroup_base_func_proto(func_id, prog);
 810}
 811
 812static bool cgroup_dev_is_valid_access(int off, int size,
 813                                       enum bpf_access_type type,
 814                                       const struct bpf_prog *prog,
 815                                       struct bpf_insn_access_aux *info)
 816{
 817        const int size_default = sizeof(__u32);
 818
 819        if (type == BPF_WRITE)
 820                return false;
 821
 822        if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
 823                return false;
 824        /* The verifier guarantees that size > 0. */
 825        if (off % size != 0)
 826                return false;
 827
 828        switch (off) {
 829        case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
 830                bpf_ctx_record_field_size(info, size_default);
 831                if (!bpf_ctx_narrow_access_ok(off, size, size_default))
 832                        return false;
 833                break;
 834        default:
 835                if (size != size_default)
 836                        return false;
 837        }
 838
 839        return true;
 840}
 841
 842const struct bpf_prog_ops cg_dev_prog_ops = {
 843};
 844
 845const struct bpf_verifier_ops cg_dev_verifier_ops = {
 846        .get_func_proto         = cgroup_dev_func_proto,
 847        .is_valid_access        = cgroup_dev_is_valid_access,
 848};
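
/*
 * Illustrative device program (section name assumed, constants from
 * include/uapi/linux/bpf.h): access_type packs the access bits in the
 * upper 16 bits and the device type in the lower 16 bits, as built in
 * __cgroup_bpf_check_dev_permission().  This sketch denies mknod of block
 * devices and allows everything else.
 *
 *	#include <linux/bpf.h>
 *	#include <bpf/bpf_helpers.h>
 *
 *	SEC("cgroup/dev")
 *	int deny_blk_mknod(struct bpf_cgroup_dev_ctx *ctx)
 *	{
 *		int type   = ctx->access_type & 0xFFFF;
 *		int access = ctx->access_type >> 16;
 *
 *		if (type == BPF_DEVCG_DEV_BLOCK && (access & BPF_DEVCG_ACC_MKNOD))
 *			return 0;	// denied
 *		return 1;		// allowed
 *	}
 *
 *	char _license[] SEC("license") = "GPL";
 */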
 849
 850/**
 851 * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
 852 *
 853 * @head: sysctl table header
 854 * @table: sysctl table
 855 * @write: sysctl is being read (= 0) or written (= 1)
 856 * @buf: pointer to buffer passed by user space
 857 * @pcount: value-result argument: value is size of buffer pointed to by @buf,
 858 *      result is size of @new_buf if program set new value, initial value
 859 *      otherwise
 860 * @ppos: value-result argument: value is position at which read from or write
 861 *      to sysctl is happening, result is new position if program overrode it,
 862 *      initial value otherwise
 863 * @new_buf: pointer to pointer to new buffer that will be allocated if program
 864 *      overrides new value provided by user space on sysctl write
  865 *      NOTE: it's the caller's responsibility to free *new_buf if it was set
 866 * @type: type of program to be executed
 867 *
 868 * Program is run when sysctl is being accessed, either read or written, and
 869 * can allow or deny such access.
 870 *
 871 * This function will return %-EPERM if an attached program is found and
 872 * returned value != 1 during execution. In all other cases 0 is returned.
 873 */
 874int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 875                                   struct ctl_table *table, int write,
 876                                   void __user *buf, size_t *pcount,
 877                                   loff_t *ppos, void **new_buf,
 878                                   enum bpf_attach_type type)
 879{
 880        struct bpf_sysctl_kern ctx = {
 881                .head = head,
 882                .table = table,
 883                .write = write,
 884                .ppos = ppos,
 885                .cur_val = NULL,
 886                .cur_len = PAGE_SIZE,
 887                .new_val = NULL,
 888                .new_len = 0,
 889                .new_updated = 0,
 890        };
 891        struct cgroup *cgrp;
 892        int ret;
 893
 894        ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
 895        if (ctx.cur_val) {
 896                mm_segment_t old_fs;
 897                loff_t pos = 0;
 898
 899                old_fs = get_fs();
 900                set_fs(KERNEL_DS);
 901                if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
 902                                        &ctx.cur_len, &pos)) {
 903                        /* Let BPF program decide how to proceed. */
 904                        ctx.cur_len = 0;
 905                }
 906                set_fs(old_fs);
 907        } else {
 908                /* Let BPF program decide how to proceed. */
 909                ctx.cur_len = 0;
 910        }
 911
 912        if (write && buf && *pcount) {
  913        /* The BPF program should be able to override the new value with
  914         * a buffer bigger than the one provided by user space.
 915                 */
 916                ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
 917                ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
 918                if (!ctx.new_val ||
 919                    copy_from_user(ctx.new_val, buf, ctx.new_len))
 920                        /* Let BPF program decide how to proceed. */
 921                        ctx.new_len = 0;
 922        }
 923
 924        rcu_read_lock();
 925        cgrp = task_dfl_cgroup(current);
 926        ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
 927        rcu_read_unlock();
 928
 929        kfree(ctx.cur_val);
 930
 931        if (ret == 1 && ctx.new_updated) {
 932                *new_buf = ctx.new_val;
 933                *pcount = ctx.new_len;
 934        } else {
 935                kfree(ctx.new_val);
 936        }
 937
 938        return ret == 1 ? 0 : -EPERM;
 939}
 940EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
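
/*
 * Illustrative sysctl program (section name assumed): it allows reads and
 * rejects every sysctl write; returning 0 is what makes the function above
 * return -EPERM.
 *
 *	#include <linux/bpf.h>
 *	#include <bpf/bpf_helpers.h>
 *
 *	SEC("cgroup/sysctl")
 *	int sysctl_read_only(struct bpf_sysctl *ctx)
 *	{
 *		return ctx->write ? 0 : 1;
 *	}
 *
 *	char _license[] SEC("license") = "GPL";
 */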
 941
 942#ifdef CONFIG_NET
 943static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
 944                                             enum bpf_attach_type attach_type)
 945{
 946        struct bpf_prog_array *prog_array;
 947        bool empty;
 948
 949        rcu_read_lock();
 950        prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
 951        empty = bpf_prog_array_is_empty(prog_array);
 952        rcu_read_unlock();
 953
 954        return empty;
 955}
 956
 957static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
 958{
 959        if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
 960                return -EINVAL;
 961
 962        ctx->optval = kzalloc(max_optlen, GFP_USER);
 963        if (!ctx->optval)
 964                return -ENOMEM;
 965
 966        ctx->optval_end = ctx->optval + max_optlen;
 967
 968        return 0;
 969}
 970
 971static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
 972{
 973        kfree(ctx->optval);
 974}
 975
 976int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 977                                       int *optname, char __user *optval,
 978                                       int *optlen, char **kernel_optval)
 979{
 980        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 981        struct bpf_sockopt_kern ctx = {
 982                .sk = sk,
 983                .level = *level,
 984                .optname = *optname,
 985        };
 986        int ret, max_optlen;
 987
 988        /* Opportunistic check to see whether we have any BPF program
 989         * attached to the hook so we don't waste time allocating
 990         * memory and locking the socket.
 991         */
 992        if (!cgroup_bpf_enabled ||
 993            __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
 994                return 0;
 995
 996        /* Allocate a bit more than the initial user buffer for
 997         * BPF program. The canonical use case is overriding
 998         * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
 999         */
1000        max_optlen = max_t(int, 16, *optlen);
1001
1002        ret = sockopt_alloc_buf(&ctx, max_optlen);
1003        if (ret)
1004                return ret;
1005
1006        ctx.optlen = *optlen;
1007
1008        if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
1009                ret = -EFAULT;
1010                goto out;
1011        }
1012
1013        lock_sock(sk);
1014        ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
1015                                 &ctx, BPF_PROG_RUN);
1016        release_sock(sk);
1017
1018        if (!ret) {
1019                ret = -EPERM;
1020                goto out;
1021        }
1022
1023        if (ctx.optlen == -1) {
1024                /* optlen set to -1, bypass kernel */
1025                ret = 1;
1026        } else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
1027                /* optlen is out of bounds */
1028                ret = -EFAULT;
1029        } else {
1030                /* optlen within bounds, run kernel handler */
1031                ret = 0;
1032
1033                /* export any potential modifications */
1034                *level = ctx.level;
1035                *optname = ctx.optname;
1036                *optlen = ctx.optlen;
1037                *kernel_optval = ctx.optval;
1038        }
1039
1040out:
1041        if (ret)
1042                sockopt_free_buf(&ctx);
1043        return ret;
1044}
1045EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
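
/*
 * Illustrative setsockopt program (section name and option are assumed):
 * returning 0 rejects the setsockopt() call with -EPERM, returning 1 lets
 * the kernel handler run on the (possibly modified) context.
 *
 *	#include <linux/bpf.h>
 *	#include <bpf/bpf_helpers.h>
 *	#include <sys/socket.h>
 *
 *	SEC("cgroup/setsockopt")
 *	int deny_so_mark(struct bpf_sockopt *ctx)
 *	{
 *		if (ctx->level == SOL_SOCKET && ctx->optname == SO_MARK)
 *			return 0;	// rejected
 *		return 1;		// run the kernel setsockopt handler
 *	}
 *
 *	char _license[] SEC("license") = "GPL";
 */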
1046
1047int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
1048                                       int optname, char __user *optval,
1049                                       int __user *optlen, int max_optlen,
1050                                       int retval)
1051{
1052        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1053        struct bpf_sockopt_kern ctx = {
1054                .sk = sk,
1055                .level = level,
1056                .optname = optname,
1057                .retval = retval,
1058        };
1059        int ret;
1060
1061        /* Opportunistic check to see whether we have any BPF program
1062         * attached to the hook so we don't waste time allocating
1063         * memory and locking the socket.
1064         */
1065        if (!cgroup_bpf_enabled ||
1066            __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
1067                return retval;
1068
1069        ret = sockopt_alloc_buf(&ctx, max_optlen);
1070        if (ret)
1071                return ret;
1072
1073        ctx.optlen = max_optlen;
1074
1075        if (!retval) {
1076                /* If kernel getsockopt finished successfully,
1077                 * copy whatever was returned to the user back
1078                 * into our temporary buffer. Set optlen to the
1079                 * one that kernel returned as well to let
1080                 * BPF programs inspect the value.
1081                 */
1082
1083                if (get_user(ctx.optlen, optlen)) {
1084                        ret = -EFAULT;
1085                        goto out;
1086                }
1087
1088                if (ctx.optlen > max_optlen)
1089                        ctx.optlen = max_optlen;
1090
1091                if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
1092                        ret = -EFAULT;
1093                        goto out;
1094                }
1095        }
1096
1097        lock_sock(sk);
1098        ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
1099                                 &ctx, BPF_PROG_RUN);
1100        release_sock(sk);
1101
1102        if (!ret) {
1103                ret = -EPERM;
1104                goto out;
1105        }
1106
1107        if (ctx.optlen > max_optlen) {
1108                ret = -EFAULT;
1109                goto out;
1110        }
1111
 1112        /* BPF programs are only allowed to set retval to 0, not some
1113         * arbitrary value.
1114         */
1115        if (ctx.retval != 0 && ctx.retval != retval) {
1116                ret = -EFAULT;
1117                goto out;
1118        }
1119
1120        if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
1121            put_user(ctx.optlen, optlen)) {
1122                ret = -EFAULT;
1123                goto out;
1124        }
1125
1126        ret = ctx.retval;
1127
1128out:
1129        sockopt_free_buf(&ctx);
1130        return ret;
1131}
1132EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
1133#endif
1134
1135static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
1136                              size_t *lenp)
1137{
1138        ssize_t tmp_ret = 0, ret;
1139
1140        if (dir->header.parent) {
1141                tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
1142                if (tmp_ret < 0)
1143                        return tmp_ret;
1144        }
1145
1146        ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
1147        if (ret < 0)
1148                return ret;
1149        *bufp += ret;
1150        *lenp -= ret;
1151        ret += tmp_ret;
1152
1153        /* Avoid leading slash. */
1154        if (!ret)
1155                return ret;
1156
1157        tmp_ret = strscpy(*bufp, "/", *lenp);
1158        if (tmp_ret < 0)
1159                return tmp_ret;
1160        *bufp += tmp_ret;
1161        *lenp -= tmp_ret;
1162
1163        return ret + tmp_ret;
1164}
1165
1166BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
1167           size_t, buf_len, u64, flags)
1168{
1169        ssize_t tmp_ret = 0, ret;
1170
1171        if (!buf)
1172                return -EINVAL;
1173
1174        if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
1175                if (!ctx->head)
1176                        return -EINVAL;
1177                tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
1178                if (tmp_ret < 0)
1179                        return tmp_ret;
1180        }
1181
1182        ret = strscpy(buf, ctx->table->procname, buf_len);
1183
1184        return ret < 0 ? ret : tmp_ret + ret;
1185}
1186
1187static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
1188        .func           = bpf_sysctl_get_name,
1189        .gpl_only       = false,
1190        .ret_type       = RET_INTEGER,
1191        .arg1_type      = ARG_PTR_TO_CTX,
1192        .arg2_type      = ARG_PTR_TO_MEM,
1193        .arg3_type      = ARG_CONST_SIZE,
1194        .arg4_type      = ARG_ANYTHING,
1195};
1196
1197static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
1198                             size_t src_len)
1199{
1200        if (!dst)
1201                return -EINVAL;
1202
1203        if (!dst_len)
1204                return -E2BIG;
1205
1206        if (!src || !src_len) {
1207                memset(dst, 0, dst_len);
1208                return -EINVAL;
1209        }
1210
1211        memcpy(dst, src, min(dst_len, src_len));
1212
1213        if (dst_len > src_len) {
1214                memset(dst + src_len, '\0', dst_len - src_len);
1215                return src_len;
1216        }
1217
1218        dst[dst_len - 1] = '\0';
1219
1220        return -E2BIG;
1221}
1222
1223BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
1224           char *, buf, size_t, buf_len)
1225{
1226        return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
1227}
1228
1229static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
1230        .func           = bpf_sysctl_get_current_value,
1231        .gpl_only       = false,
1232        .ret_type       = RET_INTEGER,
1233        .arg1_type      = ARG_PTR_TO_CTX,
1234        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
1235        .arg3_type      = ARG_CONST_SIZE,
1236};
1237
1238BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
1239           size_t, buf_len)
1240{
1241        if (!ctx->write) {
1242                if (buf && buf_len)
1243                        memset(buf, '\0', buf_len);
1244                return -EINVAL;
1245        }
1246        return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
1247}
1248
1249static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
1250        .func           = bpf_sysctl_get_new_value,
1251        .gpl_only       = false,
1252        .ret_type       = RET_INTEGER,
1253        .arg1_type      = ARG_PTR_TO_CTX,
1254        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
1255        .arg3_type      = ARG_CONST_SIZE,
1256};
1257
1258BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
1259           const char *, buf, size_t, buf_len)
1260{
1261        if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
1262                return -EINVAL;
1263
1264        if (buf_len > PAGE_SIZE - 1)
1265                return -E2BIG;
1266
1267        memcpy(ctx->new_val, buf, buf_len);
1268        ctx->new_len = buf_len;
1269        ctx->new_updated = 1;
1270
1271        return 0;
1272}
1273
1274static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
1275        .func           = bpf_sysctl_set_new_value,
1276        .gpl_only       = false,
1277        .ret_type       = RET_INTEGER,
1278        .arg1_type      = ARG_PTR_TO_CTX,
1279        .arg2_type      = ARG_PTR_TO_MEM,
1280        .arg3_type      = ARG_CONST_SIZE,
1281};
1282
1283static const struct bpf_func_proto *
1284sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1285{
1286        switch (func_id) {
1287        case BPF_FUNC_strtol:
1288                return &bpf_strtol_proto;
1289        case BPF_FUNC_strtoul:
1290                return &bpf_strtoul_proto;
1291        case BPF_FUNC_sysctl_get_name:
1292                return &bpf_sysctl_get_name_proto;
1293        case BPF_FUNC_sysctl_get_current_value:
1294                return &bpf_sysctl_get_current_value_proto;
1295        case BPF_FUNC_sysctl_get_new_value:
1296                return &bpf_sysctl_get_new_value_proto;
1297        case BPF_FUNC_sysctl_set_new_value:
1298                return &bpf_sysctl_set_new_value_proto;
1299        default:
1300                return cgroup_base_func_proto(func_id, prog);
1301        }
1302}
1303
1304static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
1305                                   const struct bpf_prog *prog,
1306                                   struct bpf_insn_access_aux *info)
1307{
1308        const int size_default = sizeof(__u32);
1309
1310        if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
1311                return false;
1312
1313        switch (off) {
1314        case bpf_ctx_range(struct bpf_sysctl, write):
1315                if (type != BPF_READ)
1316                        return false;
1317                bpf_ctx_record_field_size(info, size_default);
1318                return bpf_ctx_narrow_access_ok(off, size, size_default);
1319        case bpf_ctx_range(struct bpf_sysctl, file_pos):
1320                if (type == BPF_READ) {
1321                        bpf_ctx_record_field_size(info, size_default);
1322                        return bpf_ctx_narrow_access_ok(off, size, size_default);
1323                } else {
1324                        return size == size_default;
1325                }
1326        default:
1327                return false;
1328        }
1329}
1330
1331static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
1332                                     const struct bpf_insn *si,
1333                                     struct bpf_insn *insn_buf,
1334                                     struct bpf_prog *prog, u32 *target_size)
1335{
1336        struct bpf_insn *insn = insn_buf;
1337        u32 read_size;
1338
1339        switch (si->off) {
1340        case offsetof(struct bpf_sysctl, write):
1341                *insn++ = BPF_LDX_MEM(
1342                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
1343                        bpf_target_off(struct bpf_sysctl_kern, write,
1344                                       FIELD_SIZEOF(struct bpf_sysctl_kern,
1345                                                    write),
1346                                       target_size));
1347                break;
1348        case offsetof(struct bpf_sysctl, file_pos):
1349                /* ppos is a pointer so it should be accessed via indirect
 1350                 * loads and stores. Also, for stores an additional temporary
1351                 * register is used since neither src_reg nor dst_reg can be
1352                 * overridden.
1353                 */
1354                if (type == BPF_WRITE) {
1355                        int treg = BPF_REG_9;
1356
1357                        if (si->src_reg == treg || si->dst_reg == treg)
1358                                --treg;
1359                        if (si->src_reg == treg || si->dst_reg == treg)
1360                                --treg;
1361                        *insn++ = BPF_STX_MEM(
1362                                BPF_DW, si->dst_reg, treg,
1363                                offsetof(struct bpf_sysctl_kern, tmp_reg));
1364                        *insn++ = BPF_LDX_MEM(
1365                                BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
1366                                treg, si->dst_reg,
1367                                offsetof(struct bpf_sysctl_kern, ppos));
1368                        *insn++ = BPF_STX_MEM(
1369                                BPF_SIZEOF(u32), treg, si->src_reg,
1370                                bpf_ctx_narrow_access_offset(
1371                                        0, sizeof(u32), sizeof(loff_t)));
1372                        *insn++ = BPF_LDX_MEM(
1373                                BPF_DW, treg, si->dst_reg,
1374                                offsetof(struct bpf_sysctl_kern, tmp_reg));
1375                } else {
1376                        *insn++ = BPF_LDX_MEM(
1377                                BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
1378                                si->dst_reg, si->src_reg,
1379                                offsetof(struct bpf_sysctl_kern, ppos));
1380                        read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
1381                        *insn++ = BPF_LDX_MEM(
1382                                BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
1383                                bpf_ctx_narrow_access_offset(
1384                                        0, read_size, sizeof(loff_t)));
1385                }
1386                *target_size = sizeof(u32);
1387                break;
1388        }
1389
1390        return insn - insn_buf;
1391}
1392
1393const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
1394        .get_func_proto         = sysctl_func_proto,
1395        .is_valid_access        = sysctl_is_valid_access,
1396        .convert_ctx_access     = sysctl_convert_ctx_access,
1397};
1398
1399const struct bpf_prog_ops cg_sysctl_prog_ops = {
1400};
1401
1402static const struct bpf_func_proto *
1403cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1404{
1405        switch (func_id) {
1406#ifdef CONFIG_NET
1407        case BPF_FUNC_sk_storage_get:
1408                return &bpf_sk_storage_get_proto;
1409        case BPF_FUNC_sk_storage_delete:
1410                return &bpf_sk_storage_delete_proto;
1411#endif
1412#ifdef CONFIG_INET
1413        case BPF_FUNC_tcp_sock:
1414                return &bpf_tcp_sock_proto;
1415#endif
1416        default:
1417                return cgroup_base_func_proto(func_id, prog);
1418        }
1419}
1420
1421static bool cg_sockopt_is_valid_access(int off, int size,
1422                                       enum bpf_access_type type,
1423                                       const struct bpf_prog *prog,
1424                                       struct bpf_insn_access_aux *info)
1425{
1426        const int size_default = sizeof(__u32);
1427
1428        if (off < 0 || off >= sizeof(struct bpf_sockopt))
1429                return false;
1430
1431        if (off % size != 0)
1432                return false;
1433
1434        if (type == BPF_WRITE) {
1435                switch (off) {
1436                case offsetof(struct bpf_sockopt, retval):
1437                        if (size != size_default)
1438                                return false;
1439                        return prog->expected_attach_type ==
1440                                BPF_CGROUP_GETSOCKOPT;
1441                case offsetof(struct bpf_sockopt, optname):
1442                        /* fallthrough */
1443                case offsetof(struct bpf_sockopt, level):
1444                        if (size != size_default)
1445                                return false;
1446                        return prog->expected_attach_type ==
1447                                BPF_CGROUP_SETSOCKOPT;
1448                case offsetof(struct bpf_sockopt, optlen):
1449                        return size == size_default;
1450                default:
1451                        return false;
1452                }
1453        }
1454
1455        switch (off) {
1456        case offsetof(struct bpf_sockopt, sk):
1457                if (size != sizeof(__u64))
1458                        return false;
1459                info->reg_type = PTR_TO_SOCKET;
1460                break;
1461        case offsetof(struct bpf_sockopt, optval):
1462                if (size != sizeof(__u64))
1463                        return false;
1464                info->reg_type = PTR_TO_PACKET;
1465                break;
1466        case offsetof(struct bpf_sockopt, optval_end):
1467                if (size != sizeof(__u64))
1468                        return false;
1469                info->reg_type = PTR_TO_PACKET_END;
1470                break;
1471        case offsetof(struct bpf_sockopt, retval):
1472                if (size != size_default)
1473                        return false;
1474                return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
1475        default:
1476                if (size != size_default)
1477                        return false;
1478                break;
1479        }
1480        return true;
1481}
1482
1483#define CG_SOCKOPT_ACCESS_FIELD(T, F)                                   \
1484        T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),                 \
1485          si->dst_reg, si->src_reg,                                     \
1486          offsetof(struct bpf_sockopt_kern, F))
1487
1488static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
1489                                         const struct bpf_insn *si,
1490                                         struct bpf_insn *insn_buf,
1491                                         struct bpf_prog *prog,
1492                                         u32 *target_size)
1493{
1494        struct bpf_insn *insn = insn_buf;
1495
1496        switch (si->off) {
1497        case offsetof(struct bpf_sockopt, sk):
1498                *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
1499                break;
1500        case offsetof(struct bpf_sockopt, level):
1501                if (type == BPF_WRITE)
1502                        *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
1503                else
1504                        *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
1505                break;
1506        case offsetof(struct bpf_sockopt, optname):
1507                if (type == BPF_WRITE)
1508                        *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
1509                else
1510                        *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
1511                break;
1512        case offsetof(struct bpf_sockopt, optlen):
1513                if (type == BPF_WRITE)
1514                        *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
1515                else
1516                        *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
1517                break;
1518        case offsetof(struct bpf_sockopt, retval):
1519                if (type == BPF_WRITE)
1520                        *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
1521                else
1522                        *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
1523                break;
1524        case offsetof(struct bpf_sockopt, optval):
1525                *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
1526                break;
1527        case offsetof(struct bpf_sockopt, optval_end):
1528                *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
1529                break;
1530        }
1531
1532        return insn - insn_buf;
1533}
1534
1535static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
1536                                   bool direct_write,
1537                                   const struct bpf_prog *prog)
1538{
1539        /* Nothing to do for sockopt argument. The data is kzalloc'ated.
1540         */
1541        return 0;
1542}
1543
1544const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
1545        .get_func_proto         = cg_sockopt_func_proto,
1546        .is_valid_access        = cg_sockopt_is_valid_access,
1547        .convert_ctx_access     = cg_sockopt_convert_ctx_access,
1548        .gen_prologue           = cg_sockopt_get_prologue,
1549};
1550
1551const struct bpf_prog_ops cg_sockopt_prog_ops = {
1552};
1553