linux/kernel/bpf/trampoline.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/* Copyright (c) 2019 Facebook */
   3#include <linux/hash.h>
   4#include <linux/bpf.h>
   5#include <linux/filter.h>
   6#include <linux/ftrace.h>
   7#include <linux/rbtree_latch.h>
   8#include <linux/perf_event.h>
   9#include <linux/btf.h>
  10#include <linux/rcupdate_trace.h>
  11#include <linux/rcupdate_wait.h>
  12#include <linux/module.h>
  13
  14/* dummy _ops. The verifier will operate on target program's ops. */
  15const struct bpf_verifier_ops bpf_extension_verifier_ops = {
  16};
  17const struct bpf_prog_ops bpf_extension_prog_ops = {
  18};
  19
  20/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
  21#define TRAMPOLINE_HASH_BITS 10
  22#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
  23
  24static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
  25
  26/* serializes access to trampoline_table */
  27static DEFINE_MUTEX(trampoline_mutex);
  28
  29void *bpf_jit_alloc_exec_page(void)
  30{
  31        void *image;
  32
  33        image = bpf_jit_alloc_exec(PAGE_SIZE);
  34        if (!image)
  35                return NULL;
  36
  37        set_vm_flush_reset_perms(image);
   38        /* Keep the image writable. The alternative is to keep flipping ro/rw
   39         * every time a new program is attached or detached.
  40         */
  41        set_memory_x((long)image, 1);
  42        return image;
  43}
  44
  45void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
  46{
  47        ksym->start = (unsigned long) data;
  48        ksym->end = ksym->start + PAGE_SIZE;
  49        bpf_ksym_add(ksym);
  50        perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
  51                           PAGE_SIZE, false, ksym->name);
  52}
  53
  54void bpf_image_ksym_del(struct bpf_ksym *ksym)
  55{
  56        bpf_ksym_del(ksym);
  57        perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
  58                           PAGE_SIZE, true, ksym->name);
  59}
  60
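     /* Find the trampoline for @key in trampoline_table or allocate,
      * initialize and hash a new one. A reference is taken in either case;
      * callers drop it with bpf_trampoline_put(). Returns NULL on
      * allocation failure.
      */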
  61static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
  62{
  63        struct bpf_trampoline *tr;
  64        struct hlist_head *head;
  65        int i;
  66
  67        mutex_lock(&trampoline_mutex);
  68        head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
  69        hlist_for_each_entry(tr, head, hlist) {
  70                if (tr->key == key) {
  71                        refcount_inc(&tr->refcnt);
  72                        goto out;
  73                }
  74        }
  75        tr = kzalloc(sizeof(*tr), GFP_KERNEL);
  76        if (!tr)
  77                goto out;
  78
  79        tr->key = key;
  80        INIT_HLIST_NODE(&tr->hlist);
  81        hlist_add_head(&tr->hlist, head);
  82        refcount_set(&tr->refcnt, 1);
  83        mutex_init(&tr->mutex);
  84        for (i = 0; i < BPF_TRAMP_MAX; i++)
  85                INIT_HLIST_HEAD(&tr->progs_hlist[i]);
  86out:
  87        mutex_unlock(&trampoline_mutex);
  88        return tr;
  89}
  90
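     /* If the attach address lives in a module, pin that module so it
      * cannot be unloaded while the trampoline is patched into it.
      * tr->mod stays NULL for functions built into vmlinux.
      */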
  91static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
  92{
  93        struct module *mod;
  94        int err = 0;
  95
  96        preempt_disable();
  97        mod = __module_text_address((unsigned long) tr->func.addr);
  98        if (mod && !try_module_get(mod))
  99                err = -ENOENT;
 100        preempt_enable();
 101        tr->mod = mod;
 102        return err;
 103}
 104
 105static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
 106{
 107        module_put(tr->mod);
 108        tr->mod = NULL;
 109}
 110
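     /* Returns 1 if @ip is an ftrace patch site (attach via the ftrace
      * direct API), 0 if it is not (patch the call site with
      * bpf_arch_text_poke), or -EFAULT if ftrace reports an address that
      * does not match @ip.
      */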
 111static int is_ftrace_location(void *ip)
 112{
 113        long addr;
 114
 115        addr = ftrace_location((long)ip);
 116        if (!addr)
 117                return 0;
 118        if (WARN_ON_ONCE(addr != (long)ip))
 119                return -EFAULT;
 120        return 1;
 121}
 122
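     /* Detach the current trampoline image from the kernel function,
      * either via the ftrace direct API or by patching the call back out,
      * and drop the module reference on success.
      */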
 123static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 124{
 125        void *ip = tr->func.addr;
 126        int ret;
 127
 128        if (tr->func.ftrace_managed)
 129                ret = unregister_ftrace_direct((long)ip, (long)old_addr);
 130        else
 131                ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
 132
 133        if (!ret)
 134                bpf_trampoline_module_put(tr);
 135        return ret;
 136}
 137
 138static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
 139{
 140        void *ip = tr->func.addr;
 141        int ret;
 142
 143        if (tr->func.ftrace_managed)
 144                ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
 145        else
 146                ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
 147        return ret;
 148}
 149
 150/* first time registering */
 151static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 152{
 153        void *ip = tr->func.addr;
 154        int ret;
 155
 156        ret = is_ftrace_location(ip);
 157        if (ret < 0)
 158                return ret;
 159        tr->func.ftrace_managed = ret;
 160
 161        if (bpf_trampoline_module_get(tr))
 162                return -ENOENT;
 163
 164        if (tr->func.ftrace_managed)
 165                ret = register_ftrace_direct((long)ip, (long)new_addr);
 166        else
 167                ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
 168
 169        if (ret)
 170                bpf_trampoline_module_put(tr);
 171        return ret;
 172}
 173
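     /* Snapshot the programs attached to @tr into a freshly allocated
      * array indexed by attach kind and report the overall count via
      * @total. Called with tr->mutex held; the caller must kfree() the
      * result.
      */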
 174static struct bpf_tramp_progs *
 175bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
 176{
 177        const struct bpf_prog_aux *aux;
 178        struct bpf_tramp_progs *tprogs;
 179        struct bpf_prog **progs;
 180        int kind;
 181
 182        *total = 0;
 183        tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
 184        if (!tprogs)
 185                return ERR_PTR(-ENOMEM);
 186
 187        for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
 188                tprogs[kind].nr_progs = tr->progs_cnt[kind];
 189                *total += tr->progs_cnt[kind];
 190                progs = tprogs[kind].progs;
 191
 192                hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist)
 193                        *progs++ = aux->prog;
 194        }
 195        return tprogs;
 196}
 197
 198static void __bpf_tramp_image_put_deferred(struct work_struct *work)
 199{
 200        struct bpf_tramp_image *im;
 201
 202        im = container_of(work, struct bpf_tramp_image, work);
 203        bpf_image_ksym_del(&im->ksym);
 204        bpf_jit_free_exec(im->image);
 205        bpf_jit_uncharge_modmem(1);
 206        percpu_ref_exit(&im->pcref);
 207        kfree_rcu(im, rcu);
 208}
 209
 210/* callback, fexit step 3 or fentry step 2 */
 211static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu)
 212{
 213        struct bpf_tramp_image *im;
 214
 215        im = container_of(rcu, struct bpf_tramp_image, rcu);
 216        INIT_WORK(&im->work, __bpf_tramp_image_put_deferred);
 217        schedule_work(&im->work);
 218}
 219
 220/* callback, fexit step 2. Called after percpu_ref_kill confirms. */
 221static void __bpf_tramp_image_release(struct percpu_ref *pcref)
 222{
 223        struct bpf_tramp_image *im;
 224
 225        im = container_of(pcref, struct bpf_tramp_image, pcref);
 226        call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
 227}
 228
 229/* callback, fexit or fentry step 1 */
 230static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu)
 231{
 232        struct bpf_tramp_image *im;
 233
 234        im = container_of(rcu, struct bpf_tramp_image, rcu);
 235        if (im->ip_after_call)
 236                /* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */
 237                percpu_ref_kill(&im->pcref);
 238        else
 239                /* the case of fentry trampoline */
 240                call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
 241}
 242
 243static void bpf_tramp_image_put(struct bpf_tramp_image *im)
 244{
  245        /* The trampoline image that calls the original function relies on:
 246         * rcu_read_lock_trace to protect sleepable bpf progs
 247         * rcu_read_lock to protect normal bpf progs
 248         * percpu_ref to protect trampoline itself
 249         * rcu tasks to protect trampoline asm not covered by percpu_ref
 250         * (which are few asm insns before __bpf_tramp_enter and
 251         *  after __bpf_tramp_exit)
 252         *
 253         * The trampoline is unreachable before bpf_tramp_image_put().
 254         *
 255         * First, patch the trampoline to avoid calling into fexit progs.
 256         * The progs will be freed even if the original function is still
 257         * executing or sleeping.
 258         * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait on
 259         * first few asm instructions to execute and call into
 260         * __bpf_tramp_enter->percpu_ref_get.
 261         * Then use percpu_ref_kill to wait for the trampoline and the original
 262         * function to finish.
 263         * Then use call_rcu_tasks() to make sure few asm insns in
 264         * the trampoline epilogue are done as well.
 265         *
 266         * In !PREEMPT case the task that got interrupted in the first asm
 267         * insns won't go through an RCU quiescent state which the
 268         * percpu_ref_kill will be waiting for. Hence the first
 269         * call_rcu_tasks() is not necessary.
 270         */
 271        if (im->ip_after_call) {
 272                int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
 273                                             NULL, im->ip_epilogue);
 274                WARN_ON(err);
 275                if (IS_ENABLED(CONFIG_PREEMPTION))
 276                        call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
 277                else
 278                        percpu_ref_kill(&im->pcref);
 279                return;
 280        }
 281
  282        /* A trampoline without fexit and fmod_ret progs doesn't call the original
  283         * function and doesn't use percpu_ref.
 284         * Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
 285         * Then use call_rcu_tasks() to wait for the rest of trampoline asm
 286         * and normal progs.
 287         */
 288        call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
 289}
 290
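     /* Allocate one bpf_tramp_image: an executable page for the generated
      * code, a percpu_ref tracking whether the image is still in use, and
      * a kallsyms entry named bpf_trampoline_<key>_<idx>.
      */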
 291static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
 292{
 293        struct bpf_tramp_image *im;
 294        struct bpf_ksym *ksym;
 295        void *image;
 296        int err = -ENOMEM;
 297
 298        im = kzalloc(sizeof(*im), GFP_KERNEL);
 299        if (!im)
 300                goto out;
 301
 302        err = bpf_jit_charge_modmem(1);
 303        if (err)
 304                goto out_free_im;
 305
 306        err = -ENOMEM;
 307        im->image = image = bpf_jit_alloc_exec_page();
 308        if (!image)
 309                goto out_uncharge;
 310
 311        err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
 312        if (err)
 313                goto out_free_image;
 314
 315        ksym = &im->ksym;
 316        INIT_LIST_HEAD_RCU(&ksym->lnode);
 317        snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx);
 318        bpf_image_ksym_add(image, ksym);
 319        return im;
 320
 321out_free_image:
 322        bpf_jit_free_exec(im->image);
 323out_uncharge:
 324        bpf_jit_uncharge_modmem(1);
 325out_free_im:
 326        kfree(im);
 327out:
 328        return ERR_PTR(err);
 329}
 330
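     /* Rebuild the trampoline for the current set of attached programs and
      * switch the kernel function over to the new image. When no programs
      * are left, the fentry hook is removed instead. Called with tr->mutex
      * held.
      */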
 331static int bpf_trampoline_update(struct bpf_trampoline *tr)
 332{
 333        struct bpf_tramp_image *im;
 334        struct bpf_tramp_progs *tprogs;
 335        u32 flags = BPF_TRAMP_F_RESTORE_REGS;
 336        int err, total;
 337
 338        tprogs = bpf_trampoline_get_progs(tr, &total);
 339        if (IS_ERR(tprogs))
 340                return PTR_ERR(tprogs);
 341
 342        if (total == 0) {
 343                err = unregister_fentry(tr, tr->cur_image->image);
 344                bpf_tramp_image_put(tr->cur_image);
 345                tr->cur_image = NULL;
 346                tr->selector = 0;
 347                goto out;
 348        }
 349
 350        im = bpf_tramp_image_alloc(tr->key, tr->selector);
 351        if (IS_ERR(im)) {
 352                err = PTR_ERR(im);
 353                goto out;
 354        }
 355
 356        if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||
 357            tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
 358                flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
 359
 360        err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
 361                                          &tr->func.model, flags, tprogs,
 362                                          tr->func.addr);
 363        if (err < 0)
 364                goto out;
 365
 366        WARN_ON(tr->cur_image && tr->selector == 0);
 367        WARN_ON(!tr->cur_image && tr->selector);
 368        if (tr->cur_image)
 369                /* progs already running at this address */
 370                err = modify_fentry(tr, tr->cur_image->image, im->image);
 371        else
 372                /* first time registering */
 373                err = register_fentry(tr, im->image);
 374        if (err)
 375                goto out;
 376        if (tr->cur_image)
 377                bpf_tramp_image_put(tr->cur_image);
 378        tr->cur_image = im;
 379        tr->selector++;
 380out:
 381        kfree(tprogs);
 382        return err;
 383}
 384
 385static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
 386{
 387        switch (prog->expected_attach_type) {
 388        case BPF_TRACE_FENTRY:
 389                return BPF_TRAMP_FENTRY;
 390        case BPF_MODIFY_RETURN:
 391                return BPF_TRAMP_MODIFY_RETURN;
 392        case BPF_TRACE_FEXIT:
 393                return BPF_TRAMP_FEXIT;
 394        case BPF_LSM_MAC:
 395                if (!prog->aux->attach_func_proto->type)
  396                        /* The function returns void, so we cannot modify its
 397                         * return value.
 398                         */
 399                        return BPF_TRAMP_FEXIT;
 400                else
 401                        return BPF_TRAMP_MODIFY_RETURN;
 402        default:
 403                return BPF_TRAMP_REPLACE;
 404        }
 405}
 406
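     /* Attach @prog to @tr and regenerate the trampoline. Extension
      * programs (BPF_TRAMP_REPLACE) jump straight to the replacement body
      * via bpf_arch_text_poke() and cannot coexist with fentry/fexit
      * programs on the same target.
      */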
 407int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
 408{
 409        enum bpf_tramp_prog_type kind;
 410        int err = 0;
 411        int cnt;
 412
 413        kind = bpf_attach_type_to_tramp(prog);
 414        mutex_lock(&tr->mutex);
 415        if (tr->extension_prog) {
 416                /* cannot attach fentry/fexit if extension prog is attached.
 417                 * cannot overwrite extension prog either.
 418                 */
 419                err = -EBUSY;
 420                goto out;
 421        }
 422        cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT];
 423        if (kind == BPF_TRAMP_REPLACE) {
 424                /* Cannot attach extension if fentry/fexit are in use. */
 425                if (cnt) {
 426                        err = -EBUSY;
 427                        goto out;
 428                }
 429                tr->extension_prog = prog;
 430                err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
 431                                         prog->bpf_func);
 432                goto out;
 433        }
 434        if (cnt >= BPF_MAX_TRAMP_PROGS) {
 435                err = -E2BIG;
 436                goto out;
 437        }
 438        if (!hlist_unhashed(&prog->aux->tramp_hlist)) {
 439                /* prog already linked */
 440                err = -EBUSY;
 441                goto out;
 442        }
 443        hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
 444        tr->progs_cnt[kind]++;
 445        err = bpf_trampoline_update(tr);
 446        if (err) {
 447                hlist_del_init(&prog->aux->tramp_hlist);
 448                tr->progs_cnt[kind]--;
 449        }
 450out:
 451        mutex_unlock(&tr->mutex);
 452        return err;
 453}
 454
 455/* bpf_trampoline_unlink_prog() should never fail. */
 456int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
 457{
 458        enum bpf_tramp_prog_type kind;
 459        int err;
 460
 461        kind = bpf_attach_type_to_tramp(prog);
 462        mutex_lock(&tr->mutex);
 463        if (kind == BPF_TRAMP_REPLACE) {
 464                WARN_ON_ONCE(!tr->extension_prog);
 465                err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
 466                                         tr->extension_prog->bpf_func, NULL);
 467                tr->extension_prog = NULL;
 468                goto out;
 469        }
 470        hlist_del_init(&prog->aux->tramp_hlist);
 471        tr->progs_cnt[kind]--;
 472        err = bpf_trampoline_update(tr);
 473out:
 474        mutex_unlock(&tr->mutex);
 475        return err;
 476}
 477
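     /* Look up (or create) the trampoline for @key and, on first use,
      * record the attach target's address and btf_func_model from
      * @tgt_info.
      */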
 478struct bpf_trampoline *bpf_trampoline_get(u64 key,
 479                                          struct bpf_attach_target_info *tgt_info)
 480{
 481        struct bpf_trampoline *tr;
 482
 483        tr = bpf_trampoline_lookup(key);
 484        if (!tr)
 485                return NULL;
 486
 487        mutex_lock(&tr->mutex);
 488        if (tr->func.addr)
 489                goto out;
 490
 491        memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel));
 492        tr->func.addr = (void *)tgt_info->tgt_addr;
 493out:
 494        mutex_unlock(&tr->mutex);
 495        return tr;
 496}
 497
 498void bpf_trampoline_put(struct bpf_trampoline *tr)
 499{
 500        if (!tr)
 501                return;
 502        mutex_lock(&trampoline_mutex);
 503        if (!refcount_dec_and_test(&tr->refcnt))
 504                goto out;
 505        WARN_ON_ONCE(mutex_is_locked(&tr->mutex));
 506        if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY])))
 507                goto out;
 508        if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
 509                goto out;
 510        /* This code will be executed even when the last bpf_tramp_image
 511         * is alive. All progs are detached from the trampoline and the
 512         * trampoline image is patched with jmp into epilogue to skip
 513         * fexit progs. The fentry-only trampoline will be freed via
 514         * multiple rcu callbacks.
 515         */
 516        hlist_del(&tr->hlist);
 517        kfree(tr);
 518out:
 519        mutex_unlock(&trampoline_mutex);
 520}
 521
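     /* bpf_prog_start_time() returns NO_START_TIME when stats are disabled
      * (or when sched_clock() happens to return 0), and update_prog_stats()
      * only records runs whose start timestamp is greater than
      * NO_START_TIME.
      */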
 522#define NO_START_TIME 1
 523static u64 notrace bpf_prog_start_time(void)
 524{
 525        u64 start = NO_START_TIME;
 526
 527        if (static_branch_unlikely(&bpf_stats_enabled_key)) {
 528                start = sched_clock();
 529                if (unlikely(!start))
 530                        start = NO_START_TIME;
 531        }
 532        return start;
 533}
 534
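     /* Counts runs that were skipped because the program was already
      * active on this CPU (see the prog->active check in
      * __bpf_prog_enter*()).
      */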
 535static void notrace inc_misses_counter(struct bpf_prog *prog)
 536{
 537        struct bpf_prog_stats *stats;
 538
 539        stats = this_cpu_ptr(prog->stats);
 540        u64_stats_update_begin(&stats->syncp);
 541        stats->misses++;
 542        u64_stats_update_end(&stats->syncp);
 543}
 544
 545/* The logic is similar to BPF_PROG_RUN, but with an explicit
 546 * rcu_read_lock() and migrate_disable() which are required
 547 * for the trampoline. The macro is split into
 548 * call __bpf_prog_enter
 549 * call prog->bpf_func
 550 * call __bpf_prog_exit
 551 *
 552 * __bpf_prog_enter returns:
 553 * 0 - skip execution of the bpf prog
 554 * 1 - execute bpf prog
  555 * [2..MAX_U64] - execute bpf prog and record execution time.
 556 *     This is start time.
 557 */
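     /* Roughly, for each attached non-sleepable program the trampoline
      * emitted by arch_prepare_bpf_trampoline() behaves like (simplified
      * sketch; the real instruction sequence is generated by arch code):
      *
      *	start = __bpf_prog_enter(prog);
      *	if (start)
      *		prog->bpf_func(ctx, prog->insnsi);
      *	__bpf_prog_exit(prog, start);
      */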
 558u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
 559        __acquires(RCU)
 560{
 561        rcu_read_lock();
 562        migrate_disable();
 563        if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
 564                inc_misses_counter(prog);
 565                return 0;
 566        }
 567        return bpf_prog_start_time();
 568}
 569
 570static void notrace update_prog_stats(struct bpf_prog *prog,
 571                                      u64 start)
 572{
 573        struct bpf_prog_stats *stats;
 574
 575        if (static_branch_unlikely(&bpf_stats_enabled_key) &&
 576            /* static_key could be enabled in __bpf_prog_enter*
 577             * and disabled in __bpf_prog_exit*.
 578             * And vice versa.
 579             * Hence check that 'start' is valid.
 580             */
 581            start > NO_START_TIME) {
 582                stats = this_cpu_ptr(prog->stats);
 583                u64_stats_update_begin(&stats->syncp);
 584                stats->cnt++;
 585                stats->nsecs += sched_clock() - start;
 586                u64_stats_update_end(&stats->syncp);
 587        }
 588}
 589
 590void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
 591        __releases(RCU)
 592{
 593        update_prog_stats(prog, start);
 594        __this_cpu_dec(*(prog->active));
 595        migrate_enable();
 596        rcu_read_unlock();
 597}
 598
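     /* Sleepable programs run under rcu_read_lock_trace() instead of
      * rcu_read_lock(), so they are allowed to fault and sleep; the
      * trampoline pairs this with __bpf_prog_exit_sleepable().
      */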
 599u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog)
 600{
 601        rcu_read_lock_trace();
 602        migrate_disable();
 603        might_fault();
 604        if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
 605                inc_misses_counter(prog);
 606                return 0;
 607        }
 608        return bpf_prog_start_time();
 609}
 610
 611void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
 612{
 613        update_prog_stats(prog, start);
 614        __this_cpu_dec(*(prog->active));
 615        migrate_enable();
 616        rcu_read_unlock_trace();
 617}
 618
 619void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
 620{
 621        percpu_ref_get(&tr->pcref);
 622}
 623
 624void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
 625{
 626        percpu_ref_put(&tr->pcref);
 627}
 628
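     /* Weak fallback for architectures without trampoline support; arch
      * code overrides this to emit the trampoline into [image, image_end).
      */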
 629int __weak
 630arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
 631                            const struct btf_func_model *m, u32 flags,
 632                            struct bpf_tramp_progs *tprogs,
 633                            void *orig_call)
 634{
 635        return -ENOTSUPP;
 636}
 637
 638static int __init init_trampolines(void)
 639{
 640        int i;
 641
 642        for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
 643                INIT_HLIST_HEAD(&trampoline_table[i]);
 644        return 0;
 645}
 646late_initcall(init_trampolines);
 647