linux/kernel/bpf/inode.c
<<
>>
Prefs
   1/*
   2 * Minimal file system backend for holding eBPF maps and programs,
   3 * used by bpf(2) object pinning.
   4 *
   5 * Authors:
   6 *
   7 *      Daniel Borkmann <daniel@iogearbox.net>
   8 *
   9 * This program is free software; you can redistribute it and/or
  10 * modify it under the terms of the GNU General Public License
  11 * version 2 as published by the Free Software Foundation.
  12 */
  13
  14#include <linux/init.h>
  15#include <linux/magic.h>
  16#include <linux/major.h>
  17#include <linux/mount.h>
  18#include <linux/namei.h>
  19#include <linux/fs.h>
  20#include <linux/kdev_t.h>
  21#include <linux/parser.h>
  22#include <linux/filter.h>
  23#include <linux/bpf.h>
  24#include <linux/bpf_trace.h>
  25
  /* Kinds of object this filesystem distinguishes for a pinned inode. */
  enum bpf_type {
          BPF_TYPE_UNSPEC = 0,    /* inode does not hold a BPF program or map */
          BPF_TYPE_PROG   = 1,    /* inode holds a BPF program */
          BPF_TYPE_MAP    = 2,    /* inode holds a BPF map */
  };
  31
  32static void *bpf_any_get(void *raw, enum bpf_type type)
  33{
  34        switch (type) {
  35        case BPF_TYPE_PROG:
  36                raw = bpf_prog_inc(raw);
  37                break;
  38        case BPF_TYPE_MAP:
  39                raw = bpf_map_inc(raw, true);
  40                break;
  41        default:
  42                WARN_ON_ONCE(1);
  43                break;
  44        }
  45
  46        return raw;
  47}
  48
  49static void bpf_any_put(void *raw, enum bpf_type type)
  50{
  51        switch (type) {
  52        case BPF_TYPE_PROG:
  53                bpf_prog_put(raw);
  54                break;
  55        case BPF_TYPE_MAP:
  56                bpf_map_put_with_uref(raw);
  57                break;
  58        default:
  59                WARN_ON_ONCE(1);
  60                break;
  61        }
  62}
  63
  64static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
  65{
  66        void *raw;
  67
  68        *type = BPF_TYPE_MAP;
  69        raw = bpf_map_get_with_uref(ufd);
  70        if (IS_ERR(raw)) {
  71                *type = BPF_TYPE_PROG;
  72                raw = bpf_prog_get(ufd);
  73        }
  74
  75        return raw;
  76}
  77
  78static const struct inode_operations bpf_dir_iops;
  79
  80static const struct inode_operations bpf_prog_iops = { };
  81static const struct inode_operations bpf_map_iops  = { };
  82
  83static struct inode *bpf_get_inode(struct super_block *sb,
  84                                   const struct inode *dir,
  85                                   umode_t mode)
  86{
  87        struct inode *inode;
  88
  89        switch (mode & S_IFMT) {
  90        case S_IFDIR:
  91        case S_IFREG:
  92        case S_IFLNK:
  93                break;
  94        default:
  95                return ERR_PTR(-EINVAL);
  96        }
  97
  98        inode = new_inode(sb);
  99        if (!inode)
 100                return ERR_PTR(-ENOSPC);
 101
 102        inode->i_ino = get_next_ino();
 103        inode->i_atime = current_time(inode);
 104        inode->i_mtime = inode->i_atime;
 105        inode->i_ctime = inode->i_atime;
 106
 107        inode_init_owner(inode, dir, mode);
 108
 109        return inode;
 110}
 111
 112static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
 113{
 114        *type = BPF_TYPE_UNSPEC;
 115        if (inode->i_op == &bpf_prog_iops)
 116                *type = BPF_TYPE_PROG;
 117        else if (inode->i_op == &bpf_map_iops)
 118                *type = BPF_TYPE_MAP;
 119        else
 120                return -EACCES;
 121
 122        return 0;
 123}
 124
 125static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
 126                                struct inode *dir)
 127{
 128        d_instantiate(dentry, inode);
 129        dget(dentry);
 130
 131        dir->i_mtime = current_time(dir);
 132        dir->i_ctime = dir->i_mtime;
 133}
 134
 135static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 136{
 137        struct inode *inode;
 138
 139        inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
 140        if (IS_ERR(inode))
 141                return PTR_ERR(inode);
 142
 143        inode->i_op = &bpf_dir_iops;
 144        inode->i_fop = &simple_dir_operations;
 145
 146        inc_nlink(inode);
 147        inc_nlink(dir);
 148
 149        bpf_dentry_finalize(dentry, inode, dir);
 150        return 0;
 151}
 152
 /* Per-open iteration state used when a pinned map is read via seq_file. */
 struct map_iter {
         void *key;      /* buffer of map->key_size bytes with the current key */
         bool done;      /* set once map_get_next_key() has reported failure */
 };
 157
 158static struct map_iter *map_iter(struct seq_file *m)
 159{
 160        return m->private;
 161}
 162
 163static struct bpf_map *seq_file_to_map(struct seq_file *m)
 164{
 165        return file_inode(m->file)->i_private;
 166}
 167
 168static void map_iter_free(struct map_iter *iter)
 169{
 170        if (iter) {
 171                kfree(iter->key);
 172                kfree(iter);
 173        }
 174}
 175
 176static struct map_iter *map_iter_alloc(struct bpf_map *map)
 177{
 178        struct map_iter *iter;
 179
 180        iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN);
 181        if (!iter)
 182                goto error;
 183
 184        iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN);
 185        if (!iter->key)
 186                goto error;
 187
 188        return iter;
 189
 190error:
 191        map_iter_free(iter);
 192        return NULL;
 193}
 194
 195static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
 196{
 197        struct bpf_map *map = seq_file_to_map(m);
 198        void *key = map_iter(m)->key;
 199        void *prev_key;
 200
 201        if (map_iter(m)->done)
 202                return NULL;
 203
 204        if (unlikely(v == SEQ_START_TOKEN))
 205                prev_key = NULL;
 206        else
 207                prev_key = key;
 208
 209        if (map->ops->map_get_next_key(map, prev_key, key)) {
 210                map_iter(m)->done = true;
 211                return NULL;
 212        }
 213
 214        ++(*pos);
 215        return key;
 216}
 217
 218static void *map_seq_start(struct seq_file *m, loff_t *pos)
 219{
 220        if (map_iter(m)->done)
 221                return NULL;
 222
 223        return *pos ? map_iter(m)->key : SEQ_START_TOKEN;
 224}
 225
 /* seq_file ->stop() callback; map_seq_start() acquires nothing to release. */
 static void map_seq_stop(struct seq_file *m, void *v)
 {
         /* intentionally empty */
 }
 229
 230static int map_seq_show(struct seq_file *m, void *v)
 231{
 232        struct bpf_map *map = seq_file_to_map(m);
 233        void *key = map_iter(m)->key;
 234
 235        if (unlikely(v == SEQ_START_TOKEN)) {
 236                seq_puts(m, "# WARNING!! The output is for debug purpose only\n");
 237                seq_puts(m, "# WARNING!! The output format will change\n");
 238        } else {
 239                map->ops->map_seq_show_elem(map, key, m);
 240        }
 241
 242        return 0;
 243}
 244
 245static const struct seq_operations bpffs_map_seq_ops = {
 246        .start  = map_seq_start,
 247        .next   = map_seq_next,
 248        .show   = map_seq_show,
 249        .stop   = map_seq_stop,
 250};
 251
 252static int bpffs_map_open(struct inode *inode, struct file *file)
 253{
 254        struct bpf_map *map = inode->i_private;
 255        struct map_iter *iter;
 256        struct seq_file *m;
 257        int err;
 258
 259        iter = map_iter_alloc(map);
 260        if (!iter)
 261                return -ENOMEM;
 262
 263        err = seq_open(file, &bpffs_map_seq_ops);
 264        if (err) {
 265                map_iter_free(iter);
 266                return err;
 267        }
 268
 269        m = file->private_data;
 270        m->private = iter;
 271
 272        return 0;
 273}
 274
 275static int bpffs_map_release(struct inode *inode, struct file *file)
 276{
 277        struct seq_file *m = file->private_data;
 278
 279        map_iter_free(map_iter(m));
 280
 281        return seq_release(inode, file);
 282}
 283
 284/* bpffs_map_fops should only implement the basic
 285 * read operation for a BPF map.  The purpose is to
 286 * provide a simple user intuitive way to do
 287 * "cat bpffs/pathto/a-pinned-map".
 288 *
 289 * Other operations (e.g. write, lookup...) should be realized by
 290 * the userspace tools (e.g. bpftool) through the
 291 * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update
 292 * interface.
 293 */
 294static const struct file_operations bpffs_map_fops = {
 295        .open           = bpffs_map_open,
 296        .read           = seq_read,
 297        .release        = bpffs_map_release,
 298};
 299
 300static int bpffs_obj_open(struct inode *inode, struct file *file)
 301{
 302        return -EIO;
 303}
 304
 305static const struct file_operations bpffs_obj_fops = {
 306        .open           = bpffs_obj_open,
 307};
 308
 309static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
 310                         const struct inode_operations *iops,
 311                         const struct file_operations *fops)
 312{
 313        struct inode *dir = dentry->d_parent->d_inode;
 314        struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode);
 315        if (IS_ERR(inode))
 316                return PTR_ERR(inode);
 317
 318        inode->i_op = iops;
 319        inode->i_fop = fops;
 320        inode->i_private = raw;
 321
 322        bpf_dentry_finalize(dentry, inode, dir);
 323        return 0;
 324}
 325
 326static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
 327{
 328        return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops,
 329                             &bpffs_obj_fops);
 330}
 331
 332static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
 333{
 334        struct bpf_map *map = arg;
 335
 336        return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops,
 337                             bpf_map_support_seq_show(map) ?
 338                             &bpffs_map_fops : &bpffs_obj_fops);
 339}
 340
 341static struct dentry *
 342bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
 343{
 344        /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future
 345         * extensions.
 346         */
 347        if (strchr(dentry->d_name.name, '.'))
 348                return ERR_PTR(-EPERM);
 349
 350        return simple_lookup(dir, dentry, flags);
 351}
 352
 353static int bpf_symlink(struct inode *dir, struct dentry *dentry,
 354                       const char *target)
 355{
 356        char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
 357        struct inode *inode;
 358
 359        if (!link)
 360                return -ENOMEM;
 361
 362        inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK);
 363        if (IS_ERR(inode)) {
 364                kfree(link);
 365                return PTR_ERR(inode);
 366        }
 367
 368        inode->i_op = &simple_symlink_inode_operations;
 369        inode->i_link = link;
 370
 371        bpf_dentry_finalize(dentry, inode, dir);
 372        return 0;
 373}
 374
 375static const struct inode_operations bpf_dir_iops = {
 376        .lookup         = bpf_lookup,
 377        .mkdir          = bpf_mkdir,
 378        .symlink        = bpf_symlink,
 379        .rmdir          = simple_rmdir,
 380        .rename         = simple_rename,
 381        .link           = simple_link,
 382        .unlink         = simple_unlink,
 383};
 384
 385static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
 386                          enum bpf_type type)
 387{
 388        struct dentry *dentry;
 389        struct inode *dir;
 390        struct path path;
 391        umode_t mode;
 392        int ret;
 393
 394        dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
 395        if (IS_ERR(dentry))
 396                return PTR_ERR(dentry);
 397
 398        mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
 399
 400        ret = security_path_mknod(&path, dentry, mode, 0);
 401        if (ret)
 402                goto out;
 403
 404        dir = d_inode(path.dentry);
 405        if (dir->i_op != &bpf_dir_iops) {
 406                ret = -EPERM;
 407                goto out;
 408        }
 409
 410        switch (type) {
 411        case BPF_TYPE_PROG:
 412                ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
 413                break;
 414        case BPF_TYPE_MAP:
 415                ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw);
 416                break;
 417        default:
 418                ret = -EPERM;
 419        }
 420out:
 421        done_path_create(&path, dentry);
 422        return ret;
 423}
 424
 425int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
 426{
 427        struct filename *pname;
 428        enum bpf_type type;
 429        void *raw;
 430        int ret;
 431
 432        pname = getname(pathname);
 433        if (IS_ERR(pname))
 434                return PTR_ERR(pname);
 435
 436        raw = bpf_fd_probe_obj(ufd, &type);
 437        if (IS_ERR(raw)) {
 438                ret = PTR_ERR(raw);
 439                goto out;
 440        }
 441
 442        ret = bpf_obj_do_pin(pname, raw, type);
 443        if (ret != 0)
 444                bpf_any_put(raw, type);
 445out:
 446        putname(pname);
 447        return ret;
 448}
 449
 450static void *bpf_obj_do_get(const struct filename *pathname,
 451                            enum bpf_type *type, int flags)
 452{
 453        struct inode *inode;
 454        struct path path;
 455        void *raw;
 456        int ret;
 457
 458        ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path);
 459        if (ret)
 460                return ERR_PTR(ret);
 461
 462        inode = d_backing_inode(path.dentry);
 463        ret = inode_permission(inode, ACC_MODE(flags));
 464        if (ret)
 465                goto out;
 466
 467        ret = bpf_inode_type(inode, type);
 468        if (ret)
 469                goto out;
 470
 471        raw = bpf_any_get(inode->i_private, *type);
 472        if (!IS_ERR(raw))
 473                touch_atime(&path);
 474
 475        path_put(&path);
 476        return raw;
 477out:
 478        path_put(&path);
 479        return ERR_PTR(ret);
 480}
 481
 482int bpf_obj_get_user(const char __user *pathname, int flags)
 483{
 484        enum bpf_type type = BPF_TYPE_UNSPEC;
 485        struct filename *pname;
 486        int ret = -ENOENT;
 487        int f_flags;
 488        void *raw;
 489
 490        f_flags = bpf_get_file_flag(flags);
 491        if (f_flags < 0)
 492                return f_flags;
 493
 494        pname = getname(pathname);
 495        if (IS_ERR(pname))
 496                return PTR_ERR(pname);
 497
 498        raw = bpf_obj_do_get(pname, &type, f_flags);
 499        if (IS_ERR(raw)) {
 500                ret = PTR_ERR(raw);
 501                goto out;
 502        }
 503
 504        if (type == BPF_TYPE_PROG)
 505                ret = bpf_prog_new_fd(raw);
 506        else if (type == BPF_TYPE_MAP)
 507                ret = bpf_map_new_fd(raw, f_flags);
 508        else
 509                goto out;
 510
 511        if (ret < 0)
 512                bpf_any_put(raw, type);
 513out:
 514        putname(pname);
 515        return ret;
 516}
 517
 518static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
 519{
 520        struct bpf_prog *prog;
 521        int ret = inode_permission(inode, MAY_READ | MAY_WRITE);
 522        if (ret)
 523                return ERR_PTR(ret);
 524
 525        if (inode->i_op == &bpf_map_iops)
 526                return ERR_PTR(-EINVAL);
 527        if (inode->i_op != &bpf_prog_iops)
 528                return ERR_PTR(-EACCES);
 529
 530        prog = inode->i_private;
 531
 532        ret = security_bpf_prog(prog);
 533        if (ret < 0)
 534                return ERR_PTR(ret);
 535
 536        if (!bpf_prog_get_ok(prog, &type, false))
 537                return ERR_PTR(-EINVAL);
 538
 539        return bpf_prog_inc(prog);
 540}
 541
 542struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
 543{
 544        struct bpf_prog *prog;
 545        struct path path;
 546        int ret = kern_path(name, LOOKUP_FOLLOW, &path);
 547        if (ret)
 548                return ERR_PTR(ret);
 549        prog = __get_prog_inode(d_backing_inode(path.dentry), type);
 550        if (!IS_ERR(prog))
 551                touch_atime(&path);
 552        path_put(&path);
 553        return prog;
 554}
 555EXPORT_SYMBOL(bpf_prog_get_type_path);
 556
 557/*
 558 * Display the mount options in /proc/mounts.
 559 */
 560static int bpf_show_options(struct seq_file *m, struct dentry *root)
 561{
 562        umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
 563
 564        if (mode != S_IRWXUGO)
 565                seq_printf(m, ",mode=%o", mode);
 566        return 0;
 567}
 568
 569static void bpf_destroy_inode_deferred(struct rcu_head *head)
 570{
 571        struct inode *inode = container_of(head, struct inode, i_rcu);
 572        enum bpf_type type;
 573
 574        if (S_ISLNK(inode->i_mode))
 575                kfree(inode->i_link);
 576        if (!bpf_inode_type(inode, &type))
 577                bpf_any_put(inode->i_private, type);
 578        free_inode_nonrcu(inode);
 579}
 580
 581static void bpf_destroy_inode(struct inode *inode)
 582{
 583        call_rcu(&inode->i_rcu, bpf_destroy_inode_deferred);
 584}
 585
 586static const struct super_operations bpf_super_ops = {
 587        .statfs         = simple_statfs,
 588        .drop_inode     = generic_delete_inode,
 589        .show_options   = bpf_show_options,
 590        .destroy_inode  = bpf_destroy_inode,
 591};
 592
 /* Token identifiers for the mount option parser. */
 enum {
         OPT_MODE = 0,   /* "mode=<octal>" */
         OPT_ERR  = 1,   /* table terminator; matches anything else */
 };
 597
 598static const match_table_t bpf_mount_tokens = {
 599        { OPT_MODE, "mode=%o" },
 600        { OPT_ERR, NULL },
 601};
 602
 603struct bpf_mount_opts {
 604        umode_t mode;
 605};
 606
 607static int bpf_parse_options(char *data, struct bpf_mount_opts *opts)
 608{
 609        substring_t args[MAX_OPT_ARGS];
 610        int option, token;
 611        char *ptr;
 612
 613        opts->mode = S_IRWXUGO;
 614
 615        while ((ptr = strsep(&data, ",")) != NULL) {
 616                if (!*ptr)
 617                        continue;
 618
 619                token = match_token(ptr, bpf_mount_tokens, args);
 620                switch (token) {
 621                case OPT_MODE:
 622                        if (match_octal(&args[0], &option))
 623                                return -EINVAL;
 624                        opts->mode = option & S_IALLUGO;
 625                        break;
 626                /* We might like to report bad mount options here, but
 627                 * traditionally we've ignored all mount options, so we'd
 628                 * better continue to ignore non-existing options for bpf.
 629                 */
 630                }
 631        }
 632
 633        return 0;
 634}
 635
 636static int bpf_fill_super(struct super_block *sb, void *data, int silent)
 637{
 638        static const struct tree_descr bpf_rfiles[] = { { "" } };
 639        struct bpf_mount_opts opts;
 640        struct inode *inode;
 641        int ret;
 642
 643        ret = bpf_parse_options(data, &opts);
 644        if (ret)
 645                return ret;
 646
 647        ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
 648        if (ret)
 649                return ret;
 650
 651        sb->s_op = &bpf_super_ops;
 652
 653        inode = sb->s_root->d_inode;
 654        inode->i_op = &bpf_dir_iops;
 655        inode->i_mode &= ~S_IALLUGO;
 656        inode->i_mode |= S_ISVTX | opts.mode;
 657
 658        return 0;
 659}
 660
 661static struct dentry *bpf_mount(struct file_system_type *type, int flags,
 662                                const char *dev_name, void *data)
 663{
 664        return mount_nodev(type, flags, data, bpf_fill_super);
 665}
 666
 667static struct file_system_type bpf_fs_type = {
 668        .owner          = THIS_MODULE,
 669        .name           = "bpf",
 670        .mount          = bpf_mount,
 671        .kill_sb        = kill_litter_super,
 672};
 673
 674static int __init bpf_init(void)
 675{
 676        int ret;
 677
 678        ret = sysfs_create_mount_point(fs_kobj, "bpf");
 679        if (ret)
 680                return ret;
 681
 682        ret = register_filesystem(&bpf_fs_type);
 683        if (ret)
 684                sysfs_remove_mount_point(fs_kobj, "bpf");
 685
 686        return ret;
 687}
 688fs_initcall(bpf_init);
 689