qemu/tools/virtiofsd/passthrough_ll.c
<<
>>
Prefs
   1/*
   2 * FUSE: Filesystem in Userspace
   3 * Copyright (C) 2001-2007  Miklos Szeredi <miklos@szeredi.hu>
   4 *
   5 * This program can be distributed under the terms of the GNU GPLv2.
   6 * See the file COPYING.
   7 */
   8
   9/*
  10 *
  11 * This file system mirrors the existing file system hierarchy of the
  12 * system, starting at the root file system. This is implemented by
  13 * just "passing through" all requests to the corresponding user-space
  14 * libc functions. In contrast to passthrough.c and passthrough_fh.c,
  15 * this implementation uses the low-level API. Its performance should
  16 * be the least bad among the three, but many operations are not
  17 * implemented. In particular, it is not possible to remove files (or
  18 * directories) because the code necessary to defer actual removal
  19 * until the file is not opened anymore would make the example much
  20 * more complicated.
  21 *
  22 * When writeback caching is enabled (-o writeback mount option), it
  23 * is only possible to write to files for which the mounting user has
  24 * read permissions. This is because the writeback cache requires the
  25 * kernel to be able to issue read requests for all files (which the
  26 * passthrough filesystem cannot satisfy if it can't read the file in
  27 * the underlying filesystem).
  28 *
  29 * Compile with:
  30 *
  31 *     gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o
  32 * passthrough_ll
  33 *
  34 * ## Source code ##
  35 * \include passthrough_ll.c
  36 */
  37
  38#include "qemu/osdep.h"
  39#include "qemu/timer.h"
  40#include "qemu-version.h"
  41#include "qemu-common.h"
  42#include "fuse_virtio.h"
  43#include "fuse_log.h"
  44#include "fuse_lowlevel.h"
  45#include "standard-headers/linux/fuse.h"
  46#include <cap-ng.h>
  47#include <dirent.h>
  48#include <pthread.h>
  49#include <sys/file.h>
  50#include <sys/mount.h>
  51#include <sys/prctl.h>
  52#include <sys/resource.h>
  53#include <sys/syscall.h>
  54#include <sys/wait.h>
  55#include <sys/xattr.h>
  56#include <syslog.h>
  57#include <grp.h>
  58
  59#include "qemu/cutils.h"
  60#include "passthrough_helpers.h"
  61#include "passthrough_seccomp.h"
  62
  63/* Keep track of inode posix locks for each owner. */
  64struct lo_inode_plock {
  65    uint64_t lock_owner;
  66    int fd; /* fd for OFD locks */
  67};
  68
  69struct lo_map_elem {
  70    union {
  71        struct lo_inode *inode;
  72        struct lo_dirp *dirp;
  73        int fd;
  74        ssize_t freelist;
  75    };
  76    bool in_use;
  77};
  78
  79/* Maps FUSE fh or ino values to internal objects */
  80struct lo_map {
  81    struct lo_map_elem *elems;
  82    size_t nelems;
  83    ssize_t freelist;
  84};
  85
  86struct lo_key {
  87    ino_t ino;
  88    dev_t dev;
  89    uint64_t mnt_id;
  90};
  91
  92struct lo_inode {
  93    int fd;
  94
  95    /*
  96     * Atomic reference count for this object.  The nlookup field holds a
  97     * reference and release it when nlookup reaches 0.
  98     */
  99    gint refcount;
 100
 101    struct lo_key key;
 102
 103    /*
 104     * This counter keeps the inode alive during the FUSE session.
 105     * Incremented when the FUSE inode number is sent in a reply
 106     * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc).  Decremented when an inode is
 107     * released by a FUSE_FORGET request.
 108     *
 109     * Note that this value is untrusted because the client can manipulate
 110     * it arbitrarily using FUSE_FORGET requests.
 111     *
 112     * Protected by lo->mutex.
 113     */
 114    uint64_t nlookup;
 115
 116    fuse_ino_t fuse_ino;
 117    pthread_mutex_t plock_mutex;
 118    GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
 119
 120    mode_t filetype;
 121};
 122
 123struct lo_cred {
 124    uid_t euid;
 125    gid_t egid;
 126    mode_t umask;
 127};
 128
 129enum {
 130    CACHE_NONE,
 131    CACHE_AUTO,
 132    CACHE_ALWAYS,
 133};
 134
 135enum {
 136    SANDBOX_NAMESPACE,
 137    SANDBOX_CHROOT,
 138};
 139
 140typedef struct xattr_map_entry {
 141    char *key;
 142    char *prepend;
 143    unsigned int flags;
 144} XattrMapEntry;
 145
 146struct lo_data {
 147    pthread_mutex_t mutex;
 148    int sandbox;
 149    int debug;
 150    int writeback;
 151    int flock;
 152    int posix_lock;
 153    int xattr;
 154    char *xattrmap;
 155    char *xattr_security_capability;
 156    char *source;
 157    char *modcaps;
 158    double timeout;
 159    int cache;
 160    int timeout_set;
 161    int readdirplus_set;
 162    int readdirplus_clear;
 163    int allow_direct_io;
 164    int announce_submounts;
 165    bool use_statx;
 166    struct lo_inode root;
 167    GHashTable *inodes; /* protected by lo->mutex */
 168    struct lo_map ino_map; /* protected by lo->mutex */
 169    struct lo_map dirp_map; /* protected by lo->mutex */
 170    struct lo_map fd_map; /* protected by lo->mutex */
 171    XattrMapEntry *xattr_map_list;
 172    size_t xattr_map_nentries;
 173
 174    /* An O_PATH file descriptor to /proc/self/fd/ */
 175    int proc_self_fd;
 176    /* An O_PATH file descriptor to /proc/self/task/ */
 177    int proc_self_task;
 178    int user_killpriv_v2, killpriv_v2;
 179    /* If set, virtiofsd is responsible for setting umask during creation */
 180    bool change_umask;
 181    int user_posix_acl, posix_acl;
 182    /* Keeps track if /proc/<pid>/attr/fscreate should be used or not */
 183    bool use_fscreate;
 184    int user_security_label;
 185};
 186
 187static const struct fuse_opt lo_opts[] = {
 188    { "sandbox=namespace",
 189      offsetof(struct lo_data, sandbox),
 190      SANDBOX_NAMESPACE },
 191    { "sandbox=chroot",
 192      offsetof(struct lo_data, sandbox),
 193      SANDBOX_CHROOT },
 194    { "writeback", offsetof(struct lo_data, writeback), 1 },
 195    { "no_writeback", offsetof(struct lo_data, writeback), 0 },
 196    { "source=%s", offsetof(struct lo_data, source), 0 },
 197    { "flock", offsetof(struct lo_data, flock), 1 },
 198    { "no_flock", offsetof(struct lo_data, flock), 0 },
 199    { "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
 200    { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
 201    { "xattr", offsetof(struct lo_data, xattr), 1 },
 202    { "no_xattr", offsetof(struct lo_data, xattr), 0 },
 203    { "xattrmap=%s", offsetof(struct lo_data, xattrmap), 0 },
 204    { "modcaps=%s", offsetof(struct lo_data, modcaps), 0 },
 205    { "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
 206    { "timeout=", offsetof(struct lo_data, timeout_set), 1 },
 207    { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE },
 208    { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO },
 209    { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS },
 210    { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 },
 211    { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 },
 212    { "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 },
 213    { "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 },
 214    { "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 },
 215    { "killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 1 },
 216    { "no_killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 0 },
 217    { "posix_acl", offsetof(struct lo_data, user_posix_acl), 1 },
 218    { "no_posix_acl", offsetof(struct lo_data, user_posix_acl), 0 },
 219    { "security_label", offsetof(struct lo_data, user_security_label), 1 },
 220    { "no_security_label", offsetof(struct lo_data, user_security_label), 0 },
 221    FUSE_OPT_END
 222};
 223static bool use_syslog = false;
 224static int current_log_level;
 225static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
 226                                 uint64_t n);
 227
 228static struct {
 229    pthread_mutex_t mutex;
 230    void *saved;
 231} cap;
 232/* That we loaded cap-ng in the current thread from the saved */
 233static __thread bool cap_loaded = 0;
 234
 235static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
 236                                uint64_t mnt_id);
 237static int xattr_map_client(const struct lo_data *lo, const char *client_name,
 238                            char **out_name);
 239
 240#define FCHDIR_NOFAIL(fd) do {                         \
 241        int fchdir_res = fchdir(fd);                   \
 242        assert(fchdir_res == 0);                       \
 243    } while (0)
 244
 245static bool is_dot_or_dotdot(const char *name)
 246{
 247    return name[0] == '.' &&
 248           (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'));
 249}
 250
 251/* Is `path` a single path component that is not "." or ".."? */
 252static bool is_safe_path_component(const char *path)
 253{
 254    if (strchr(path, '/')) {
 255        return false;
 256    }
 257
 258    return !is_dot_or_dotdot(path);
 259}
 260
 261static bool is_empty(const char *name)
 262{
 263    return name[0] == '\0';
 264}
 265
 266static struct lo_data *lo_data(fuse_req_t req)
 267{
 268    return (struct lo_data *)fuse_req_userdata(req);
 269}
 270
 271/*
 272 * Tries to figure out if /proc/<pid>/attr/fscreate is usable or not. With
 273 * selinux=0, read from fscreate returns -EINVAL.
 274 *
 275 * TODO: Link with libselinux and use is_selinux_enabled() instead down
 276 * the line. It probably will be more reliable indicator.
 277 */
 278static bool is_fscreate_usable(struct lo_data *lo)
 279{
 280    char procname[64];
 281    int fscreate_fd;
 282    size_t bytes_read;
 283
 284    sprintf(procname, "%ld/attr/fscreate", syscall(SYS_gettid));
 285    fscreate_fd = openat(lo->proc_self_task, procname, O_RDWR);
 286    if (fscreate_fd == -1) {
 287        return false;
 288    }
 289
 290    bytes_read = read(fscreate_fd, procname, 64);
 291    close(fscreate_fd);
 292    if (bytes_read == -1) {
 293        return false;
 294    }
 295    return true;
 296}
 297
 298/* Helpers to set/reset fscreate */
 299static int open_set_proc_fscreate(struct lo_data *lo, const void *ctx,
 300                                  size_t ctxlen, int *fd)
 301{
 302    char procname[64];
 303    int fscreate_fd, err = 0;
 304    size_t written;
 305
 306    sprintf(procname, "%ld/attr/fscreate", syscall(SYS_gettid));
 307    fscreate_fd = openat(lo->proc_self_task, procname, O_WRONLY);
 308    err = fscreate_fd == -1 ? errno : 0;
 309    if (err) {
 310        return err;
 311    }
 312
 313    written = write(fscreate_fd, ctx, ctxlen);
 314    err = written == -1 ? errno : 0;
 315    if (err) {
 316        goto out;
 317    }
 318
 319    *fd = fscreate_fd;
 320    return 0;
 321out:
 322    close(fscreate_fd);
 323    return err;
 324}
 325
 326static void close_reset_proc_fscreate(int fd)
 327{
 328    if ((write(fd, NULL, 0)) == -1) {
 329        fuse_log(FUSE_LOG_WARNING, "Failed to reset fscreate. err=%d\n", errno);
 330    }
 331    close(fd);
 332    return;
 333}
 334
 335/*
 336 * Load capng's state from our saved state if the current thread
 337 * hadn't previously been loaded.
 338 * returns 0 on success
 339 */
 340static int load_capng(void)
 341{
 342    if (!cap_loaded) {
 343        pthread_mutex_lock(&cap.mutex);
 344        capng_restore_state(&cap.saved);
 345        /*
 346         * restore_state free's the saved copy
 347         * so make another.
 348         */
 349        cap.saved = capng_save_state();
 350        if (!cap.saved) {
 351            pthread_mutex_unlock(&cap.mutex);
 352            fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n");
 353            return -EINVAL;
 354        }
 355        pthread_mutex_unlock(&cap.mutex);
 356
 357        /*
 358         * We want to use the loaded state for our pid,
 359         * not the original
 360         */
 361        capng_setpid(syscall(SYS_gettid));
 362        cap_loaded = true;
 363    }
 364    return 0;
 365}
 366
 367/*
 368 * Helpers for dropping and regaining effective capabilities. Returns 0
 369 * on success, error otherwise
 370 */
 371static int drop_effective_cap(const char *cap_name, bool *cap_dropped)
 372{
 373    int cap, ret;
 374
 375    cap = capng_name_to_capability(cap_name);
 376    if (cap < 0) {
 377        ret = errno;
 378        fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
 379                 cap_name, strerror(errno));
 380        goto out;
 381    }
 382
 383    if (load_capng()) {
 384        ret = errno;
 385        fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
 386        goto out;
 387    }
 388
 389    /* We dont have this capability in effective set already. */
 390    if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) {
 391        ret = 0;
 392        goto out;
 393    }
 394
 395    if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) {
 396        ret = errno;
 397        fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n");
 398        goto out;
 399    }
 400
 401    if (capng_apply(CAPNG_SELECT_CAPS)) {
 402        ret = errno;
 403        fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n");
 404        goto out;
 405    }
 406
 407    ret = 0;
 408    if (cap_dropped) {
 409        *cap_dropped = true;
 410    }
 411
 412out:
 413    return ret;
 414}
 415
 416static int gain_effective_cap(const char *cap_name)
 417{
 418    int cap;
 419    int ret = 0;
 420
 421    cap = capng_name_to_capability(cap_name);
 422    if (cap < 0) {
 423        ret = errno;
 424        fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
 425                 cap_name, strerror(errno));
 426        goto out;
 427    }
 428
 429    if (load_capng()) {
 430        ret = errno;
 431        fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
 432        goto out;
 433    }
 434
 435    if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) {
 436        ret = errno;
 437        fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n");
 438        goto out;
 439    }
 440
 441    if (capng_apply(CAPNG_SELECT_CAPS)) {
 442        ret = errno;
 443        fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n");
 444        goto out;
 445    }
 446    ret = 0;
 447
 448out:
 449    return ret;
 450}
 451
 452/*
 453 * The host kernel normally drops security.capability xattr's on
 454 * any write, however if we're remapping xattr names we need to drop
 455 * whatever the clients security.capability is actually stored as.
 456 */
 457static int drop_security_capability(const struct lo_data *lo, int fd)
 458{
 459    if (!lo->xattr_security_capability) {
 460        /* We didn't remap the name, let the host kernel do it */
 461        return 0;
 462    }
 463    if (!fremovexattr(fd, lo->xattr_security_capability)) {
 464        /* All good */
 465        return 0;
 466    }
 467
 468    switch (errno) {
 469    case ENODATA:
 470        /* Attribute didn't exist, that's fine */
 471        return 0;
 472
 473    case ENOTSUP:
 474        /* FS didn't support attribute anyway, also fine */
 475        return 0;
 476
 477    default:
 478        /* Hmm other error */
 479        return errno;
 480    }
 481}
 482
 483static void lo_map_init(struct lo_map *map)
 484{
 485    map->elems = NULL;
 486    map->nelems = 0;
 487    map->freelist = -1;
 488}
 489
 490static void lo_map_destroy(struct lo_map *map)
 491{
 492    g_free(map->elems);
 493}
 494
 495static int lo_map_grow(struct lo_map *map, size_t new_nelems)
 496{
 497    struct lo_map_elem *new_elems;
 498    size_t i;
 499
 500    if (new_nelems <= map->nelems) {
 501        return 1;
 502    }
 503
 504    new_elems = g_try_realloc_n(map->elems, new_nelems, sizeof(map->elems[0]));
 505    if (!new_elems) {
 506        return 0;
 507    }
 508
 509    for (i = map->nelems; i < new_nelems; i++) {
 510        new_elems[i].freelist = i + 1;
 511        new_elems[i].in_use = false;
 512    }
 513    new_elems[new_nelems - 1].freelist = -1;
 514
 515    map->elems = new_elems;
 516    map->freelist = map->nelems;
 517    map->nelems = new_nelems;
 518    return 1;
 519}
 520
 521static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map)
 522{
 523    struct lo_map_elem *elem;
 524
 525    if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) {
 526        return NULL;
 527    }
 528
 529    elem = &map->elems[map->freelist];
 530    map->freelist = elem->freelist;
 531
 532    elem->in_use = true;
 533
 534    return elem;
 535}
 536
 537static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key)
 538{
 539    ssize_t *prev;
 540
 541    if (!lo_map_grow(map, key + 1)) {
 542        return NULL;
 543    }
 544
 545    for (prev = &map->freelist; *prev != -1;
 546         prev = &map->elems[*prev].freelist) {
 547        if (*prev == key) {
 548            struct lo_map_elem *elem = &map->elems[key];
 549
 550            *prev = elem->freelist;
 551            elem->in_use = true;
 552            return elem;
 553        }
 554    }
 555    return NULL;
 556}
 557
 558static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key)
 559{
 560    if (key >= map->nelems) {
 561        return NULL;
 562    }
 563    if (!map->elems[key].in_use) {
 564        return NULL;
 565    }
 566    return &map->elems[key];
 567}
 568
 569static void lo_map_remove(struct lo_map *map, size_t key)
 570{
 571    struct lo_map_elem *elem;
 572
 573    if (key >= map->nelems) {
 574        return;
 575    }
 576
 577    elem = &map->elems[key];
 578    if (!elem->in_use) {
 579        return;
 580    }
 581
 582    elem->in_use = false;
 583
 584    elem->freelist = map->freelist;
 585    map->freelist = key;
 586}
 587
 588/* Assumes lo->mutex is held */
 589static ssize_t lo_add_fd_mapping(struct lo_data *lo, int fd)
 590{
 591    struct lo_map_elem *elem;
 592
 593    elem = lo_map_alloc_elem(&lo->fd_map);
 594    if (!elem) {
 595        return -1;
 596    }
 597
 598    elem->fd = fd;
 599    return elem - lo->fd_map.elems;
 600}
 601
 602/* Assumes lo->mutex is held */
 603static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp)
 604{
 605    struct lo_map_elem *elem;
 606
 607    elem = lo_map_alloc_elem(&lo_data(req)->dirp_map);
 608    if (!elem) {
 609        return -1;
 610    }
 611
 612    elem->dirp = dirp;
 613    return elem - lo_data(req)->dirp_map.elems;
 614}
 615
 616/* Assumes lo->mutex is held */
 617static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode)
 618{
 619    struct lo_map_elem *elem;
 620
 621    elem = lo_map_alloc_elem(&lo_data(req)->ino_map);
 622    if (!elem) {
 623        return -1;
 624    }
 625
 626    elem->inode = inode;
 627    return elem - lo_data(req)->ino_map.elems;
 628}
 629
 630static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep)
 631{
 632    struct lo_inode *inode = *inodep;
 633
 634    if (!inode) {
 635        return;
 636    }
 637
 638    *inodep = NULL;
 639
 640    if (g_atomic_int_dec_and_test(&inode->refcount)) {
 641        close(inode->fd);
 642        free(inode);
 643    }
 644}
 645
 646/* Caller must release refcount using lo_inode_put() */
 647static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino)
 648{
 649    struct lo_data *lo = lo_data(req);
 650    struct lo_map_elem *elem;
 651
 652    pthread_mutex_lock(&lo->mutex);
 653    elem = lo_map_get(&lo->ino_map, ino);
 654    if (elem) {
 655        g_atomic_int_inc(&elem->inode->refcount);
 656    }
 657    pthread_mutex_unlock(&lo->mutex);
 658
 659    if (!elem) {
 660        return NULL;
 661    }
 662
 663    return elem->inode;
 664}
 665
 666/*
 667 * TODO Remove this helper and force callers to hold an inode refcount until
 668 * they are done with the fd.  This will be done in a later patch to make
 669 * review easier.
 670 */
 671static int lo_fd(fuse_req_t req, fuse_ino_t ino)
 672{
 673    struct lo_inode *inode = lo_inode(req, ino);
 674    int fd;
 675
 676    if (!inode) {
 677        return -1;
 678    }
 679
 680    fd = inode->fd;
 681    lo_inode_put(lo_data(req), &inode);
 682    return fd;
 683}
 684
 685/*
 686 * Open a file descriptor for an inode. Returns -EBADF if the inode is not a
 687 * regular file or a directory.
 688 *
 689 * Use this helper function instead of raw openat(2) to prevent security issues
 690 * when a malicious client opens special files such as block device nodes.
 691 * Symlink inodes are also rejected since symlinks must already have been
 692 * traversed on the client side.
 693 */
 694static int lo_inode_open(struct lo_data *lo, struct lo_inode *inode,
 695                         int open_flags)
 696{
 697    g_autofree char *fd_str = g_strdup_printf("%d", inode->fd);
 698    int fd;
 699
 700    if (!S_ISREG(inode->filetype) && !S_ISDIR(inode->filetype)) {
 701        return -EBADF;
 702    }
 703
 704    /*
 705     * The file is a symlink so O_NOFOLLOW must be ignored. We checked earlier
 706     * that the inode is not a special file but if an external process races
 707     * with us then symlinks are traversed here. It is not possible to escape
 708     * the shared directory since it is mounted as "/" though.
 709     */
 710    fd = openat(lo->proc_self_fd, fd_str, open_flags & ~O_NOFOLLOW);
 711    if (fd < 0) {
 712        return -errno;
 713    }
 714    return fd;
 715}
 716
 717static void lo_init(void *userdata, struct fuse_conn_info *conn)
 718{
 719    struct lo_data *lo = (struct lo_data *)userdata;
 720
 721    if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) {
 722        conn->want |= FUSE_CAP_EXPORT_SUPPORT;
 723    }
 724
 725    if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
 726        fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n");
 727        conn->want |= FUSE_CAP_WRITEBACK_CACHE;
 728    }
 729    if (conn->capable & FUSE_CAP_FLOCK_LOCKS) {
 730        if (lo->flock) {
 731            fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
 732            conn->want |= FUSE_CAP_FLOCK_LOCKS;
 733        } else {
 734            fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n");
 735            conn->want &= ~FUSE_CAP_FLOCK_LOCKS;
 736        }
 737    }
 738
 739    if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
 740        if (lo->posix_lock) {
 741            fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
 742            conn->want |= FUSE_CAP_POSIX_LOCKS;
 743        } else {
 744            fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
 745            conn->want &= ~FUSE_CAP_POSIX_LOCKS;
 746        }
 747    }
 748
 749    if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
 750        lo->readdirplus_clear) {
 751        fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
 752        conn->want &= ~FUSE_CAP_READDIRPLUS;
 753    }
 754
 755    if (!(conn->capable & FUSE_CAP_SUBMOUNTS) && lo->announce_submounts) {
 756        fuse_log(FUSE_LOG_WARNING, "lo_init: Cannot announce submounts, client "
 757                 "does not support it\n");
 758        lo->announce_submounts = false;
 759    }
 760
 761    if (lo->user_killpriv_v2 == 1) {
 762        /*
 763         * User explicitly asked for this option. Enable it unconditionally.
 764         * If connection does not have this capability, it should fail
 765         * in fuse_lowlevel.c
 766         */
 767        fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n");
 768        conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2;
 769        lo->killpriv_v2 = 1;
 770    } else if (lo->user_killpriv_v2 == -1 &&
 771               conn->capable & FUSE_CAP_HANDLE_KILLPRIV_V2) {
 772        /*
 773         * User did not specify a value for killpriv_v2. By default enable it
 774         * if connection offers this capability
 775         */
 776        fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n");
 777        conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2;
 778        lo->killpriv_v2 = 1;
 779    } else {
 780        /*
 781         * Either user specified to disable killpriv_v2, or connection does
 782         * not offer this capability. Disable killpriv_v2 in both the cases
 783         */
 784        fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling killpriv_v2\n");
 785        conn->want &= ~FUSE_CAP_HANDLE_KILLPRIV_V2;
 786        lo->killpriv_v2 = 0;
 787    }
 788
 789    if (lo->user_posix_acl == 1) {
 790        /*
 791         * User explicitly asked for this option. Enable it unconditionally.
 792         * If connection does not have this capability, print error message
 793         * now. It will fail later in fuse_lowlevel.c
 794         */
 795        if (!(conn->capable & FUSE_CAP_POSIX_ACL) ||
 796            !(conn->capable & FUSE_CAP_DONT_MASK) ||
 797            !(conn->capable & FUSE_CAP_SETXATTR_EXT)) {
 798            fuse_log(FUSE_LOG_ERR, "lo_init: Can not enable posix acl."
 799                     " kernel does not support FUSE_POSIX_ACL, FUSE_DONT_MASK"
 800                     " or FUSE_SETXATTR_EXT capability.\n");
 801        } else {
 802            fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling posix acl\n");
 803        }
 804
 805        conn->want |= FUSE_CAP_POSIX_ACL | FUSE_CAP_DONT_MASK |
 806                      FUSE_CAP_SETXATTR_EXT;
 807        lo->change_umask = true;
 808        lo->posix_acl = true;
 809    } else {
 810        /* User either did not specify anything or wants it disabled */
 811        fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix_acl\n");
 812        conn->want &= ~FUSE_CAP_POSIX_ACL;
 813    }
 814
 815    if (lo->user_security_label == 1) {
 816        if (!(conn->capable & FUSE_CAP_SECURITY_CTX)) {
 817            fuse_log(FUSE_LOG_ERR, "lo_init: Can not enable security label."
 818                     " kernel does not support FUSE_SECURITY_CTX capability.\n");
 819        }
 820        conn->want |= FUSE_CAP_SECURITY_CTX;
 821    } else {
 822        fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling security label\n");
 823        conn->want &= ~FUSE_CAP_SECURITY_CTX;
 824    }
 825}
 826
 827static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
 828                       struct fuse_file_info *fi)
 829{
 830    int res;
 831    struct stat buf;
 832    struct lo_data *lo = lo_data(req);
 833
 834    (void)fi;
 835
 836    res =
 837        fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
 838    if (res == -1) {
 839        return (void)fuse_reply_err(req, errno);
 840    }
 841
 842    fuse_reply_attr(req, &buf, lo->timeout);
 843}
 844
 845static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi)
 846{
 847    struct lo_data *lo = lo_data(req);
 848    struct lo_map_elem *elem;
 849
 850    pthread_mutex_lock(&lo->mutex);
 851    elem = lo_map_get(&lo->fd_map, fi->fh);
 852    pthread_mutex_unlock(&lo->mutex);
 853
 854    if (!elem) {
 855        return -1;
 856    }
 857
 858    return elem->fd;
 859}
 860
 861static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
 862                       int valid, struct fuse_file_info *fi)
 863{
 864    int saverr;
 865    char procname[64];
 866    struct lo_data *lo = lo_data(req);
 867    struct lo_inode *inode;
 868    int ifd;
 869    int res;
 870    int fd = -1;
 871
 872    inode = lo_inode(req, ino);
 873    if (!inode) {
 874        fuse_reply_err(req, EBADF);
 875        return;
 876    }
 877
 878    ifd = inode->fd;
 879
 880    /* If fi->fh is invalid we'll report EBADF later */
 881    if (fi) {
 882        fd = lo_fi_fd(req, fi);
 883    }
 884
 885    if (valid & FUSE_SET_ATTR_MODE) {
 886        if (fi) {
 887            res = fchmod(fd, attr->st_mode);
 888        } else {
 889            sprintf(procname, "%i", ifd);
 890            res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0);
 891        }
 892        if (res == -1) {
 893            saverr = errno;
 894            goto out_err;
 895        }
 896    }
 897    if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) {
 898        uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1;
 899        gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1;
 900
 901        saverr = drop_security_capability(lo, ifd);
 902        if (saverr) {
 903            goto out_err;
 904        }
 905
 906        res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
 907        if (res == -1) {
 908            saverr = errno;
 909            goto out_err;
 910        }
 911    }
 912    if (valid & FUSE_SET_ATTR_SIZE) {
 913        int truncfd;
 914        bool kill_suidgid;
 915        bool cap_fsetid_dropped = false;
 916
 917        kill_suidgid = lo->killpriv_v2 && (valid & FUSE_SET_ATTR_KILL_SUIDGID);
 918        if (fi) {
 919            truncfd = fd;
 920        } else {
 921            truncfd = lo_inode_open(lo, inode, O_RDWR);
 922            if (truncfd < 0) {
 923                saverr = -truncfd;
 924                goto out_err;
 925            }
 926        }
 927
 928        saverr = drop_security_capability(lo, truncfd);
 929        if (saverr) {
 930            if (!fi) {
 931                close(truncfd);
 932            }
 933            goto out_err;
 934        }
 935
 936        if (kill_suidgid) {
 937            res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
 938            if (res != 0) {
 939                saverr = res;
 940                if (!fi) {
 941                    close(truncfd);
 942                }
 943                goto out_err;
 944            }
 945        }
 946
 947        res = ftruncate(truncfd, attr->st_size);
 948        saverr = res == -1 ? errno : 0;
 949
 950        if (cap_fsetid_dropped) {
 951            if (gain_effective_cap("FSETID")) {
 952                fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
 953            }
 954        }
 955        if (!fi) {
 956            close(truncfd);
 957        }
 958        if (res == -1) {
 959            goto out_err;
 960        }
 961    }
 962    if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) {
 963        struct timespec tv[2];
 964
 965        tv[0].tv_sec = 0;
 966        tv[1].tv_sec = 0;
 967        tv[0].tv_nsec = UTIME_OMIT;
 968        tv[1].tv_nsec = UTIME_OMIT;
 969
 970        if (valid & FUSE_SET_ATTR_ATIME_NOW) {
 971            tv[0].tv_nsec = UTIME_NOW;
 972        } else if (valid & FUSE_SET_ATTR_ATIME) {
 973            tv[0] = attr->st_atim;
 974        }
 975
 976        if (valid & FUSE_SET_ATTR_MTIME_NOW) {
 977            tv[1].tv_nsec = UTIME_NOW;
 978        } else if (valid & FUSE_SET_ATTR_MTIME) {
 979            tv[1] = attr->st_mtim;
 980        }
 981
 982        if (fi) {
 983            res = futimens(fd, tv);
 984        } else {
 985            sprintf(procname, "%i", inode->fd);
 986            res = utimensat(lo->proc_self_fd, procname, tv, 0);
 987        }
 988        if (res == -1) {
 989            saverr = errno;
 990            goto out_err;
 991        }
 992    }
 993    lo_inode_put(lo, &inode);
 994
 995    return lo_getattr(req, ino, fi);
 996
 997out_err:
 998    lo_inode_put(lo, &inode);
 999    fuse_reply_err(req, saverr);
1000}
1001
1002static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
1003                                uint64_t mnt_id)
1004{
1005    struct lo_inode *p;
1006    struct lo_key key = {
1007        .ino = st->st_ino,
1008        .dev = st->st_dev,
1009        .mnt_id = mnt_id,
1010    };
1011
1012    pthread_mutex_lock(&lo->mutex);
1013    p = g_hash_table_lookup(lo->inodes, &key);
1014    if (p) {
1015        assert(p->nlookup > 0);
1016        p->nlookup++;
1017        g_atomic_int_inc(&p->refcount);
1018    }
1019    pthread_mutex_unlock(&lo->mutex);
1020
1021    return p;
1022}
1023
1024/* value_destroy_func for posix_locks GHashTable */
1025static void posix_locks_value_destroy(gpointer data)
1026{
1027    struct lo_inode_plock *plock = data;
1028
1029    /*
1030     * We had used open() for locks and had only one fd. So
1031     * closing this fd should release all OFD locks.
1032     */
1033    close(plock->fd);
1034    free(plock);
1035}
1036
/*
 * Stat @pathname relative to @dirfd, filling @statbuf and @mnt_id.
 *
 * When statx support with mount IDs is compiled in and lo->use_statx is
 * set, statx() is tried first and its result converted to a struct stat.
 * If statx() fails with ENOSYS, lo->use_statx is cleared and fstatat() is
 * used instead; in that case *mnt_id is set to 0.
 *
 * Returns 0 on success, -1 on failure (errno is left as set by the
 * failing call).
 */
static int do_statx(struct lo_data *lo, int dirfd, const char *pathname,
                    struct stat *statbuf, int flags, uint64_t *mnt_id)
{
#if defined(CONFIG_STATX) && defined(CONFIG_STATX_MNT_ID)
    if (lo->use_statx) {
        struct statx sx;

        if (statx(dirfd, pathname, flags, STATX_BASIC_STATS | STATX_MNT_ID,
                  &sx) == 0) {
            memset(statbuf, 0, sizeof(*statbuf));

            /* Identity and ownership */
            statbuf->st_dev = makedev(sx.stx_dev_major, sx.stx_dev_minor);
            statbuf->st_ino = sx.stx_ino;
            statbuf->st_mode = sx.stx_mode;
            statbuf->st_nlink = sx.stx_nlink;
            statbuf->st_uid = sx.stx_uid;
            statbuf->st_gid = sx.stx_gid;
            statbuf->st_rdev = makedev(sx.stx_rdev_major, sx.stx_rdev_minor);

            /* Size information */
            statbuf->st_size = sx.stx_size;
            statbuf->st_blksize = sx.stx_blksize;
            statbuf->st_blocks = sx.stx_blocks;

            /* Timestamps */
            statbuf->st_atim.tv_sec = sx.stx_atime.tv_sec;
            statbuf->st_atim.tv_nsec = sx.stx_atime.tv_nsec;
            statbuf->st_mtim.tv_sec = sx.stx_mtime.tv_sec;
            statbuf->st_mtim.tv_nsec = sx.stx_mtime.tv_nsec;
            statbuf->st_ctim.tv_sec = sx.stx_ctime.tv_sec;
            statbuf->st_ctim.tv_nsec = sx.stx_ctime.tv_nsec;

            /* The mount ID is only valid if the kernel reported it */
            *mnt_id = (sx.stx_mask & STATX_MNT_ID) ? sx.stx_mnt_id : 0;
            return 0;
        }

        if (errno != ENOSYS) {
            return -1;
        }

        /* statx() is unavailable: remember that and fall back */
        lo->use_statx = false;
    }
#endif

    if (fstatat(dirfd, pathname, statbuf, flags) == -1) {
        return -1;
    }
    *mnt_id = 0;

    return 0;
}
1090
/*
 * Look up @name in the directory identified by @parent and fill in @e with
 * the entry's attributes, timeouts and fuse inode number.
 *
 * Increments nlookup on the inode on success. unref_inode_lolocked() must be
 * called eventually to decrement nlookup again. If inodep is non-NULL, the
 * inode pointer is stored and the caller must call lo_inode_put().
 *
 * Returns 0 on success, or a positive errno value on failure (ENOENT when
 * @parent is unknown).
 */
static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
                        struct fuse_entry_param *e,
                        struct lo_inode **inodep)
{
    int newfd;
    int res;
    int saverr;
    uint64_t mnt_id;
    struct lo_data *lo = lo_data(req);
    struct lo_inode *inode = NULL;
    struct lo_inode *dir = lo_inode(req, parent);

    if (inodep) {
        *inodep = NULL; /* in case there is an error */
    }

    /*
     * name_to_handle_at() and open_by_handle_at() can reach here with fuse
     * mount point in guest, but we don't have its inode info in the
     * ino_map.
     */
    if (!dir) {
        return ENOENT;
    }

    memset(e, 0, sizeof(*e));
    e->attr_timeout = lo->timeout;
    e->entry_timeout = lo->timeout;

    /* Do not allow escaping root directory */
    if (dir == &lo->root && strcmp(name, "..") == 0) {
        name = ".";
    }

    /* O_PATH | O_NOFOLLOW: obtain a handle on the entry itself */
    newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
    if (newfd == -1) {
        goto out_err;
    }

    res = do_statx(lo, newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
                   &mnt_id);
    if (res == -1) {
        goto out_err;
    }

    /*
     * A directory whose device or mount ID differs from its parent's is
     * flagged as a submount when announce_submounts is enabled.
     */
    if (S_ISDIR(e->attr.st_mode) && lo->announce_submounts &&
        (e->attr.st_dev != dir->key.dev || mnt_id != dir->key.mnt_id)) {
        e->attr_flags |= FUSE_ATTR_SUBMOUNT;
    }

    inode = lo_find(lo, &e->attr, mnt_id);
    if (inode) {
        /* Already tracked: the existing inode keeps its own fd */
        close(newfd);
    } else {
        inode = calloc(1, sizeof(struct lo_inode));
        if (!inode) {
            goto out_err;
        }

        /* cache only filetype */
        inode->filetype = (e->attr.st_mode & S_IFMT);

        /*
         * One for the caller and one for nlookup (released in
         * unref_inode_lolocked())
         */
        g_atomic_int_set(&inode->refcount, 2);

        inode->nlookup = 1;
        inode->fd = newfd; /* the new inode takes ownership of newfd */
        inode->key.ino = e->attr.st_ino;
        inode->key.dev = e->attr.st_dev;
        inode->key.mnt_id = mnt_id;
        if (lo->posix_lock) {
            pthread_mutex_init(&inode->plock_mutex, NULL);
            inode->posix_locks = g_hash_table_new_full(
                g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
        }
        /* Publish the inode in both maps under lo->mutex */
        pthread_mutex_lock(&lo->mutex);
        inode->fuse_ino = lo_add_inode_mapping(req, inode);
        g_hash_table_insert(lo->inodes, &inode->key, inode);
        pthread_mutex_unlock(&lo->mutex);
    }
    e->ino = inode->fuse_ino;

    /* Transfer ownership of inode pointer to caller or drop it */
    if (inodep) {
        *inodep = inode;
    } else {
        lo_inode_put(lo, &inode);
    }

    lo_inode_put(lo, &dir);

    fuse_log(FUSE_LOG_DEBUG, "  %lli/%s -> %lli\n", (unsigned long long)parent,
             name, (unsigned long long)e->ino);

    return 0;

out_err:
    /* Capture errno before the cleanup calls below can overwrite it */
    saverr = errno;
    if (newfd != -1) {
        close(newfd);
    }
    /*
     * inode is still NULL on every path that jumps here; presumably
     * lo_inode_put() tolerates a NULL inode - confirm against its definition.
     */
    lo_inode_put(lo, &inode);
    lo_inode_put(lo, &dir);
    return saverr;
}
1204
1205static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
1206{
1207    struct fuse_entry_param e;
1208    int err;
1209
1210    fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent,
1211             name);
1212
1213    if (is_empty(name)) {
1214        fuse_reply_err(req, ENOENT);
1215        return;
1216    }
1217
1218    /*
1219     * Don't use is_safe_path_component(), allow "." and ".." for NFS export
1220     * support.
1221     */
1222    if (strchr(name, '/')) {
1223        fuse_reply_err(req, EINVAL);
1224        return;
1225    }
1226
1227    err = lo_do_lookup(req, parent, name, &e, NULL);
1228    if (err) {
1229        fuse_reply_err(req, err);
1230    } else {
1231        fuse_reply_entry(req, &e);
1232    }
1233}
1234
/*
 * On some archs, setres*id is limited to 2^16 but they
 * provide setres*id32 variants that allow 2^32.
 * Others just let setres*id do 2^32 anyway.
 *
 * OURSYS_setresgid / OURSYS_setresuid select the 32-bit syscall number
 * when the platform defines one and the plain one otherwise. They are
 * meant to be passed to syscall().
 */
#ifdef SYS_setresgid32
#define OURSYS_setresgid SYS_setresgid32
#else
#define OURSYS_setresgid SYS_setresgid
#endif

#ifdef SYS_setresuid32
#define OURSYS_setresuid SYS_setresuid32
#else
#define OURSYS_setresuid SYS_setresuid
#endif
1251
1252static void drop_supplementary_groups(void)
1253{
1254    int ret;
1255
1256    ret = getgroups(0, NULL);
1257    if (ret == -1) {
1258        fuse_log(FUSE_LOG_ERR, "getgroups() failed with error=%d:%s\n",
1259                 errno, strerror(errno));
1260        exit(1);
1261    }
1262
1263    if (!ret) {
1264        return;
1265    }
1266
1267    /* Drop all supplementary groups. We should not need it */
1268    ret = setgroups(0, NULL);
1269    if (ret == -1) {
1270        fuse_log(FUSE_LOG_ERR, "setgroups() failed with error=%d:%s\n",
1271                 errno, strerror(errno));
1272        exit(1);
1273    }
1274}
1275
1276/*
1277 * Change to uid/gid of caller so that file is created with
1278 * ownership of caller.
1279 * TODO: What about selinux context?
1280 */
1281static int lo_change_cred(fuse_req_t req, struct lo_cred *old,
1282                          bool change_umask)
1283{
1284    int res;
1285
1286    old->euid = geteuid();
1287    old->egid = getegid();
1288
1289    res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1);
1290    if (res == -1) {
1291        return errno;
1292    }
1293
1294    res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1);
1295    if (res == -1) {
1296        int errno_save = errno;
1297
1298        syscall(OURSYS_setresgid, -1, old->egid, -1);
1299        return errno_save;
1300    }
1301
1302    if (change_umask) {
1303        old->umask = umask(req->ctx.umask);
1304    }
1305    return 0;
1306}
1307
1308/* Regain Privileges */
1309static void lo_restore_cred(struct lo_cred *old, bool restore_umask)
1310{
1311    int res;
1312
1313    res = syscall(OURSYS_setresuid, -1, old->euid, -1);
1314    if (res == -1) {
1315        fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid);
1316        exit(1);
1317    }
1318
1319    res = syscall(OURSYS_setresgid, -1, old->egid, -1);
1320    if (res == -1) {
1321        fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid);
1322        exit(1);
1323    }
1324
1325    if (restore_umask)
1326        umask(old->umask);
1327}
1328
1329/*
1330 * A helper to change cred and drop capability. Returns 0 on success and
1331 * errno on error
1332 */
1333static int lo_drop_cap_change_cred(fuse_req_t req, struct lo_cred *old,
1334                                   bool change_umask, const char *cap_name,
1335                                   bool *cap_dropped)
1336{
1337    int ret;
1338    bool __cap_dropped;
1339
1340    assert(cap_name);
1341
1342    ret = drop_effective_cap(cap_name, &__cap_dropped);
1343    if (ret) {
1344        return ret;
1345    }
1346
1347    ret = lo_change_cred(req, old, change_umask);
1348    if (ret) {
1349        if (__cap_dropped) {
1350            if (gain_effective_cap(cap_name)) {
1351                fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name);
1352            }
1353        }
1354    }
1355
1356    if (cap_dropped) {
1357        *cap_dropped = __cap_dropped;
1358    }
1359    return ret;
1360}
1361
1362static void lo_restore_cred_gain_cap(struct lo_cred *old, bool restore_umask,
1363                                     const char *cap_name)
1364{
1365    assert(cap_name);
1366
1367    lo_restore_cred(old, restore_umask);
1368
1369    if (gain_effective_cap(cap_name)) {
1370        fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name);
1371    }
1372}
1373
1374static int do_mknod_symlink_secctx(fuse_req_t req, struct lo_inode *dir,
1375                                   const char *name, const char *secctx_name)
1376{
1377    int path_fd, err;
1378    char procname[64];
1379    struct lo_data *lo = lo_data(req);
1380
1381    if (!req->secctx.ctxlen) {
1382        return 0;
1383    }
1384
1385    /* Open newly created element with O_PATH */
1386    path_fd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
1387    err = path_fd == -1 ? errno : 0;
1388    if (err) {
1389        return err;
1390    }
1391    sprintf(procname, "%i", path_fd);
1392    FCHDIR_NOFAIL(lo->proc_self_fd);
1393    /* Set security context. This is not atomic w.r.t file creation */
1394    err = setxattr(procname, secctx_name, req->secctx.ctx, req->secctx.ctxlen,
1395                   0);
1396    if (err) {
1397        err = errno;
1398    }
1399    FCHDIR_NOFAIL(lo->root.fd);
1400    close(path_fd);
1401    return err;
1402}
1403
/*
 * Create the entry @name inside @dir via mknod_wrapper(), running with the
 * caller's credentials, and apply the request's security context if one is
 * present.
 *
 * The security context is applied either up front through fscreate (when
 * the xattr name was not remapped and lo->use_fscreate is set) or after
 * creation through do_mknod_symlink_secctx(); in the latter case the new
 * entry is unlinked again if setting the context fails.
 *
 * Returns 0 on success or a positive errno value.
 */
static int do_mknod_symlink(fuse_req_t req, struct lo_inode *dir,
                            const char *name, mode_t mode, dev_t rdev,
                            const char *link)
{
    int err, fscreate_fd = -1;
    const char *secctx_name = req->secctx.name;
    struct lo_cred old = {};
    struct lo_data *lo = lo_data(req);
    char *mapped_name = NULL;
    bool secctx_enabled = req->secctx.ctxlen;
    bool do_fscreate = false;

    /* Translate the xattr name through the xattr map, if one is configured */
    if (secctx_enabled && lo->xattrmap) {
        err = xattr_map_client(lo, req->secctx.name, &mapped_name);
        if (err < 0) {
            return -err;
        }
        secctx_name = mapped_name;
    }

    /*
     * If security xattr has not been remapped and selinux is enabled on
     * host, set fscreate and no need to do a setxattr() after file creation
     */
    if (secctx_enabled && !mapped_name && lo->use_fscreate) {
        do_fscreate = true;
        err = open_set_proc_fscreate(lo, req->secctx.ctx, req->secctx.ctxlen,
                                     &fscreate_fd);
        if (err) {
            goto out;
        }
    }

    /* The umask is only switched for non-symlink creation */
    err = lo_change_cred(req, &old, lo->change_umask && !S_ISLNK(mode));
    if (err) {
        goto out;
    }

    err = mknod_wrapper(dir->fd, name, link, mode, rdev);
    /* Read errno before lo_restore_cred() can overwrite it */
    err = err == -1 ? errno : 0;
    lo_restore_cred(&old, lo->change_umask && !S_ISLNK(mode));
    if (err) {
        goto out;
    }

    if (!do_fscreate) {
        err = do_mknod_symlink_secctx(req, dir, name, secctx_name);
        if (err) {
            /* Do not leave an entry without its security context behind */
            unlinkat(dir->fd, name, S_ISDIR(mode) ? AT_REMOVEDIR : 0);
        }
    }
out:
    if (fscreate_fd != -1) {
        close_reset_proc_fscreate(fscreate_fd);
    }
    g_free(mapped_name);
    return err;
}
1462
1463static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
1464                             const char *name, mode_t mode, dev_t rdev,
1465                             const char *link)
1466{
1467    int saverr;
1468    struct lo_data *lo = lo_data(req);
1469    struct lo_inode *dir;
1470    struct fuse_entry_param e;
1471
1472    if (is_empty(name)) {
1473        fuse_reply_err(req, ENOENT);
1474        return;
1475    }
1476
1477    if (!is_safe_path_component(name)) {
1478        fuse_reply_err(req, EINVAL);
1479        return;
1480    }
1481
1482    dir = lo_inode(req, parent);
1483    if (!dir) {
1484        fuse_reply_err(req, EBADF);
1485        return;
1486    }
1487
1488    saverr = do_mknod_symlink(req, dir, name, mode, rdev, link);
1489    if (saverr) {
1490        goto out;
1491    }
1492
1493    saverr = lo_do_lookup(req, parent, name, &e, NULL);
1494    if (saverr) {
1495        goto out;
1496    }
1497
1498    fuse_log(FUSE_LOG_DEBUG, "  %lli/%s -> %lli\n", (unsigned long long)parent,
1499             name, (unsigned long long)e.ino);
1500
1501    fuse_reply_entry(req, &e);
1502    lo_inode_put(lo, &dir);
1503    return;
1504
1505out:
1506    lo_inode_put(lo, &dir);
1507    fuse_reply_err(req, saverr);
1508}
1509
1510static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
1511                     mode_t mode, dev_t rdev)
1512{
1513    lo_mknod_symlink(req, parent, name, mode, rdev, NULL);
1514}
1515
1516static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
1517                     mode_t mode)
1518{
1519    lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL);
1520}
1521
1522static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent,
1523                       const char *name)
1524{
1525    lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link);
1526}
1527
/*
 * FUSE link handler: creates a hard link called @name in directory @parent
 * that refers to the inode @ino, then replies with the linked inode's
 * attributes. On success the inode's nlookup is incremented. On failure
 * an errno is sent as the reply.
 */
static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent,
                    const char *name)
{
    int res;
    struct lo_data *lo = lo_data(req);
    struct lo_inode *parent_inode;
    struct lo_inode *inode;
    struct fuse_entry_param e;
    char procname[64];
    int saverr;

    if (is_empty(name)) {
        fuse_reply_err(req, ENOENT);
        return;
    }

    if (!is_safe_path_component(name)) {
        fuse_reply_err(req, EINVAL);
        return;
    }

    parent_inode = lo_inode(req, parent);
    inode = lo_inode(req, ino);
    if (!parent_inode || !inode) {
        /* out_err reports errno, so set it explicitly for this case */
        errno = EBADF;
        goto out_err;
    }

    memset(&e, 0, sizeof(struct fuse_entry_param));
    e.attr_timeout = lo->timeout;
    e.entry_timeout = lo->timeout;

    /*
     * The link source is named by the inode's fd number, resolved relative
     * to lo->proc_self_fd with AT_SYMLINK_FOLLOW.
     */
    sprintf(procname, "%i", inode->fd);
    res = linkat(lo->proc_self_fd, procname, parent_inode->fd, name,
                 AT_SYMLINK_FOLLOW);
    if (res == -1) {
        goto out_err;
    }

    res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
    if (res == -1) {
        goto out_err;
    }

    /* The reply hands another lookup reference on this inode to the client */
    pthread_mutex_lock(&lo->mutex);
    inode->nlookup++;
    pthread_mutex_unlock(&lo->mutex);
    e.ino = inode->fuse_ino;

    fuse_log(FUSE_LOG_DEBUG, "  %lli/%s -> %lli\n", (unsigned long long)parent,
             name, (unsigned long long)e.ino);

    fuse_reply_entry(req, &e);
    lo_inode_put(lo, &parent_inode);
    lo_inode_put(lo, &inode);
    return;

out_err:
    /* Save errno before the cleanup calls can overwrite it */
    saverr = errno;
    lo_inode_put(lo, &parent_inode);
    lo_inode_put(lo, &inode);
    fuse_reply_err(req, saverr);
}
1591
1592/* Increments nlookup and caller must release refcount using lo_inode_put() */
1593static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
1594                                    const char *name)
1595{
1596    int res;
1597    uint64_t mnt_id;
1598    struct stat attr;
1599    struct lo_data *lo = lo_data(req);
1600    struct lo_inode *dir = lo_inode(req, parent);
1601
1602    if (!dir) {
1603        return NULL;
1604    }
1605
1606    res = do_statx(lo, dir->fd, name, &attr, AT_SYMLINK_NOFOLLOW, &mnt_id);
1607    lo_inode_put(lo, &dir);
1608    if (res == -1) {
1609        return NULL;
1610    }
1611
1612    return lo_find(lo, &attr, mnt_id);
1613}
1614
1615static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
1616{
1617    int res;
1618    struct lo_inode *inode;
1619    struct lo_data *lo = lo_data(req);
1620
1621    if (is_empty(name)) {
1622        fuse_reply_err(req, ENOENT);
1623        return;
1624    }
1625
1626    if (!is_safe_path_component(name)) {
1627        fuse_reply_err(req, EINVAL);
1628        return;
1629    }
1630
1631    inode = lookup_name(req, parent, name);
1632    if (!inode) {
1633        fuse_reply_err(req, EIO);
1634        return;
1635    }
1636
1637    res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR);
1638
1639    fuse_reply_err(req, res == -1 ? errno : 0);
1640    unref_inode_lolocked(lo, inode, 1);
1641    lo_inode_put(lo, &inode);
1642}
1643
/*
 * FUSE rename handler: renames @name in @parent to @newname in @newparent.
 *
 * With non-zero @flags the renameat2 syscall is used when SYS_renameat2 is
 * defined (ENOSYS is reported to the client as EINVAL); without it, EINVAL
 * is returned. With zero @flags plain renameat() is used.
 *
 * The reply is sent before the references and lookup counts taken via
 * lookup_name() are dropped again at the out label.
 */
static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
                      fuse_ino_t newparent, const char *newname,
                      unsigned int flags)
{
    int res;
    struct lo_inode *parent_inode;
    struct lo_inode *newparent_inode;
    struct lo_inode *oldinode = NULL;
    struct lo_inode *newinode = NULL;
    struct lo_data *lo = lo_data(req);

    if (is_empty(name) || is_empty(newname)) {
        fuse_reply_err(req, ENOENT);
        return;
    }

    if (!is_safe_path_component(name) || !is_safe_path_component(newname)) {
        fuse_reply_err(req, EINVAL);
        return;
    }

    parent_inode = lo_inode(req, parent);
    newparent_inode = lo_inode(req, newparent);
    if (!parent_inode || !newparent_inode) {
        fuse_reply_err(req, EBADF);
        goto out;
    }

    /* newinode may legitimately be NULL; only the source is checked below */
    oldinode = lookup_name(req, parent, name);
    newinode = lookup_name(req, newparent, newname);

    if (!oldinode) {
        fuse_reply_err(req, EIO);
        goto out;
    }

    if (flags) {
#ifndef SYS_renameat2
        fuse_reply_err(req, EINVAL);
#else
        res = syscall(SYS_renameat2, parent_inode->fd, name,
                        newparent_inode->fd, newname, flags);
        if (res == -1 && errno == ENOSYS) {
            fuse_reply_err(req, EINVAL);
        } else {
            fuse_reply_err(req, res == -1 ? errno : 0);
        }
#endif
        goto out;
    }

    res = renameat(parent_inode->fd, name, newparent_inode->fd, newname);

    fuse_reply_err(req, res == -1 ? errno : 0);
out:
    /* unref_inode_lolocked() ignores NULL inodes */
    unref_inode_lolocked(lo, oldinode, 1);
    unref_inode_lolocked(lo, newinode, 1);
    lo_inode_put(lo, &oldinode);
    lo_inode_put(lo, &newinode);
    lo_inode_put(lo, &parent_inode);
    lo_inode_put(lo, &newparent_inode);
}
1706
1707static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
1708{
1709    int res;
1710    struct lo_inode *inode;
1711    struct lo_data *lo = lo_data(req);
1712
1713    if (is_empty(name)) {
1714        fuse_reply_err(req, ENOENT);
1715        return;
1716    }
1717
1718    if (!is_safe_path_component(name)) {
1719        fuse_reply_err(req, EINVAL);
1720        return;
1721    }
1722
1723    inode = lookup_name(req, parent, name);
1724    if (!inode) {
1725        fuse_reply_err(req, EIO);
1726        return;
1727    }
1728
1729    res = unlinkat(lo_fd(req, parent), name, 0);
1730
1731    fuse_reply_err(req, res == -1 ? errno : 0);
1732    unref_inode_lolocked(lo, inode, 1);
1733    lo_inode_put(lo, &inode);
1734}
1735
1736/* To be called with lo->mutex held */
1737static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
1738{
1739    if (!inode) {
1740        return;
1741    }
1742
1743    assert(inode->nlookup >= n);
1744    inode->nlookup -= n;
1745    if (!inode->nlookup) {
1746        lo_map_remove(&lo->ino_map, inode->fuse_ino);
1747        g_hash_table_remove(lo->inodes, &inode->key);
1748        if (lo->posix_lock) {
1749            if (g_hash_table_size(inode->posix_locks)) {
1750                fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
1751            }
1752            g_hash_table_destroy(inode->posix_locks);
1753            pthread_mutex_destroy(&inode->plock_mutex);
1754        }
1755        /* Drop our refcount from lo_do_lookup() */
1756        lo_inode_put(lo, &inode);
1757    }
1758}
1759
1760static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
1761                                 uint64_t n)
1762{
1763    if (!inode) {
1764        return;
1765    }
1766
1767    pthread_mutex_lock(&lo->mutex);
1768    unref_inode(lo, inode, n);
1769    pthread_mutex_unlock(&lo->mutex);
1770}
1771
1772static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1773{
1774    struct lo_data *lo = lo_data(req);
1775    struct lo_inode *inode;
1776
1777    inode = lo_inode(req, ino);
1778    if (!inode) {
1779        return;
1780    }
1781
1782    fuse_log(FUSE_LOG_DEBUG, "  forget %lli %lli -%lli\n",
1783             (unsigned long long)ino, (unsigned long long)inode->nlookup,
1784             (unsigned long long)nlookup);
1785
1786    unref_inode_lolocked(lo, inode, nlookup);
1787    lo_inode_put(lo, &inode);
1788}
1789
1790static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1791{
1792    lo_forget_one(req, ino, nlookup);
1793    fuse_reply_none(req);
1794}
1795
1796static void lo_forget_multi(fuse_req_t req, size_t count,
1797                            struct fuse_forget_data *forgets)
1798{
1799    int i;
1800
1801    for (i = 0; i < count; i++) {
1802        lo_forget_one(req, forgets[i].ino, forgets[i].nlookup);
1803    }
1804    fuse_reply_none(req);
1805}
1806
1807static void lo_readlink(fuse_req_t req, fuse_ino_t ino)
1808{
1809    char buf[PATH_MAX + 1];
1810    int res;
1811
1812    res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf));
1813    if (res == -1) {
1814        return (void)fuse_reply_err(req, errno);
1815    }
1816
1817    if (res == sizeof(buf)) {
1818        return (void)fuse_reply_err(req, ENAMETOOLONG);
1819    }
1820
1821    buf[res] = '\0';
1822
1823    fuse_reply_readlink(req, buf);
1824}
1825
/* State of one open directory handle, created by lo_opendir() */
struct lo_dirp {
    /* Set to 1 in lo_opendir(); taken in lo_dirp(), dropped in lo_dirp_put() */
    gint refcount;
    /* Directory stream obtained from fdopendir() */
    DIR *dp;
    /* Entry read by readdir() but not yet stored in a reply, or NULL */
    struct dirent *entry;
    /* Offset of the stream as tracked by lo_do_readdir() */
    off_t offset;
};
1832
1833static void lo_dirp_put(struct lo_dirp **dp)
1834{
1835    struct lo_dirp *d = *dp;
1836
1837    if (!d) {
1838        return;
1839    }
1840    *dp = NULL;
1841
1842    if (g_atomic_int_dec_and_test(&d->refcount)) {
1843        closedir(d->dp);
1844        free(d);
1845    }
1846}
1847
1848/* Call lo_dirp_put() on the return value when no longer needed */
1849static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi)
1850{
1851    struct lo_data *lo = lo_data(req);
1852    struct lo_map_elem *elem;
1853
1854    pthread_mutex_lock(&lo->mutex);
1855    elem = lo_map_get(&lo->dirp_map, fi->fh);
1856    if (elem) {
1857        g_atomic_int_inc(&elem->dirp->refcount);
1858    }
1859    pthread_mutex_unlock(&lo->mutex);
1860    if (!elem) {
1861        return NULL;
1862    }
1863
1864    return elem->dirp;
1865}
1866
1867static void lo_opendir(fuse_req_t req, fuse_ino_t ino,
1868                       struct fuse_file_info *fi)
1869{
1870    int error = ENOMEM;
1871    struct lo_data *lo = lo_data(req);
1872    struct lo_dirp *d;
1873    int fd;
1874    ssize_t fh;
1875
1876    d = calloc(1, sizeof(struct lo_dirp));
1877    if (d == NULL) {
1878        goto out_err;
1879    }
1880
1881    fd = openat(lo_fd(req, ino), ".", O_RDONLY);
1882    if (fd == -1) {
1883        goto out_errno;
1884    }
1885
1886    d->dp = fdopendir(fd);
1887    if (d->dp == NULL) {
1888        goto out_errno;
1889    }
1890
1891    d->offset = 0;
1892    d->entry = NULL;
1893
1894    g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */
1895    pthread_mutex_lock(&lo->mutex);
1896    fh = lo_add_dirp_mapping(req, d);
1897    pthread_mutex_unlock(&lo->mutex);
1898    if (fh == -1) {
1899        goto out_err;
1900    }
1901
1902    fi->fh = fh;
1903    if (lo->cache == CACHE_ALWAYS) {
1904        fi->cache_readdir = 1;
1905    }
1906    fuse_reply_open(req, fi);
1907    return;
1908
1909out_errno:
1910    error = errno;
1911out_err:
1912    if (d) {
1913        if (d->dp) {
1914            closedir(d->dp);
1915        } else if (fd != -1) {
1916            close(fd);
1917        }
1918        free(d);
1919    }
1920    fuse_reply_err(req, error);
1921}
1922
/*
 * Common implementation of readdir and readdirplus.
 *
 * Fills a buffer of at most @size bytes with directory entries of the
 * handle in @fi, starting at @offset, and sends it as the reply. With
 * @plus set, each entry other than "." and ".." is looked up through
 * lo_do_lookup() so that full entry parameters are returned.
 *
 * An error is only reported if no entry has been stored yet; otherwise the
 * entries collected so far are returned.
 */
static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
                          off_t offset, struct fuse_file_info *fi, int plus)
{
    struct lo_data *lo = lo_data(req);
    struct lo_dirp *d = NULL;
    struct lo_inode *dinode;
    g_autofree char *buf = NULL;
    char *p;
    size_t rem = size; /* bytes still free in buf */
    int err = EBADF;

    dinode = lo_inode(req, ino);
    if (!dinode) {
        goto error;
    }

    d = lo_dirp(req, fi);
    if (!d) {
        goto error;
    }

    err = ENOMEM;
    buf = g_try_malloc0(size);
    if (!buf) {
        goto error;
    }
    p = buf;

    /* Reposition the stream if the client asks for a different offset */
    if (offset != d->offset) {
        seekdir(d->dp, offset);
        d->entry = NULL;
        d->offset = offset;
    }
    while (1) {
        size_t entsize;
        off_t nextoff;
        const char *name;

        /* d->entry is non-NULL when a previous call could not fit it */
        if (!d->entry) {
            /* readdir() returns NULL for both error and end of stream */
            errno = 0;
            d->entry = readdir(d->dp);
            if (!d->entry) {
                if (errno) { /* Error */
                    err = errno;
                    goto error;
                } else { /* End of stream */
                    break;
                }
            }
        }
        nextoff = d->entry->d_off;
        name = d->entry->d_name;

        /* Non-zero only when lo_do_lookup() took a lookup reference */
        fuse_ino_t entry_ino = 0;
        struct fuse_entry_param e = (struct fuse_entry_param){
            .attr.st_ino = d->entry->d_ino,
            .attr.st_mode = d->entry->d_type << 12,
        };

        /* Hide root's parent directory */
        if (dinode == &lo->root && strcmp(name, "..") == 0) {
            e.attr.st_ino = lo->root.key.ino;
            e.attr.st_mode = DT_DIR << 12;
        }

        if (plus) {
            if (!is_dot_or_dotdot(name)) {
                err = lo_do_lookup(req, ino, name, &e, NULL);
                if (err) {
                    goto error;
                }
                entry_ino = e.ino;
            }

            entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff);
        } else {
            entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff);
        }
        if (entsize > rem) {
            /* Entry did not fit: undo the lookup and keep it for next time */
            if (entry_ino != 0) {
                lo_forget_one(req, entry_ino, 1);
            }
            break;
        }

        p += entsize;
        rem -= entsize;

        /* Entry consumed: advance the tracked position */
        d->entry = NULL;
        d->offset = nextoff;
    }

    err = 0;
error:
    lo_dirp_put(&d);
    lo_inode_put(lo, &dinode);

    /*
     * If there's an error, we can only signal it if we haven't stored
     * any entries yet - otherwise we'd end up with wrong lookup
     * counts for the entries that are already in the buffer. So we
     * return what we've collected until that point.
     */
    if (err && rem == size) {
        fuse_reply_err(req, err);
    } else {
        fuse_reply_buf(req, buf, size - rem);
    }
}
2032
2033static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
2034                       off_t offset, struct fuse_file_info *fi)
2035{
2036    lo_do_readdir(req, ino, size, offset, fi, 0);
2037}
2038
2039static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size,
2040                           off_t offset, struct fuse_file_info *fi)
2041{
2042    lo_do_readdir(req, ino, size, offset, fi, 1);
2043}
2044
2045static void lo_releasedir(fuse_req_t req, fuse_ino_t ino,
2046                          struct fuse_file_info *fi)
2047{
2048    struct lo_data *lo = lo_data(req);
2049    struct lo_map_elem *elem;
2050    struct lo_dirp *d;
2051
2052    (void)ino;
2053
2054    pthread_mutex_lock(&lo->mutex);
2055    elem = lo_map_get(&lo->dirp_map, fi->fh);
2056    if (!elem) {
2057        pthread_mutex_unlock(&lo->mutex);
2058        fuse_reply_err(req, EBADF);
2059        return;
2060    }
2061
2062    d = elem->dirp;
2063    lo_map_remove(&lo->dirp_map, fi->fh);
2064    pthread_mutex_unlock(&lo->mutex);
2065
2066    lo_dirp_put(&d); /* paired with lo_opendir() */
2067
2068    fuse_reply_err(req, 0);
2069}
2070
2071static void update_open_flags(int writeback, int allow_direct_io,
2072                              struct fuse_file_info *fi)
2073{
2074    /*
2075     * With writeback cache, kernel may send read requests even
2076     * when userspace opened write-only
2077     */
2078    if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) {
2079        fi->flags &= ~O_ACCMODE;
2080        fi->flags |= O_RDWR;
2081    }
2082
2083    /*
2084     * With writeback cache, O_APPEND is handled by the kernel.
2085     * This breaks atomicity (since the file may change in the
2086     * underlying filesystem, so that the kernel's idea of the
2087     * end of the file isn't accurate anymore). In this example,
2088     * we just accept that. A more rigorous filesystem may want
2089     * to return an error here
2090     */
2091    if (writeback && (fi->flags & O_APPEND)) {
2092        fi->flags &= ~O_APPEND;
2093    }
2094
2095    /*
2096     * O_DIRECT in guest should not necessarily mean bypassing page
2097     * cache on host as well. Therefore, we discard it by default
2098     * ('-o no_allow_direct_io'). If somebody needs that behavior,
2099     * the '-o allow_direct_io' option should be set.
2100     */
2101    if (!allow_direct_io) {
2102        fi->flags &= ~O_DIRECT;
2103    }
2104}
2105
2106/*
2107 * Open a regular file, set up an fd mapping, and fill out the struct
2108 * fuse_file_info for it. If existing_fd is not negative, use that fd instead
2109 * opening a new one. Takes ownership of existing_fd.
2110 *
2111 * Returns 0 on success or a positive errno.
2112 */
2113static int lo_do_open(struct lo_data *lo, struct lo_inode *inode,
2114                      int existing_fd, struct fuse_file_info *fi)
2115{
2116    ssize_t fh;
2117    int fd = existing_fd;
2118    int err;
2119    bool cap_fsetid_dropped = false;
2120    bool kill_suidgid = lo->killpriv_v2 && fi->kill_priv;
2121
2122    update_open_flags(lo->writeback, lo->allow_direct_io, fi);
2123
2124    if (fd < 0) {
2125        if (kill_suidgid) {
2126            err = drop_effective_cap("FSETID", &cap_fsetid_dropped);
2127            if (err) {
2128                return err;
2129            }
2130        }
2131
2132        fd = lo_inode_open(lo, inode, fi->flags);
2133
2134        if (cap_fsetid_dropped) {
2135            if (gain_effective_cap("FSETID")) {
2136                fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
2137            }
2138        }
2139        if (fd < 0) {
2140            return -fd;
2141        }
2142        if (fi->flags & (O_TRUNC)) {
2143            int err = drop_security_capability(lo, fd);
2144            if (err) {
2145                close(fd);
2146                return err;
2147            }
2148        }
2149    }
2150
2151    pthread_mutex_lock(&lo->mutex);
2152    fh = lo_add_fd_mapping(lo, fd);
2153    pthread_mutex_unlock(&lo->mutex);
2154    if (fh == -1) {
2155        close(fd);
2156        return ENOMEM;
2157    }
2158
2159    fi->fh = fh;
2160    if (lo->cache == CACHE_NONE) {
2161        fi->direct_io = 1;
2162    } else if (lo->cache == CACHE_ALWAYS) {
2163        fi->keep_cache = 1;
2164    }
2165    return 0;
2166}
2167
2168static int do_create_nosecctx(fuse_req_t req, struct lo_inode *parent_inode,
2169                               const char *name, mode_t mode,
2170                               struct fuse_file_info *fi, int *open_fd,
2171                              bool tmpfile)
2172{
2173    int err, fd;
2174    struct lo_cred old = {};
2175    struct lo_data *lo = lo_data(req);
2176    int flags;
2177
2178    if (tmpfile) {
2179        flags = fi->flags | O_TMPFILE;
2180        /*
2181         * Don't use O_EXCL as we want to link file later. Also reset O_CREAT
2182         * otherwise openat() returns -EINVAL.
2183         */
2184        flags &= ~(O_CREAT | O_EXCL);
2185
2186        /* O_TMPFILE needs either O_RDWR or O_WRONLY */
2187        if ((flags & O_ACCMODE) == O_RDONLY) {
2188            flags |= O_RDWR;
2189        }
2190    } else {
2191        flags = fi->flags | O_CREAT | O_EXCL;
2192    }
2193
2194    err = lo_change_cred(req, &old, lo->change_umask);
2195    if (err) {
2196        return err;
2197    }
2198
2199    /* Try to create a new file but don't open existing files */
2200    fd = openat(parent_inode->fd, name, flags, mode);
2201    err = fd == -1 ? errno : 0;
2202    lo_restore_cred(&old, lo->change_umask);
2203    if (!err) {
2204        *open_fd = fd;
2205    }
2206    return err;
2207}
2208
2209static int do_create_secctx_fscreate(fuse_req_t req,
2210                                     struct lo_inode *parent_inode,
2211                                     const char *name, mode_t mode,
2212                                     struct fuse_file_info *fi, int *open_fd)
2213{
2214    int err = 0, fd = -1, fscreate_fd = -1;
2215    struct lo_data *lo = lo_data(req);
2216
2217    err = open_set_proc_fscreate(lo, req->secctx.ctx, req->secctx.ctxlen,
2218                                 &fscreate_fd);
2219    if (err) {
2220        return err;
2221    }
2222
2223    err = do_create_nosecctx(req, parent_inode, name, mode, fi, &fd, false);
2224
2225    close_reset_proc_fscreate(fscreate_fd);
2226    if (!err) {
2227        *open_fd = fd;
2228    }
2229    return err;
2230}
2231
2232static int do_create_secctx_tmpfile(fuse_req_t req,
2233                                    struct lo_inode *parent_inode,
2234                                    const char *name, mode_t mode,
2235                                    struct fuse_file_info *fi,
2236                                    const char *secctx_name, int *open_fd)
2237{
2238    int err, fd = -1;
2239    struct lo_data *lo = lo_data(req);
2240    char procname[64];
2241
2242    err = do_create_nosecctx(req, parent_inode, ".", mode, fi, &fd, true);
2243    if (err) {
2244        return err;
2245    }
2246
2247    err = fsetxattr(fd, secctx_name, req->secctx.ctx, req->secctx.ctxlen, 0);
2248    if (err) {
2249        err = errno;
2250        goto out;
2251    }
2252
2253    /* Security context set on file. Link it in place */
2254    sprintf(procname, "%d", fd);
2255    FCHDIR_NOFAIL(lo->proc_self_fd);
2256    err = linkat(AT_FDCWD, procname, parent_inode->fd, name,
2257                 AT_SYMLINK_FOLLOW);
2258    err = err == -1 ? errno : 0;
2259    FCHDIR_NOFAIL(lo->root.fd);
2260
2261out:
2262    if (!err) {
2263        *open_fd = fd;
2264    } else if (fd != -1) {
2265        close(fd);
2266    }
2267    return err;
2268}
2269
2270static int do_create_secctx_noatomic(fuse_req_t req,
2271                                     struct lo_inode *parent_inode,
2272                                     const char *name, mode_t mode,
2273                                     struct fuse_file_info *fi,
2274                                     const char *secctx_name, int *open_fd)
2275{
2276    int err = 0, fd = -1;
2277
2278    err = do_create_nosecctx(req, parent_inode, name, mode, fi, &fd, false);
2279    if (err) {
2280        goto out;
2281    }
2282
2283    /* Set security context. This is not atomic w.r.t file creation */
2284    err = fsetxattr(fd, secctx_name, req->secctx.ctx, req->secctx.ctxlen, 0);
2285    err = err == -1 ? errno : 0;
2286out:
2287    if (!err) {
2288        *open_fd = fd;
2289    } else {
2290        if (fd != -1) {
2291            close(fd);
2292            unlinkat(parent_inode->fd, name, 0);
2293        }
2294    }
2295    return err;
2296}
2297
2298static int do_lo_create(fuse_req_t req, struct lo_inode *parent_inode,
2299                        const char *name, mode_t mode,
2300                        struct fuse_file_info *fi, int *open_fd)
2301{
2302    struct lo_data *lo = lo_data(req);
2303    char *mapped_name = NULL;
2304    int err;
2305    const char *ctxname = req->secctx.name;
2306    bool secctx_enabled = req->secctx.ctxlen;
2307
2308    if (secctx_enabled && lo->xattrmap) {
2309        err = xattr_map_client(lo, req->secctx.name, &mapped_name);
2310        if (err < 0) {
2311            return -err;
2312        }
2313
2314        ctxname = mapped_name;
2315    }
2316
2317    if (secctx_enabled) {
2318        /*
2319         * If security.selinux has not been remapped and selinux is enabled,
2320         * use fscreate to set context before file creation. If not, use
2321         * tmpfile method for regular files. Otherwise fallback to
2322         * non-atomic method of file creation and xattr settting.
2323         */
2324        if (!mapped_name && lo->use_fscreate) {
2325            err = do_create_secctx_fscreate(req, parent_inode, name, mode, fi,
2326                                            open_fd);
2327            goto out;
2328        } else if (S_ISREG(mode)) {
2329            err = do_create_secctx_tmpfile(req, parent_inode, name, mode, fi,
2330                                           ctxname, open_fd);
2331            /*
2332             * If filesystem does not support O_TMPFILE, fallback to non-atomic
2333             * method.
2334             */
2335            if (!err || err != EOPNOTSUPP) {
2336                goto out;
2337            }
2338        }
2339
2340        err = do_create_secctx_noatomic(req, parent_inode, name, mode, fi,
2341                                        ctxname, open_fd);
2342    } else {
2343        err = do_create_nosecctx(req, parent_inode, name, mode, fi, open_fd,
2344                                 false);
2345    }
2346
2347out:
2348    g_free(mapped_name);
2349    return err;
2350}
2351
2352static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
2353                      mode_t mode, struct fuse_file_info *fi)
2354{
2355    int fd = -1;
2356    struct lo_data *lo = lo_data(req);
2357    struct lo_inode *parent_inode;
2358    struct lo_inode *inode = NULL;
2359    struct fuse_entry_param e;
2360    int err;
2361
2362    fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)"
2363             " kill_priv=%d\n", parent, name, fi->kill_priv);
2364
2365    if (!is_safe_path_component(name)) {
2366        fuse_reply_err(req, EINVAL);
2367        return;
2368    }
2369
2370    parent_inode = lo_inode(req, parent);
2371    if (!parent_inode) {
2372        fuse_reply_err(req, EBADF);
2373        return;
2374    }
2375
2376    update_open_flags(lo->writeback, lo->allow_direct_io, fi);
2377
2378    err = do_lo_create(req, parent_inode, name, mode, fi, &fd);
2379
2380    /* Ignore the error if file exists and O_EXCL was not given */
2381    if (err && (err != EEXIST || (fi->flags & O_EXCL))) {
2382        goto out;
2383    }
2384
2385    err = lo_do_lookup(req, parent, name, &e, &inode);
2386    if (err) {
2387        goto out;
2388    }
2389
2390    err = lo_do_open(lo, inode, fd, fi);
2391    fd = -1; /* lo_do_open() takes ownership of fd */
2392    if (err) {
2393        /* Undo lo_do_lookup() nlookup ref */
2394        unref_inode_lolocked(lo, inode, 1);
2395    }
2396
2397out:
2398    lo_inode_put(lo, &inode);
2399    lo_inode_put(lo, &parent_inode);
2400
2401    if (err) {
2402        if (fd >= 0) {
2403            close(fd);
2404        }
2405
2406        fuse_reply_err(req, err);
2407    } else {
2408        fuse_reply_create(req, &e, fi);
2409    }
2410}
2411
2412/* Should be called with inode->plock_mutex held */
2413static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
2414                                                      struct lo_inode *inode,
2415                                                      uint64_t lock_owner,
2416                                                      pid_t pid, int *err)
2417{
2418    struct lo_inode_plock *plock;
2419    int fd;
2420
2421    plock =
2422        g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner));
2423
2424    if (plock) {
2425        return plock;
2426    }
2427
2428    plock = malloc(sizeof(struct lo_inode_plock));
2429    if (!plock) {
2430        *err = ENOMEM;
2431        return NULL;
2432    }
2433
2434    /* Open another instance of file which can be used for ofd locks. */
2435    /* TODO: What if file is not writable? */
2436    fd = lo_inode_open(lo, inode, O_RDWR);
2437    if (fd < 0) {
2438        *err = -fd;
2439        free(plock);
2440        return NULL;
2441    }
2442
2443    plock->lock_owner = lock_owner;
2444    plock->fd = fd;
2445    g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner),
2446                        plock);
2447    return plock;
2448}
2449
2450static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
2451                     struct flock *lock)
2452{
2453    struct lo_data *lo = lo_data(req);
2454    struct lo_inode *inode;
2455    struct lo_inode_plock *plock;
2456    int ret, saverr = 0;
2457
2458    fuse_log(FUSE_LOG_DEBUG,
2459             "lo_getlk(ino=%" PRIu64 ", flags=%d)"
2460             " owner=0x%" PRIx64 ", l_type=%d l_start=0x%" PRIx64
2461             " l_len=0x%" PRIx64 "\n",
2462             ino, fi->flags, fi->lock_owner, lock->l_type,
2463             (uint64_t)lock->l_start, (uint64_t)lock->l_len);
2464
2465    if (!lo->posix_lock) {
2466        fuse_reply_err(req, ENOSYS);
2467        return;
2468    }
2469
2470    inode = lo_inode(req, ino);
2471    if (!inode) {
2472        fuse_reply_err(req, EBADF);
2473        return;
2474    }
2475
2476    pthread_mutex_lock(&inode->plock_mutex);
2477    plock =
2478        lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
2479    if (!plock) {
2480        saverr = ret;
2481        goto out;
2482    }
2483
2484    ret = fcntl(plock->fd, F_OFD_GETLK, lock);
2485    if (ret == -1) {
2486        saverr = errno;
2487    }
2488
2489out:
2490    pthread_mutex_unlock(&inode->plock_mutex);
2491    lo_inode_put(lo, &inode);
2492
2493    if (saverr) {
2494        fuse_reply_err(req, saverr);
2495    } else {
2496        fuse_reply_lock(req, lock);
2497    }
2498}
2499
2500static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
2501                     struct flock *lock, int sleep)
2502{
2503    struct lo_data *lo = lo_data(req);
2504    struct lo_inode *inode;
2505    struct lo_inode_plock *plock;
2506    int ret, saverr = 0;
2507
2508    fuse_log(FUSE_LOG_DEBUG,
2509             "lo_setlk(ino=%" PRIu64 ", flags=%d)"
2510             " cmd=%d pid=%d owner=0x%" PRIx64 " sleep=%d l_whence=%d"
2511             " l_start=0x%" PRIx64 " l_len=0x%" PRIx64 "\n",
2512             ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep,
2513             lock->l_whence, (uint64_t)lock->l_start, (uint64_t)lock->l_len);
2514
2515    if (!lo->posix_lock) {
2516        fuse_reply_err(req, ENOSYS);
2517        return;
2518    }
2519
2520    if (sleep) {
2521        fuse_reply_err(req, EOPNOTSUPP);
2522        return;
2523    }
2524
2525    inode = lo_inode(req, ino);
2526    if (!inode) {
2527        fuse_reply_err(req, EBADF);
2528        return;
2529    }
2530
2531    pthread_mutex_lock(&inode->plock_mutex);
2532    plock =
2533        lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
2534
2535    if (!plock) {
2536        saverr = ret;
2537        goto out;
2538    }
2539
2540    /* TODO: Is it alright to modify flock? */
2541    lock->l_pid = 0;
2542    ret = fcntl(plock->fd, F_OFD_SETLK, lock);
2543    if (ret == -1) {
2544        saverr = errno;
2545    }
2546
2547out:
2548    pthread_mutex_unlock(&inode->plock_mutex);
2549    lo_inode_put(lo, &inode);
2550
2551    fuse_reply_err(req, saverr);
2552}
2553
2554static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
2555                        struct fuse_file_info *fi)
2556{
2557    int res;
2558    struct lo_dirp *d;
2559    int fd;
2560
2561    (void)ino;
2562
2563    d = lo_dirp(req, fi);
2564    if (!d) {
2565        fuse_reply_err(req, EBADF);
2566        return;
2567    }
2568
2569    fd = dirfd(d->dp);
2570    if (datasync) {
2571        res = fdatasync(fd);
2572    } else {
2573        res = fsync(fd);
2574    }
2575
2576    lo_dirp_put(&d);
2577
2578    fuse_reply_err(req, res == -1 ? errno : 0);
2579}
2580
2581static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
2582{
2583    struct lo_data *lo = lo_data(req);
2584    struct lo_inode *inode = lo_inode(req, ino);
2585    int err;
2586
2587    fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d, kill_priv=%d)"
2588             "\n", ino, fi->flags, fi->kill_priv);
2589
2590    if (!inode) {
2591        fuse_reply_err(req, EBADF);
2592        return;
2593    }
2594
2595    err = lo_do_open(lo, inode, -1, fi);
2596    lo_inode_put(lo, &inode);
2597    if (err) {
2598        fuse_reply_err(req, err);
2599    } else {
2600        fuse_reply_open(req, fi);
2601    }
2602}
2603
2604static void lo_release(fuse_req_t req, fuse_ino_t ino,
2605                       struct fuse_file_info *fi)
2606{
2607    struct lo_data *lo = lo_data(req);
2608    struct lo_map_elem *elem;
2609    int fd = -1;
2610
2611    (void)ino;
2612
2613    pthread_mutex_lock(&lo->mutex);
2614    elem = lo_map_get(&lo->fd_map, fi->fh);
2615    if (elem) {
2616        fd = elem->fd;
2617        elem = NULL;
2618        lo_map_remove(&lo->fd_map, fi->fh);
2619    }
2620    pthread_mutex_unlock(&lo->mutex);
2621
2622    close(fd);
2623    fuse_reply_err(req, 0);
2624}
2625
2626static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
2627{
2628    int res;
2629    (void)ino;
2630    struct lo_inode *inode;
2631    struct lo_data *lo = lo_data(req);
2632
2633    inode = lo_inode(req, ino);
2634    if (!inode) {
2635        fuse_reply_err(req, EBADF);
2636        return;
2637    }
2638
2639    if (!S_ISREG(inode->filetype)) {
2640        lo_inode_put(lo, &inode);
2641        fuse_reply_err(req, EBADF);
2642        return;
2643    }
2644
2645    /* An fd is going away. Cleanup associated posix locks */
2646    if (lo->posix_lock) {
2647        pthread_mutex_lock(&inode->plock_mutex);
2648        g_hash_table_remove(inode->posix_locks,
2649            GUINT_TO_POINTER(fi->lock_owner));
2650        pthread_mutex_unlock(&inode->plock_mutex);
2651    }
2652    res = close(dup(lo_fi_fd(req, fi)));
2653    lo_inode_put(lo, &inode);
2654    fuse_reply_err(req, res == -1 ? errno : 0);
2655}
2656
2657static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
2658                     struct fuse_file_info *fi)
2659{
2660    struct lo_inode *inode = lo_inode(req, ino);
2661    struct lo_data *lo = lo_data(req);
2662    int res;
2663    int fd;
2664
2665    fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino,
2666             (void *)fi);
2667
2668    if (!inode) {
2669        fuse_reply_err(req, EBADF);
2670        return;
2671    }
2672
2673    if (!fi) {
2674        fd = lo_inode_open(lo, inode, O_RDWR);
2675        if (fd < 0) {
2676            res = -fd;
2677            goto out;
2678        }
2679    } else {
2680        fd = lo_fi_fd(req, fi);
2681    }
2682
2683    if (datasync) {
2684        res = fdatasync(fd) == -1 ? errno : 0;
2685    } else {
2686        res = fsync(fd) == -1 ? errno : 0;
2687    }
2688    if (!fi) {
2689        close(fd);
2690    }
2691out:
2692    lo_inode_put(lo, &inode);
2693    fuse_reply_err(req, res);
2694}
2695
2696static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset,
2697                    struct fuse_file_info *fi)
2698{
2699    struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size);
2700
2701    fuse_log(FUSE_LOG_DEBUG,
2702             "lo_read(ino=%" PRIu64 ", size=%zd, "
2703             "off=%lu)\n",
2704             ino, size, (unsigned long)offset);
2705
2706    buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
2707    buf.buf[0].fd = lo_fi_fd(req, fi);
2708    buf.buf[0].pos = offset;
2709
2710    fuse_reply_data(req, &buf);
2711}
2712
2713static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
2714                         struct fuse_bufvec *in_buf, off_t off,
2715                         struct fuse_file_info *fi)
2716{
2717    (void)ino;
2718    ssize_t res;
2719    struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf));
2720    bool cap_fsetid_dropped = false;
2721
2722    out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
2723    out_buf.buf[0].fd = lo_fi_fd(req, fi);
2724    out_buf.buf[0].pos = off;
2725
2726    fuse_log(FUSE_LOG_DEBUG,
2727             "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu kill_priv=%d)\n",
2728             ino, out_buf.buf[0].size, (unsigned long)off, fi->kill_priv);
2729
2730    res = drop_security_capability(lo_data(req), out_buf.buf[0].fd);
2731    if (res) {
2732        fuse_reply_err(req, res);
2733        return;
2734    }
2735
2736    /*
2737     * If kill_priv is set, drop CAP_FSETID which should lead to kernel
2738     * clearing setuid/setgid on file. Note, for WRITE, we need to do
2739     * this even if killpriv_v2 is not enabled. fuse direct write path
2740     * relies on this.
2741     */
2742    if (fi->kill_priv) {
2743        res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
2744        if (res != 0) {
2745            fuse_reply_err(req, res);
2746            return;
2747        }
2748    }
2749
2750    res = fuse_buf_copy(&out_buf, in_buf);
2751    if (res < 0) {
2752        fuse_reply_err(req, -res);
2753    } else {
2754        fuse_reply_write(req, (size_t)res);
2755    }
2756
2757    if (cap_fsetid_dropped) {
2758        res = gain_effective_cap("FSETID");
2759        if (res) {
2760            fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
2761        }
2762    }
2763}
2764
2765static void lo_statfs(fuse_req_t req, fuse_ino_t ino)
2766{
2767    int res;
2768    struct statvfs stbuf;
2769
2770    res = fstatvfs(lo_fd(req, ino), &stbuf);
2771    if (res == -1) {
2772        fuse_reply_err(req, errno);
2773    } else {
2774        fuse_reply_statfs(req, &stbuf);
2775    }
2776}
2777
2778static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
2779                         off_t length, struct fuse_file_info *fi)
2780{
2781    int err = EOPNOTSUPP;
2782    (void)ino;
2783
2784#ifdef CONFIG_FALLOCATE
2785    err = fallocate(lo_fi_fd(req, fi), mode, offset, length);
2786    if (err < 0) {
2787        err = errno;
2788    }
2789
2790#elif defined(CONFIG_POSIX_FALLOCATE)
2791    if (mode) {
2792        fuse_reply_err(req, EOPNOTSUPP);
2793        return;
2794    }
2795
2796    err = posix_fallocate(lo_fi_fd(req, fi), offset, length);
2797#endif
2798
2799    fuse_reply_err(req, err);
2800}
2801
2802static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
2803                     int op)
2804{
2805    int res;
2806    (void)ino;
2807
2808    if (!(op & LOCK_NB)) {
2809        /*
2810         * Blocking flock can deadlock as there is only one thread
2811         * serving the queue.
2812         */
2813        fuse_reply_err(req, EOPNOTSUPP);
2814        return;
2815    }
2816
2817    res = flock(lo_fi_fd(req, fi), op);
2818
2819    fuse_reply_err(req, res == -1 ? errno : 0);
2820}
2821
/*
 * Flag bits stored in XattrMapEntry.flags. The rule parser gives each rule
 * one 'type' bit (what to do when the rule matches) and one or both
 * 'scope' bits (which direction of xattr traffic the rule applies to).
 */

/* types */
/*
 * Stop at this rule and pass the attribute through unmodified if matched.
 * An empty key applies to all.
 */
#define XATTR_MAP_FLAG_OK      (1 <<  0)
/*
 * The attribute is unwanted;
 * EPERM on write, hidden on read.
 */
#define XATTR_MAP_FLAG_BAD     (1 <<  1)
/*
 * For attributes that start with 'key', prepend 'prepend'.
 * 'key' may be empty to prepend for all attributes.
 * 'key' is defined from the set/remove point of view and is
 * automatically reversed on read.
 */
#define XATTR_MAP_FLAG_PREFIX  (1 <<  2)
/*
 * The attribute is unsupported;
 * ENOTSUP on write, hidden on read.
 */
#define XATTR_MAP_FLAG_UNSUPPORTED     (1 <<  3)

/* scopes */
/* Apply rule to get/set/remove */
#define XATTR_MAP_FLAG_CLIENT  (1 << 16)
/* Apply rule to list */
#define XATTR_MAP_FLAG_SERVER  (1 << 17)
/* Apply rule to all (both client and server scopes) */
#define XATTR_MAP_FLAG_ALL   (XATTR_MAP_FLAG_SERVER | XATTR_MAP_FLAG_CLIENT)
2853
2854static void add_xattrmap_entry(struct lo_data *lo,
2855                               const XattrMapEntry *new_entry)
2856{
2857    XattrMapEntry *res = g_realloc_n(lo->xattr_map_list,
2858                                     lo->xattr_map_nentries + 1,
2859                                     sizeof(XattrMapEntry));
2860    res[lo->xattr_map_nentries++] = *new_entry;
2861
2862    lo->xattr_map_list = res;
2863}
2864
2865static void free_xattrmap(struct lo_data *lo)
2866{
2867    XattrMapEntry *map = lo->xattr_map_list;
2868    size_t i;
2869
2870    if (!map) {
2871        return;
2872    }
2873
2874    for (i = 0; i < lo->xattr_map_nentries; i++) {
2875        g_free(map[i].key);
2876        g_free(map[i].prepend);
2877    };
2878
2879    g_free(map);
2880    lo->xattr_map_list = NULL;
2881    lo->xattr_map_nentries = -1;
2882}
2883
2884/*
2885 * Handle the 'map' type, which is sugar for a set of commands
2886 * for the common case of prefixing a subset or everything,
2887 * and allowing anything not prefixed through.
2888 * It must be the last entry in the stream, although there
2889 * can be other entries before it.
2890 * The form is:
2891 *    :map:key:prefix:
2892 *
2893 * key maybe empty in which case all entries are prefixed.
2894 */
2895static void parse_xattrmap_map(struct lo_data *lo,
2896                               const char *rule, char sep)
2897{
2898    const char *tmp;
2899    char *key;
2900    char *prefix;
2901    XattrMapEntry tmp_entry;
2902
2903    if (*rule != sep) {
2904        fuse_log(FUSE_LOG_ERR,
2905                 "%s: Expecting '%c' after 'map' keyword, found '%c'\n",
2906                 __func__, sep, *rule);
2907        exit(1);
2908    }
2909
2910    rule++;
2911
2912    /* At start of 'key' field */
2913    tmp = strchr(rule, sep);
2914    if (!tmp) {
2915        fuse_log(FUSE_LOG_ERR,
2916                 "%s: Missing '%c' at end of key field in map rule\n",
2917                 __func__, sep);
2918        exit(1);
2919    }
2920
2921    key = g_strndup(rule, tmp - rule);
2922    rule = tmp + 1;
2923
2924    /* At start of prefix field */
2925    tmp = strchr(rule, sep);
2926    if (!tmp) {
2927        fuse_log(FUSE_LOG_ERR,
2928                 "%s: Missing '%c' at end of prefix field in map rule\n",
2929                 __func__, sep);
2930        exit(1);
2931    }
2932
2933    prefix = g_strndup(rule, tmp - rule);
2934    rule = tmp + 1;
2935
2936    /*
2937     * This should be the end of the string, we don't allow
2938     * any more commands after 'map'.
2939     */
2940    if (*rule) {
2941        fuse_log(FUSE_LOG_ERR,
2942                 "%s: Expecting end of command after map, found '%c'\n",
2943                 __func__, *rule);
2944        exit(1);
2945    }
2946
2947    /* 1st: Prefix matches/everything */
2948    tmp_entry.flags = XATTR_MAP_FLAG_PREFIX | XATTR_MAP_FLAG_ALL;
2949    tmp_entry.key = g_strdup(key);
2950    tmp_entry.prepend = g_strdup(prefix);
2951    add_xattrmap_entry(lo, &tmp_entry);
2952
2953    if (!*key) {
2954        /* Prefix all case */
2955
2956        /* 2nd: Hide any non-prefixed entries on the host */
2957        tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_ALL;
2958        tmp_entry.key = g_strdup("");
2959        tmp_entry.prepend = g_strdup("");
2960        add_xattrmap_entry(lo, &tmp_entry);
2961    } else {
2962        /* Prefix matching case */
2963
2964        /* 2nd: Hide non-prefixed but matching entries on the host */
2965        tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_SERVER;
2966        tmp_entry.key = g_strdup(""); /* Not used */
2967        tmp_entry.prepend = g_strdup(key);
2968        add_xattrmap_entry(lo, &tmp_entry);
2969
2970        /* 3rd: Stop the client accessing prefixed attributes directly */
2971        tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_CLIENT;
2972        tmp_entry.key = g_strdup(prefix);
2973        tmp_entry.prepend = g_strdup(""); /* Not used */
2974        add_xattrmap_entry(lo, &tmp_entry);
2975
2976        /* 4th: Everything else is OK */
2977        tmp_entry.flags = XATTR_MAP_FLAG_OK | XATTR_MAP_FLAG_ALL;
2978        tmp_entry.key = g_strdup("");
2979        tmp_entry.prepend = g_strdup("");
2980        add_xattrmap_entry(lo, &tmp_entry);
2981    }
2982
2983    g_free(key);
2984    g_free(prefix);
2985}
2986
2987static void parse_xattrmap(struct lo_data *lo)
2988{
2989    const char *map = lo->xattrmap;
2990    const char *tmp;
2991    int ret;
2992
2993    lo->xattr_map_nentries = 0;
2994    while (*map) {
2995        XattrMapEntry tmp_entry;
2996        char sep;
2997
2998        if (isspace(*map)) {
2999            map++;
3000            continue;
3001        }
3002        /* The separator is the first non-space of the rule */
3003        sep = *map++;
3004        if (!sep) {
3005            break;
3006        }
3007
3008        tmp_entry.flags = 0;
3009        /* Start of 'type' */
3010        if (strstart(map, "prefix", &map)) {
3011            tmp_entry.flags |= XATTR_MAP_FLAG_PREFIX;
3012        } else if (strstart(map, "ok", &map)) {
3013            tmp_entry.flags |= XATTR_MAP_FLAG_OK;
3014        } else if (strstart(map, "bad", &map)) {
3015            tmp_entry.flags |= XATTR_MAP_FLAG_BAD;
3016        } else if (strstart(map, "unsupported", &map)) {
3017            tmp_entry.flags |= XATTR_MAP_FLAG_UNSUPPORTED;
3018        } else if (strstart(map, "map", &map)) {
3019            /*
3020             * map is sugar that adds a number of rules, and must be
3021             * the last entry.
3022             */
3023            parse_xattrmap_map(lo, map, sep);
3024            break;
3025        } else {
3026            fuse_log(FUSE_LOG_ERR,
3027                     "%s: Unexpected type;"
3028                     "Expecting 'prefix', 'ok', 'bad', 'unsupported' or 'map'"
3029                     " in rule %zu\n", __func__, lo->xattr_map_nentries);
3030            exit(1);
3031        }
3032
3033        if (*map++ != sep) {
3034            fuse_log(FUSE_LOG_ERR,
3035                     "%s: Missing '%c' at end of type field of rule %zu\n",
3036                     __func__, sep, lo->xattr_map_nentries);
3037            exit(1);
3038        }
3039
3040        /* Start of 'scope' */
3041        if (strstart(map, "client", &map)) {
3042            tmp_entry.flags |= XATTR_MAP_FLAG_CLIENT;
3043        } else if (strstart(map, "server", &map)) {
3044            tmp_entry.flags |= XATTR_MAP_FLAG_SERVER;
3045        } else if (strstart(map, "all", &map)) {
3046            tmp_entry.flags |= XATTR_MAP_FLAG_ALL;
3047        } else {
3048            fuse_log(FUSE_LOG_ERR,
3049                     "%s: Unexpected scope;"
3050                     " Expecting 'client', 'server', or 'all', in rule %zu\n",
3051                     __func__, lo->xattr_map_nentries);
3052            exit(1);
3053        }
3054
3055        if (*map++ != sep) {
3056            fuse_log(FUSE_LOG_ERR,
3057                     "%s: Expecting '%c' found '%c'"
3058                     " after scope in rule %zu\n",
3059                     __func__, sep, *map, lo->xattr_map_nentries);
3060            exit(1);
3061        }
3062
3063        /* At start of 'key' field */
3064        tmp = strchr(map, sep);
3065        if (!tmp) {
3066            fuse_log(FUSE_LOG_ERR,
3067                     "%s: Missing '%c' at end of key field of rule %zu",
3068                     __func__, sep, lo->xattr_map_nentries);
3069            exit(1);
3070        }
3071        tmp_entry.key = g_strndup(map, tmp - map);
3072        map = tmp + 1;
3073
3074        /* At start of 'prepend' field */
3075        tmp = strchr(map, sep);
3076        if (!tmp) {
3077            fuse_log(FUSE_LOG_ERR,
3078                     "%s: Missing '%c' at end of prepend field of rule %zu",
3079                     __func__, sep, lo->xattr_map_nentries);
3080            exit(1);
3081        }
3082        tmp_entry.prepend = g_strndup(map, tmp - map);
3083        map = tmp + 1;
3084
3085        add_xattrmap_entry(lo, &tmp_entry);
3086        /* End of rule - go around again for another rule */
3087    }
3088
3089    if (!lo->xattr_map_nentries) {
3090        fuse_log(FUSE_LOG_ERR, "Empty xattr map\n");
3091        exit(1);
3092    }
3093
3094    ret = xattr_map_client(lo, "security.capability",
3095                           &lo->xattr_security_capability);
3096    if (ret) {
3097        fuse_log(FUSE_LOG_ERR, "Failed to map security.capability: %s\n",
3098                strerror(ret));
3099        exit(1);
3100    }
3101    if (!lo->xattr_security_capability ||
3102        !strcmp(lo->xattr_security_capability, "security.capability")) {
3103        /* 1-1 mapping, don't need to do anything */
3104        free(lo->xattr_security_capability);
3105        lo->xattr_security_capability = NULL;
3106    }
3107}
3108
3109/*
3110 * For use with getxattr/setxattr/removexattr, where the client
3111 * gives us a name and we may need to choose a different one.
3112 * Allocates a buffer for the result placing it in *out_name.
3113 *   If there's no change then *out_name is not set.
3114 * Returns 0 on success
3115 * Can return -EPERM to indicate we block a given attribute
3116 *   (in which case out_name is not allocated)
3117 * Can return -ENOMEM to indicate out_name couldn't be allocated.
3118 */
3119static int xattr_map_client(const struct lo_data *lo, const char *client_name,
3120                            char **out_name)
3121{
3122    size_t i;
3123    for (i = 0; i < lo->xattr_map_nentries; i++) {
3124        const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
3125
3126        if ((cur_entry->flags & XATTR_MAP_FLAG_CLIENT) &&
3127            (strstart(client_name, cur_entry->key, NULL))) {
3128            if (cur_entry->flags & XATTR_MAP_FLAG_BAD) {
3129                return -EPERM;
3130            }
3131            if (cur_entry->flags & XATTR_MAP_FLAG_UNSUPPORTED) {
3132                return -ENOTSUP;
3133            }
3134            if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
3135                /* Unmodified name */
3136                return 0;
3137            }
3138            if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
3139                *out_name = g_try_malloc(strlen(client_name) +
3140                                         strlen(cur_entry->prepend) + 1);
3141                if (!*out_name) {
3142                    return -ENOMEM;
3143                }
3144                sprintf(*out_name, "%s%s", cur_entry->prepend, client_name);
3145                return 0;
3146            }
3147        }
3148    }
3149
3150    return -EPERM;
3151}
3152
3153/*
3154 * For use with listxattr where the server fs gives us a name and we may need
3155 * to sanitize this for the client.
3156 * Returns a pointer to the result in *out_name
3157 *   This is always the original string or the current string with some prefix
3158 *   removed; no reallocation is done.
3159 * Returns 0 on success
3160 * Can return -ENODATA to indicate the name should be dropped from the list.
3161 */
3162static int xattr_map_server(const struct lo_data *lo, const char *server_name,
3163                            const char **out_name)
3164{
3165    size_t i;
3166    const char *end;
3167
3168    for (i = 0; i < lo->xattr_map_nentries; i++) {
3169        const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
3170
3171        if ((cur_entry->flags & XATTR_MAP_FLAG_SERVER) &&
3172            (strstart(server_name, cur_entry->prepend, &end))) {
3173            if (cur_entry->flags & XATTR_MAP_FLAG_BAD ||
3174                cur_entry->flags & XATTR_MAP_FLAG_UNSUPPORTED) {
3175                return -ENODATA;
3176            }
3177            if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
3178                *out_name = server_name;
3179                return 0;
3180            }
3181            if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
3182                /* Remove prefix */
3183                *out_name = end;
3184                return 0;
3185            }
3186        }
3187    }
3188
3189    return -ENODATA;
3190}
3191
3192static bool block_xattr(struct lo_data *lo, const char *name)
3193{
3194    /*
3195     * If user explicitly enabled posix_acl or did not provide any option,
3196     * do not block acl. Otherwise block system.posix_acl_access and
3197     * system.posix_acl_default xattrs.
3198     */
3199    if (lo->user_posix_acl) {
3200        return false;
3201    }
3202    if (!strcmp(name, "system.posix_acl_access") ||
3203        !strcmp(name, "system.posix_acl_default"))
3204            return true;
3205
3206    return false;
3207}
3208
3209/*
3210 * Returns number of bytes in xattr_list after filtering on success. This
3211 * could be zero as well if nothing is left after filtering.
3212 *
3213 * Returns negative error code on failure.
3214 * xattr_list is modified in place.
3215 */
3216static int remove_blocked_xattrs(struct lo_data *lo, char *xattr_list,
3217                                 unsigned in_size)
3218{
3219    size_t out_index, in_index;
3220
3221    /*
3222     * As of now we only filter out acl xattrs. If acls are enabled or
3223     * they have not been explicitly disabled, there is nothing to
3224     * filter.
3225     */
3226    if (lo->user_posix_acl) {
3227        return in_size;
3228    }
3229
3230    out_index = 0;
3231    in_index = 0;
3232    while (in_index < in_size) {
3233        char *in_ptr = xattr_list + in_index;
3234
3235        /* Length of current attribute name */
3236        size_t in_len = strlen(xattr_list + in_index) + 1;
3237
3238        if (!block_xattr(lo, in_ptr)) {
3239            if (in_index != out_index) {
3240                memmove(xattr_list + out_index, xattr_list + in_index, in_len);
3241            }
3242            out_index += in_len;
3243        }
3244        in_index += in_len;
3245     }
3246    return out_index;
3247}
3248
3249static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
3250                        size_t size)
3251{
3252    struct lo_data *lo = lo_data(req);
3253    g_autofree char *value = NULL;
3254    char procname[64];
3255    const char *name;
3256    char *mapped_name;
3257    struct lo_inode *inode;
3258    ssize_t ret;
3259    int saverr;
3260    int fd = -1;
3261
3262    if (block_xattr(lo, in_name)) {
3263        fuse_reply_err(req, EOPNOTSUPP);
3264        return;
3265    }
3266
3267    mapped_name = NULL;
3268    name = in_name;
3269    if (lo->xattrmap) {
3270        ret = xattr_map_client(lo, in_name, &mapped_name);
3271        if (ret < 0) {
3272            if (ret == -EPERM) {
3273                ret = -ENODATA;
3274            }
3275            fuse_reply_err(req, -ret);
3276            return;
3277        }
3278        if (mapped_name) {
3279            name = mapped_name;
3280        }
3281    }
3282
3283    inode = lo_inode(req, ino);
3284    if (!inode) {
3285        fuse_reply_err(req, EBADF);
3286        g_free(mapped_name);
3287        return;
3288    }
3289
3290    saverr = ENOSYS;
3291    if (!lo_data(req)->xattr) {
3292        goto out;
3293    }
3294
3295    fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n",
3296             ino, name, size);
3297
3298    if (size) {
3299        value = g_try_malloc(size);
3300        if (!value) {
3301            goto out_err;
3302        }
3303    }
3304
3305    sprintf(procname, "%i", inode->fd);
3306    /*
3307     * It is not safe to open() non-regular/non-dir files in file server
3308     * unless O_PATH is used, so use that method for regular files/dir
3309     * only (as it seems giving less performance overhead).
3310     * Otherwise, call fchdir() to avoid open().
3311     */
3312    if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
3313        fd = openat(lo->proc_self_fd, procname, O_RDONLY);
3314        if (fd < 0) {
3315            goto out_err;
3316        }
3317        ret = fgetxattr(fd, name, value, size);
3318        saverr = ret == -1 ? errno : 0;
3319    } else {
3320        /* fchdir should not fail here */
3321        FCHDIR_NOFAIL(lo->proc_self_fd);
3322        ret = getxattr(procname, name, value, size);
3323        saverr = ret == -1 ? errno : 0;
3324        FCHDIR_NOFAIL(lo->root.fd);
3325    }
3326
3327    if (ret == -1) {
3328        goto out;
3329    }
3330    if (size) {
3331        saverr = 0;
3332        if (ret == 0) {
3333            goto out;
3334        }
3335        fuse_reply_buf(req, value, ret);
3336    } else {
3337        fuse_reply_xattr(req, ret);
3338    }
3339out_free:
3340    if (fd >= 0) {
3341        close(fd);
3342    }
3343
3344    lo_inode_put(lo, &inode);
3345    return;
3346
3347out_err:
3348    saverr = errno;
3349out:
3350    fuse_reply_err(req, saverr);
3351    g_free(mapped_name);
3352    goto out_free;
3353}
3354
3355static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
3356{
3357    struct lo_data *lo = lo_data(req);
3358    g_autofree char *value = NULL;
3359    char procname[64];
3360    struct lo_inode *inode;
3361    ssize_t ret;
3362    int saverr;
3363    int fd = -1;
3364
3365    inode = lo_inode(req, ino);
3366    if (!inode) {
3367        fuse_reply_err(req, EBADF);
3368        return;
3369    }
3370
3371    saverr = ENOSYS;
3372    if (!lo_data(req)->xattr) {
3373        goto out;
3374    }
3375
3376    fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino,
3377             size);
3378
3379    if (size) {
3380        value = g_try_malloc(size);
3381        if (!value) {
3382            goto out_err;
3383        }
3384    }
3385
3386    sprintf(procname, "%i", inode->fd);
3387    if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
3388        fd = openat(lo->proc_self_fd, procname, O_RDONLY);
3389        if (fd < 0) {
3390            goto out_err;
3391        }
3392        ret = flistxattr(fd, value, size);
3393        saverr = ret == -1 ? errno : 0;
3394    } else {
3395        /* fchdir should not fail here */
3396        FCHDIR_NOFAIL(lo->proc_self_fd);
3397        ret = listxattr(procname, value, size);
3398        saverr = ret == -1 ? errno : 0;
3399        FCHDIR_NOFAIL(lo->root.fd);
3400    }
3401
3402    if (ret == -1) {
3403        goto out;
3404    }
3405    if (size) {
3406        saverr = 0;
3407        if (ret == 0) {
3408            goto out;
3409        }
3410
3411        if (lo->xattr_map_list) {
3412            /*
3413             * Map the names back, some attributes might be dropped,
3414             * some shortened, but not increased, so we shouldn't
3415             * run out of room.
3416             */
3417            size_t out_index, in_index;
3418            out_index = 0;
3419            in_index = 0;
3420            while (in_index < ret) {
3421                const char *map_out;
3422                char *in_ptr = value + in_index;
3423                /* Length of current attribute name */
3424                size_t in_len = strlen(value + in_index) + 1;
3425
3426                int mapret = xattr_map_server(lo, in_ptr, &map_out);
3427                if (mapret != -ENODATA && mapret != 0) {
3428                    /* Shouldn't happen */
3429                    saverr = -mapret;
3430                    goto out;
3431                }
3432                if (mapret == 0) {
3433                    /* Either unchanged, or truncated */
3434                    size_t out_len;
3435                    if (map_out != in_ptr) {
3436                        /* +1 copies the NIL */
3437                        out_len = strlen(map_out) + 1;
3438                    } else {
3439                        /* No change */
3440                        out_len = in_len;
3441                    }
3442                    /*
3443                     * Move result along, may still be needed for an unchanged
3444                     * entry if a previous entry was changed.
3445                     */
3446                    memmove(value + out_index, map_out, out_len);
3447
3448                    out_index += out_len;
3449                }
3450                in_index += in_len;
3451            }
3452            ret = out_index;
3453            if (ret == 0) {
3454                goto out;
3455            }
3456        }
3457
3458        ret = remove_blocked_xattrs(lo, value, ret);
3459        if (ret <= 0) {
3460            saverr = -ret;
3461            goto out;
3462        }
3463        fuse_reply_buf(req, value, ret);
3464    } else {
3465        /*
3466         * xattrmap only ever shortens the result,
3467         * so we don't need to do anything clever with the
3468         * allocation length here.
3469         */
3470        fuse_reply_xattr(req, ret);
3471    }
3472out_free:
3473    if (fd >= 0) {
3474        close(fd);
3475    }
3476
3477    lo_inode_put(lo, &inode);
3478    return;
3479
3480out_err:
3481    saverr = errno;
3482out:
3483    fuse_reply_err(req, saverr);
3484    goto out_free;
3485}
3486
3487static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
3488                        const char *value, size_t size, int flags,
3489                        uint32_t extra_flags)
3490{
3491    char procname[64];
3492    const char *name;
3493    char *mapped_name;
3494    struct lo_data *lo = lo_data(req);
3495    struct lo_inode *inode;
3496    ssize_t ret;
3497    int saverr;
3498    int fd = -1;
3499    bool switched_creds = false;
3500    bool cap_fsetid_dropped = false;
3501    struct lo_cred old = {};
3502
3503    if (block_xattr(lo, in_name)) {
3504        fuse_reply_err(req, EOPNOTSUPP);
3505        return;
3506    }
3507
3508    mapped_name = NULL;
3509    name = in_name;
3510    if (lo->xattrmap) {
3511        ret = xattr_map_client(lo, in_name, &mapped_name);
3512        if (ret < 0) {
3513            fuse_reply_err(req, -ret);
3514            return;
3515        }
3516        if (mapped_name) {
3517            name = mapped_name;
3518        }
3519    }
3520
3521    inode = lo_inode(req, ino);
3522    if (!inode) {
3523        fuse_reply_err(req, EBADF);
3524        g_free(mapped_name);
3525        return;
3526    }
3527
3528    saverr = ENOSYS;
3529    if (!lo_data(req)->xattr) {
3530        goto out;
3531    }
3532
3533    fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64
3534             ", name=%s value=%s size=%zd)\n", ino, name, value, size);
3535
3536    sprintf(procname, "%i", inode->fd);
3537    /*
3538     * If we are setting posix access acl and if SGID needs to be
3539     * cleared, then switch to caller's gid and drop CAP_FSETID
3540     * and that should make sure host kernel clears SGID.
3541     *
3542     * This probably will not work when we support idmapped mounts.
3543     * In that case we will need to find a non-root gid and switch
3544     * to it. (Instead of gid in request). Fix it when we support
3545     * idmapped mounts.
3546     */
3547    if (lo->posix_acl && !strcmp(name, "system.posix_acl_access")
3548        && (extra_flags & FUSE_SETXATTR_ACL_KILL_SGID)) {
3549        ret = lo_drop_cap_change_cred(req, &old, false, "FSETID",
3550                                      &cap_fsetid_dropped);
3551        if (ret) {
3552            saverr = ret;
3553            goto out;
3554        }
3555        switched_creds = true;
3556    }
3557    if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
3558        fd = openat(lo->proc_self_fd, procname, O_RDONLY);
3559        if (fd < 0) {
3560            saverr = errno;
3561            goto out;
3562        }
3563        ret = fsetxattr(fd, name, value, size, flags);
3564        saverr = ret == -1 ? errno : 0;
3565    } else {
3566        /* fchdir should not fail here */
3567        FCHDIR_NOFAIL(lo->proc_self_fd);
3568        ret = setxattr(procname, name, value, size, flags);
3569        saverr = ret == -1 ? errno : 0;
3570        FCHDIR_NOFAIL(lo->root.fd);
3571    }
3572    if (switched_creds) {
3573        if (cap_fsetid_dropped)
3574            lo_restore_cred_gain_cap(&old, false, "FSETID");
3575        else
3576            lo_restore_cred(&old, false);
3577    }
3578
3579out:
3580    if (fd >= 0) {
3581        close(fd);
3582    }
3583
3584    lo_inode_put(lo, &inode);
3585    g_free(mapped_name);
3586    fuse_reply_err(req, saverr);
3587}
3588
3589static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name)
3590{
3591    char procname[64];
3592    const char *name;
3593    char *mapped_name;
3594    struct lo_data *lo = lo_data(req);
3595    struct lo_inode *inode;
3596    ssize_t ret;
3597    int saverr;
3598    int fd = -1;
3599
3600    if (block_xattr(lo, in_name)) {
3601        fuse_reply_err(req, EOPNOTSUPP);
3602        return;
3603    }
3604
3605    mapped_name = NULL;
3606    name = in_name;
3607    if (lo->xattrmap) {
3608        ret = xattr_map_client(lo, in_name, &mapped_name);
3609        if (ret < 0) {
3610            fuse_reply_err(req, -ret);
3611            return;
3612        }
3613        if (mapped_name) {
3614            name = mapped_name;
3615        }
3616    }
3617
3618    inode = lo_inode(req, ino);
3619    if (!inode) {
3620        fuse_reply_err(req, EBADF);
3621        g_free(mapped_name);
3622        return;
3623    }
3624
3625    saverr = ENOSYS;
3626    if (!lo_data(req)->xattr) {
3627        goto out;
3628    }
3629
3630    fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino,
3631             name);
3632
3633    sprintf(procname, "%i", inode->fd);
3634    if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
3635        fd = openat(lo->proc_self_fd, procname, O_RDONLY);
3636        if (fd < 0) {
3637            saverr = errno;
3638            goto out;
3639        }
3640        ret = fremovexattr(fd, name);
3641        saverr = ret == -1 ? errno : 0;
3642    } else {
3643        /* fchdir should not fail here */
3644        FCHDIR_NOFAIL(lo->proc_self_fd);
3645        ret = removexattr(procname, name);
3646        saverr = ret == -1 ? errno : 0;
3647        FCHDIR_NOFAIL(lo->root.fd);
3648    }
3649
3650out:
3651    if (fd >= 0) {
3652        close(fd);
3653    }
3654
3655    lo_inode_put(lo, &inode);
3656    g_free(mapped_name);
3657    fuse_reply_err(req, saverr);
3658}
3659
#ifdef HAVE_COPY_FILE_RANGE
/*
 * FUSE copy_file_range handler: forwards the request to copy_file_range(2)
 * on the host descriptors behind fi_in and fi_out.  Replies with the number
 * of bytes copied, or with errno on failure.
 */
static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
                               struct fuse_file_info *fi_in, fuse_ino_t ino_out,
                               off_t off_out, struct fuse_file_info *fi_out,
                               size_t len, int flags)
{
    int in_fd = lo_fi_fd(req, fi_in);
    int out_fd = lo_fi_fd(req, fi_out);
    ssize_t copied;

    fuse_log(FUSE_LOG_DEBUG,
             "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, "
             "off=%ju, ino=%" PRIu64 "/fd=%d, "
             "off=%ju, size=%zd, flags=0x%x)\n",
             ino_in, in_fd, (intmax_t)off_in,
             ino_out, out_fd, (intmax_t)off_out, len, flags);

    copied = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags);
    if (copied >= 0) {
        fuse_reply_write(req, copied);
        return;
    }

    fuse_reply_err(req, errno);
}
#endif
3687
3688static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
3689                     struct fuse_file_info *fi)
3690{
3691    off_t res;
3692
3693    (void)ino;
3694    res = lseek(lo_fi_fd(req, fi), off, whence);
3695    if (res != -1) {
3696        fuse_reply_lseek(req, res);
3697    } else {
3698        fuse_reply_err(req, errno);
3699    }
3700}
3701
3702static int lo_do_syncfs(struct lo_data *lo, struct lo_inode *inode)
3703{
3704    int fd, ret = 0;
3705
3706    fuse_log(FUSE_LOG_DEBUG, "lo_do_syncfs(ino=%" PRIu64 ")\n",
3707             inode->fuse_ino);
3708
3709    fd = lo_inode_open(lo, inode, O_RDONLY);
3710    if (fd < 0) {
3711        return -fd;
3712    }
3713
3714    if (syncfs(fd) < 0) {
3715        ret = errno;
3716    }
3717
3718    close(fd);
3719    return ret;
3720}
3721
3722static void lo_syncfs(fuse_req_t req, fuse_ino_t ino)
3723{
3724    struct lo_data *lo = lo_data(req);
3725    struct lo_inode *inode = lo_inode(req, ino);
3726    int err;
3727
3728    if (!inode) {
3729        fuse_reply_err(req, EBADF);
3730        return;
3731    }
3732
3733    err = lo_do_syncfs(lo, inode);
3734    lo_inode_put(lo, &inode);
3735
3736    /*
3737     * If submounts aren't announced, the client only sends a request to
3738     * sync the root inode. TODO: Track submounts internally and iterate
3739     * over them as well.
3740     */
3741
3742    fuse_reply_err(req, err);
3743}
3744
3745static void lo_destroy(void *userdata)
3746{
3747    struct lo_data *lo = (struct lo_data *)userdata;
3748
3749    pthread_mutex_lock(&lo->mutex);
3750    while (true) {
3751        GHashTableIter iter;
3752        gpointer key, value;
3753
3754        g_hash_table_iter_init(&iter, lo->inodes);
3755        if (!g_hash_table_iter_next(&iter, &key, &value)) {
3756            break;
3757        }
3758
3759        struct lo_inode *inode = value;
3760        unref_inode(lo, inode, inode->nlookup);
3761    }
3762    pthread_mutex_unlock(&lo->mutex);
3763}
3764
3765static struct fuse_lowlevel_ops lo_oper = {
3766    .init = lo_init,
3767    .lookup = lo_lookup,
3768    .mkdir = lo_mkdir,
3769    .mknod = lo_mknod,
3770    .symlink = lo_symlink,
3771    .link = lo_link,
3772    .unlink = lo_unlink,
3773    .rmdir = lo_rmdir,
3774    .rename = lo_rename,
3775    .forget = lo_forget,
3776    .forget_multi = lo_forget_multi,
3777    .getattr = lo_getattr,
3778    .setattr = lo_setattr,
3779    .readlink = lo_readlink,
3780    .opendir = lo_opendir,
3781    .readdir = lo_readdir,
3782    .readdirplus = lo_readdirplus,
3783    .releasedir = lo_releasedir,
3784    .fsyncdir = lo_fsyncdir,
3785    .create = lo_create,
3786    .getlk = lo_getlk,
3787    .setlk = lo_setlk,
3788    .open = lo_open,
3789    .release = lo_release,
3790    .flush = lo_flush,
3791    .fsync = lo_fsync,
3792    .read = lo_read,
3793    .write_buf = lo_write_buf,
3794    .statfs = lo_statfs,
3795    .fallocate = lo_fallocate,
3796    .flock = lo_flock,
3797    .getxattr = lo_getxattr,
3798    .listxattr = lo_listxattr,
3799    .setxattr = lo_setxattr,
3800    .removexattr = lo_removexattr,
3801#ifdef HAVE_COPY_FILE_RANGE
3802    .copy_file_range = lo_copy_file_range,
3803#endif
3804    .lseek = lo_lseek,
3805    .syncfs = lo_syncfs,
3806    .destroy = lo_destroy,
3807};
3808
/*
 * Print vhost-user.json backend program capabilities: a JSON object on
 * stdout whose only member is "type": "fs".
 */
static void print_capabilities(void)
{
    printf("{\n"
           "  \"type\": \"fs\"\n"
           "}\n");
}
3816
3817/*
3818 * Drop all Linux capabilities because the wait parent process only needs to
3819 * sit in waitpid(2) and terminate.
3820 */
3821static void setup_wait_parent_capabilities(void)
3822{
3823    capng_setpid(syscall(SYS_gettid));
3824    capng_clear(CAPNG_SELECT_BOTH);
3825    capng_apply(CAPNG_SELECT_BOTH);
3826}
3827
3828/*
3829 * Move to a new mount, net, and pid namespaces to isolate this process.
3830 */
3831static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
3832{
3833    pid_t child;
3834
3835    /*
3836     * Create a new pid namespace for *child* processes.  We'll have to
3837     * fork in order to enter the new pid namespace.  A new mount namespace
3838     * is also needed so that we can remount /proc for the new pid
3839     * namespace.
3840     *
3841     * Our UNIX domain sockets have been created.  Now we can move to
3842     * an empty network namespace to prevent TCP/IP and other network
3843     * activity in case this process is compromised.
3844     */
3845    if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) {
3846        fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
3847        exit(1);
3848    }
3849
3850    child = fork();
3851    if (child < 0) {
3852        fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
3853        exit(1);
3854    }
3855    if (child > 0) {
3856        pid_t waited;
3857        int wstatus;
3858
3859        setup_wait_parent_capabilities();
3860
3861        /* The parent waits for the child */
3862        do {
3863            waited = waitpid(child, &wstatus, 0);
3864        } while (waited < 0 && errno == EINTR && !se->exited);
3865
3866        /* We were terminated by a signal, see fuse_signals.c */
3867        if (se->exited) {
3868            exit(0);
3869        }
3870
3871        if (WIFEXITED(wstatus)) {
3872            exit(WEXITSTATUS(wstatus));
3873        }
3874
3875        exit(1);
3876    }
3877
3878    /* Send us SIGTERM when the parent thread terminates, see prctl(2) */
3879    prctl(PR_SET_PDEATHSIG, SIGTERM);
3880
3881    /*
3882     * If the mounts have shared propagation then we want to opt out so our
3883     * mount changes don't affect the parent mount namespace.
3884     */
3885    if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) {
3886        fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n");
3887        exit(1);
3888    }
3889
3890    /* The child must remount /proc to use the new pid namespace */
3891    if (mount("proc", "/proc", "proc",
3892              MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) {
3893        fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n");
3894        exit(1);
3895    }
3896
3897    /* Get the /proc/self/task descriptor */
3898    lo->proc_self_task = open("/proc/self/task/", O_PATH);
3899    if (lo->proc_self_task == -1) {
3900        fuse_log(FUSE_LOG_ERR, "open(/proc/self/task, O_PATH): %m\n");
3901        exit(1);
3902    }
3903
3904    lo->use_fscreate = is_fscreate_usable(lo);
3905
3906    /*
3907     * We only need /proc/self/fd. Prevent ".." from accessing parent
3908     * directories of /proc/self/fd by bind-mounting it over /proc. Since / was
3909     * previously remounted with MS_REC | MS_SLAVE this mount change only
3910     * affects our process.
3911     */
3912    if (mount("/proc/self/fd", "/proc", NULL, MS_BIND, NULL) < 0) {
3913        fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, MS_BIND): %m\n");
3914        exit(1);
3915    }
3916
3917    /* Get the /proc (actually /proc/self/fd, see above) file descriptor */
3918    lo->proc_self_fd = open("/proc", O_PATH);
3919    if (lo->proc_self_fd == -1) {
3920        fuse_log(FUSE_LOG_ERR, "open(/proc, O_PATH): %m\n");
3921        exit(1);
3922    }
3923}
3924
3925/*
3926 * Capture the capability state, we'll need to restore this for individual
3927 * threads later; see load_capng.
3928 */
3929static void setup_capng(void)
3930{
3931    /* Note this accesses /proc so has to happen before the sandbox */
3932    if (capng_get_caps_process()) {
3933        fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n");
3934        exit(1);
3935    }
3936    pthread_mutex_init(&cap.mutex, NULL);
3937    pthread_mutex_lock(&cap.mutex);
3938    cap.saved = capng_save_state();
3939    if (!cap.saved) {
3940        fuse_log(FUSE_LOG_ERR, "capng_save_state\n");
3941        exit(1);
3942    }
3943    pthread_mutex_unlock(&cap.mutex);
3944}
3945
3946static void cleanup_capng(void)
3947{
3948    free(cap.saved);
3949    cap.saved = NULL;
3950    pthread_mutex_destroy(&cap.mutex);
3951}
3952
3953
3954/*
3955 * Make the source directory our root so symlinks cannot escape and no other
3956 * files are accessible.  Assumes unshare(CLONE_NEWNS) was already called.
3957 */
3958static void setup_mounts(const char *source)
3959{
3960    int oldroot;
3961    int newroot;
3962
3963    if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) {
3964        fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
3965        exit(1);
3966    }
3967
3968    /* This magic is based on lxc's lxc_pivot_root() */
3969    oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
3970    if (oldroot < 0) {
3971        fuse_log(FUSE_LOG_ERR, "open(/): %m\n");
3972        exit(1);
3973    }
3974
3975    newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
3976    if (newroot < 0) {
3977        fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source);
3978        exit(1);
3979    }
3980
3981    if (fchdir(newroot) < 0) {
3982        fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
3983        exit(1);
3984    }
3985
3986    if (syscall(__NR_pivot_root, ".", ".") < 0) {
3987        fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n");
3988        exit(1);
3989    }
3990
3991    if (fchdir(oldroot) < 0) {
3992        fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n");
3993        exit(1);
3994    }
3995
3996    if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) {
3997        fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n");
3998        exit(1);
3999    }
4000
4001    if (umount2(".", MNT_DETACH) < 0) {
4002        fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n");
4003        exit(1);
4004    }
4005
4006    if (fchdir(newroot) < 0) {
4007        fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
4008        exit(1);
4009    }
4010
4011    close(newroot);
4012    close(oldroot);
4013}
4014
4015/*
4016 * Only keep capabilities in allowlist that are needed for file system operation
4017 * The (possibly NULL) modcaps_in string passed in is free'd before exit.
4018 */
4019static void setup_capabilities(char *modcaps_in)
4020{
4021    char *modcaps = modcaps_in;
4022    pthread_mutex_lock(&cap.mutex);
4023    capng_restore_state(&cap.saved);
4024
4025    /*
4026     * Add to allowlist file system-related capabilities that are needed for a
4027     * file server to act like root.  Drop everything else like networking and
4028     * sysadmin capabilities.
4029     *
4030     * Exclusions:
4031     * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
4032     *    and we don't support that.
4033     * 2. CAP_MAC_OVERRIDE is not included because it only seems to be
4034     *    used by the Smack LSM.  Omit it until there is demand for it.
4035     */
4036    capng_setpid(syscall(SYS_gettid));
4037    capng_clear(CAPNG_SELECT_BOTH);
4038    if (capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE,
4039            CAP_CHOWN,
4040            CAP_DAC_OVERRIDE,
4041            CAP_FOWNER,
4042            CAP_FSETID,
4043            CAP_SETGID,
4044            CAP_SETUID,
4045            CAP_MKNOD,
4046            CAP_SETFCAP,
4047            -1)) {
4048        fuse_log(FUSE_LOG_ERR, "%s: capng_updatev failed\n", __func__);
4049        exit(1);
4050    }
4051
4052    /*
4053     * The modcaps option is a colon separated list of caps,
4054     * each preceded by either + or -.
4055     */
4056    while (modcaps) {
4057        capng_act_t action;
4058        int cap;
4059
4060        char *next = strchr(modcaps, ':');
4061        if (next) {
4062            *next = '\0';
4063            next++;
4064        }
4065
4066        switch (modcaps[0]) {
4067        case '+':
4068            action = CAPNG_ADD;
4069            break;
4070
4071        case '-':
4072            action = CAPNG_DROP;
4073            break;
4074
4075        default:
4076            fuse_log(FUSE_LOG_ERR,
4077                     "%s: Expecting '+'/'-' in modcaps but found '%c'\n",
4078                     __func__, modcaps[0]);
4079            exit(1);
4080        }
4081        cap = capng_name_to_capability(modcaps + 1);
4082        if (cap < 0) {
4083            fuse_log(FUSE_LOG_ERR, "%s: Unknown capability '%s'\n", __func__,
4084                     modcaps);
4085            exit(1);
4086        }
4087        if (capng_update(action, CAPNG_PERMITTED | CAPNG_EFFECTIVE, cap)) {
4088            fuse_log(FUSE_LOG_ERR, "%s: capng_update failed for '%s'\n",
4089                     __func__, modcaps);
4090            exit(1);
4091        }
4092
4093        modcaps = next;
4094    }
4095    g_free(modcaps_in);
4096
4097    if (capng_apply(CAPNG_SELECT_BOTH)) {
4098        fuse_log(FUSE_LOG_ERR, "%s: capng_apply failed\n", __func__);
4099        exit(1);
4100    }
4101
4102    cap.saved = capng_save_state();
4103    if (!cap.saved) {
4104        fuse_log(FUSE_LOG_ERR, "%s: capng_save_state failed\n", __func__);
4105        exit(1);
4106    }
4107    pthread_mutex_unlock(&cap.mutex);
4108}
4109
4110/*
4111 * Use chroot as a weaker sandbox for environments where the process is
4112 * launched without CAP_SYS_ADMIN.
4113 */
4114static void setup_chroot(struct lo_data *lo)
4115{
4116    lo->proc_self_fd = open("/proc/self/fd", O_PATH);
4117    if (lo->proc_self_fd == -1) {
4118        fuse_log(FUSE_LOG_ERR, "open(\"/proc/self/fd\", O_PATH): %m\n");
4119        exit(1);
4120    }
4121
4122    lo->proc_self_task = open("/proc/self/task", O_PATH);
4123    if (lo->proc_self_fd == -1) {
4124        fuse_log(FUSE_LOG_ERR, "open(\"/proc/self/task\", O_PATH): %m\n");
4125        exit(1);
4126    }
4127
4128    lo->use_fscreate = is_fscreate_usable(lo);
4129
4130    /*
4131     * Make the shared directory the file system root so that FUSE_OPEN
4132     * (lo_open()) cannot escape the shared directory by opening a symlink.
4133     *
4134     * The chroot(2) syscall is later disabled by seccomp and the
4135     * CAP_SYS_CHROOT capability is dropped so that tampering with the chroot
4136     * is not possible.
4137     *
4138     * However, it's still possible to escape the chroot via lo->proc_self_fd
4139     * but that requires first gaining control of the process.
4140     */
4141    if (chroot(lo->source) != 0) {
4142        fuse_log(FUSE_LOG_ERR, "chroot(\"%s\"): %m\n", lo->source);
4143        exit(1);
4144    }
4145
4146    /* Move into the chroot */
4147    if (chdir("/") != 0) {
4148        fuse_log(FUSE_LOG_ERR, "chdir(\"/\"): %m\n");
4149        exit(1);
4150    }
4151}
4152
4153/*
4154 * Lock down this process to prevent access to other processes or files outside
4155 * source directory.  This reduces the impact of arbitrary code execution bugs.
4156 */
4157static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
4158                          bool enable_syslog)
4159{
4160    if (lo->sandbox == SANDBOX_NAMESPACE) {
4161        setup_namespaces(lo, se);
4162        setup_mounts(lo->source);
4163    } else {
4164        setup_chroot(lo);
4165    }
4166
4167    setup_seccomp(enable_syslog);
4168    setup_capabilities(g_strdup(lo->modcaps));
4169}
4170
4171/* Set the maximum number of open file descriptors */
4172static void setup_nofile_rlimit(unsigned long rlimit_nofile)
4173{
4174    struct rlimit rlim = {
4175        .rlim_cur = rlimit_nofile,
4176        .rlim_max = rlimit_nofile,
4177    };
4178
4179    if (rlimit_nofile == 0) {
4180        return; /* nothing to do */
4181    }
4182
4183    if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
4184        /* Ignore SELinux denials */
4185        if (errno == EPERM) {
4186            return;
4187        }
4188
4189        fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n");
4190        exit(1);
4191    }
4192}
4193
4194static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
4195{
4196    g_autofree char *localfmt = NULL;
4197
4198    if (current_log_level < level) {
4199        return;
4200    }
4201
4202    if (current_log_level == FUSE_LOG_DEBUG) {
4203        if (use_syslog) {
4204            /* no timestamp needed */
4205            localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid),
4206                                       fmt);
4207        } else {
4208            g_autoptr(GDateTime) now = g_date_time_new_now_utc();
4209            g_autofree char *nowstr = g_date_time_format(now, "%Y-%m-%d %H:%M:%S.%f%z");
4210            localfmt = g_strdup_printf("[%s] [ID: %08ld] %s",
4211                                       nowstr, syscall(__NR_gettid), fmt);
4212        }
4213        fmt = localfmt;
4214    }
4215
4216    if (use_syslog) {
4217        int priority = LOG_ERR;
4218        switch (level) {
4219        case FUSE_LOG_EMERG:
4220            priority = LOG_EMERG;
4221            break;
4222        case FUSE_LOG_ALERT:
4223            priority = LOG_ALERT;
4224            break;
4225        case FUSE_LOG_CRIT:
4226            priority = LOG_CRIT;
4227            break;
4228        case FUSE_LOG_ERR:
4229            priority = LOG_ERR;
4230            break;
4231        case FUSE_LOG_WARNING:
4232            priority = LOG_WARNING;
4233            break;
4234        case FUSE_LOG_NOTICE:
4235            priority = LOG_NOTICE;
4236            break;
4237        case FUSE_LOG_INFO:
4238            priority = LOG_INFO;
4239            break;
4240        case FUSE_LOG_DEBUG:
4241            priority = LOG_DEBUG;
4242            break;
4243        }
4244        vsyslog(priority, fmt, ap);
4245    } else {
4246        vfprintf(stderr, fmt, ap);
4247    }
4248}
4249
4250static void setup_root(struct lo_data *lo, struct lo_inode *root)
4251{
4252    int fd, res;
4253    struct stat stat;
4254    uint64_t mnt_id;
4255
4256    fd = open("/", O_PATH);
4257    if (fd == -1) {
4258        fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source);
4259        exit(1);
4260    }
4261
4262    res = do_statx(lo, fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
4263                   &mnt_id);
4264    if (res == -1) {
4265        fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source);
4266        exit(1);
4267    }
4268
4269    root->filetype = S_IFDIR;
4270    root->fd = fd;
4271    root->key.ino = stat.st_ino;
4272    root->key.dev = stat.st_dev;
4273    root->key.mnt_id = mnt_id;
4274    root->nlookup = 2;
4275    g_atomic_int_set(&root->refcount, 2);
4276    if (lo->posix_lock) {
4277        pthread_mutex_init(&root->plock_mutex, NULL);
4278        root->posix_locks = g_hash_table_new_full(
4279            g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
4280    }
4281}
4282
4283static guint lo_key_hash(gconstpointer key)
4284{
4285    const struct lo_key *lkey = key;
4286
4287    return (guint)lkey->ino + (guint)lkey->dev + (guint)lkey->mnt_id;
4288}
4289
4290static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
4291{
4292    const struct lo_key *la = a;
4293    const struct lo_key *lb = b;
4294
4295    return la->ino == lb->ino && la->dev == lb->dev && la->mnt_id == lb->mnt_id;
4296}
4297
4298static void fuse_lo_data_cleanup(struct lo_data *lo)
4299{
4300    if (lo->inodes) {
4301        g_hash_table_destroy(lo->inodes);
4302    }
4303
4304    if (lo->root.posix_locks) {
4305        g_hash_table_destroy(lo->root.posix_locks);
4306    }
4307    lo_map_destroy(&lo->fd_map);
4308    lo_map_destroy(&lo->dirp_map);
4309    lo_map_destroy(&lo->ino_map);
4310
4311    if (lo->proc_self_fd >= 0) {
4312        close(lo->proc_self_fd);
4313    }
4314
4315    if (lo->proc_self_task >= 0) {
4316        close(lo->proc_self_task);
4317    }
4318
4319    if (lo->root.fd >= 0) {
4320        close(lo->root.fd);
4321    }
4322
4323    free(lo->xattrmap);
4324    free_xattrmap(lo);
4325    free(lo->xattr_security_capability);
4326    free(lo->source);
4327}
4328
4329static void qemu_version(void)
4330{
4331    printf("virtiofsd version " QEMU_FULL_VERSION "\n" QEMU_COPYRIGHT "\n");
4332}
4333
4334int main(int argc, char *argv[])
4335{
4336    struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
4337    struct fuse_session *se;
4338    struct fuse_cmdline_opts opts;
4339    struct lo_data lo = {
4340        .sandbox = SANDBOX_NAMESPACE,
4341        .debug = 0,
4342        .writeback = 0,
4343        .posix_lock = 0,
4344        .allow_direct_io = 0,
4345        .proc_self_fd = -1,
4346        .proc_self_task = -1,
4347        .user_killpriv_v2 = -1,
4348        .user_posix_acl = -1,
4349        .user_security_label = -1,
4350    };
4351    struct lo_map_elem *root_elem;
4352    struct lo_map_elem *reserve_elem;
4353    int ret = -1;
4354
4355    /* Initialize time conversion information for localtime_r(). */
4356    tzset();
4357
4358    /* Don't mask creation mode, kernel already did that */
4359    umask(0);
4360
4361    qemu_init_exec_dir(argv[0]);
4362
4363    drop_supplementary_groups();
4364
4365    pthread_mutex_init(&lo.mutex, NULL);
4366    lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
4367    lo.root.fd = -1;
4368    lo.root.fuse_ino = FUSE_ROOT_ID;
4369    lo.cache = CACHE_AUTO;
4370
4371    /*
4372     * Set up the ino map like this:
4373     * [0] Reserved (will not be used)
4374     * [1] Root inode
4375     */
4376    lo_map_init(&lo.ino_map);
4377    reserve_elem = lo_map_reserve(&lo.ino_map, 0);
4378    if (!reserve_elem) {
4379        fuse_log(FUSE_LOG_ERR, "failed to alloc reserve_elem.\n");
4380        goto err_out1;
4381    }
4382    reserve_elem->in_use = false;
4383    root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino);
4384    if (!root_elem) {
4385        fuse_log(FUSE_LOG_ERR, "failed to alloc root_elem.\n");
4386        goto err_out1;
4387    }
4388    root_elem->inode = &lo.root;
4389
4390    lo_map_init(&lo.dirp_map);
4391    lo_map_init(&lo.fd_map);
4392
4393    if (fuse_parse_cmdline(&args, &opts) != 0) {
4394        goto err_out1;
4395    }
4396    fuse_set_log_func(log_func);
4397    use_syslog = opts.syslog;
4398    if (use_syslog) {
4399        openlog("virtiofsd", LOG_PID, LOG_DAEMON);
4400    }
4401
4402    if (opts.show_help) {
4403        printf("usage: %s [options]\n\n", argv[0]);
4404        fuse_cmdline_help();
4405        printf("    -o source=PATH             shared directory tree\n");
4406        fuse_lowlevel_help();
4407        ret = 0;
4408        goto err_out1;
4409    } else if (opts.show_version) {
4410        qemu_version();
4411        fuse_lowlevel_version();
4412        ret = 0;
4413        goto err_out1;
4414    } else if (opts.print_capabilities) {
4415        print_capabilities();
4416        ret = 0;
4417        goto err_out1;
4418    }
4419
4420    if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) {
4421        goto err_out1;
4422    }
4423
4424    if (opts.log_level != 0) {
4425        current_log_level = opts.log_level;
4426    } else {
4427        /* default log level is INFO */
4428        current_log_level = FUSE_LOG_INFO;
4429    }
4430    lo.debug = opts.debug;
4431    if (lo.debug) {
4432        current_log_level = FUSE_LOG_DEBUG;
4433    }
4434    if (lo.source) {
4435        struct stat stat;
4436        int res;
4437
4438        res = lstat(lo.source, &stat);
4439        if (res == -1) {
4440            fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n",
4441                     lo.source);
4442            exit(1);
4443        }
4444        if (!S_ISDIR(stat.st_mode)) {
4445            fuse_log(FUSE_LOG_ERR, "source is not a directory\n");
4446            exit(1);
4447        }
4448    } else {
4449        lo.source = strdup("/");
4450        if (!lo.source) {
4451            fuse_log(FUSE_LOG_ERR, "failed to strdup source\n");
4452            goto err_out1;
4453        }
4454    }
4455
4456    if (lo.xattrmap) {
4457        lo.xattr = 1;
4458        parse_xattrmap(&lo);
4459    }
4460
4461    if (!lo.timeout_set) {
4462        switch (lo.cache) {
4463        case CACHE_NONE:
4464            lo.timeout = 0.0;
4465            break;
4466
4467        case CACHE_AUTO:
4468            lo.timeout = 1.0;
4469            break;
4470
4471        case CACHE_ALWAYS:
4472            lo.timeout = 86400.0;
4473            break;
4474        }
4475    } else if (lo.timeout < 0) {
4476        fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout);
4477        exit(1);
4478    }
4479
4480    if (lo.user_posix_acl == 1 && !lo.xattr) {
4481        fuse_log(FUSE_LOG_ERR, "Can't enable posix ACLs. xattrs are disabled."
4482                 "\n");
4483        exit(1);
4484    }
4485
4486    lo.use_statx = true;
4487
4488    se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo);
4489    if (se == NULL) {
4490        goto err_out1;
4491    }
4492
4493    if (fuse_set_signal_handlers(se) != 0) {
4494        goto err_out2;
4495    }
4496
4497    if (fuse_session_mount(se) != 0) {
4498        goto err_out3;
4499    }
4500
4501    fuse_daemonize(opts.foreground);
4502
4503    setup_nofile_rlimit(opts.rlimit_nofile);
4504
4505    /* Must be before sandbox since it wants /proc */
4506    setup_capng();
4507
4508    setup_sandbox(&lo, se, opts.syslog);
4509
4510    setup_root(&lo, &lo.root);
4511    /* Block until ctrl+c or fusermount -u */
4512    ret = virtio_loop(se);
4513
4514    fuse_session_unmount(se);
4515    cleanup_capng();
4516err_out3:
4517    fuse_remove_signal_handlers(se);
4518err_out2:
4519    fuse_session_destroy(se);
4520err_out1:
4521    fuse_opt_free_args(&args);
4522
4523    fuse_lo_data_cleanup(&lo);
4524
4525    return ret ? 1 : 0;
4526}
4527