qemu/tools/virtiofsd/passthrough_ll.c
<<
>>
Prefs
   1/*
   2 * FUSE: Filesystem in Userspace
   3 * Copyright (C) 2001-2007  Miklos Szeredi <miklos@szeredi.hu>
   4 *
   5 * This program can be distributed under the terms of the GNU GPLv2.
   6 * See the file COPYING.
   7 */
   8
   9/*
  10 *
  11 * This file system mirrors the existing file system hierarchy of the
  12 * system, starting at the root file system. This is implemented by
  13 * just "passing through" all requests to the corresponding user-space
  14 * libc functions. In contrast to passthrough.c and passthrough_fh.c,
  15 * this implementation uses the low-level API. Its performance should
  16 * be the least bad among the three, but many operations are not
  17 * implemented. In particular, it is not possible to remove files (or
  18 * directories) because the code necessary to defer actual removal
  19 * until the file is not opened anymore would make the example much
  20 * more complicated.
  21 *
  22 * When writeback caching is enabled (-o writeback mount option), it
  23 * is only possible to write to files for which the mounting user has
  24 * read permissions. This is because the writeback cache requires the
  25 * kernel to be able to issue read requests for all files (which the
  26 * passthrough filesystem cannot satisfy if it can't read the file in
  27 * the underlying filesystem).
  28 *
  29 * Compile with:
  30 *
  31 *     gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o
  32 * passthrough_ll
  33 *
  34 * ## Source code ##
  35 * \include passthrough_ll.c
  36 */
  37
  38#include "qemu/osdep.h"
  39#include "qemu/timer.h"
  40#include "qemu-version.h"
  41#include "qemu-common.h"
  42#include "fuse_virtio.h"
  43#include "fuse_log.h"
  44#include "fuse_lowlevel.h"
  45#include "standard-headers/linux/fuse.h"
  46#include <cap-ng.h>
  47#include <dirent.h>
  48#include <pthread.h>
  49#include <sys/file.h>
  50#include <sys/mount.h>
  51#include <sys/prctl.h>
  52#include <sys/resource.h>
  53#include <sys/syscall.h>
  54#include <sys/wait.h>
  55#include <sys/xattr.h>
  56#include <syslog.h>
  57
  58#include "qemu/cutils.h"
  59#include "passthrough_helpers.h"
  60#include "passthrough_seccomp.h"
  61
/*
 * Keep track of inode posix locks for each owner.
 * posix_locks_value_destroy() closes @fd and frees the entry.
 */
struct lo_inode_plock {
    uint64_t lock_owner; /* lock owner this entry tracks */
    int fd; /* fd for OFD locks */
};
  67
/*
 * One slot of a struct lo_map.  While in_use is true the slot carries one of
 * the object members (inode, dirp or fd, depending on which map it lives in).
 * While in_use is false, freelist is the index of the next free slot, or -1
 * at the end of the free list.
 */
struct lo_map_elem {
    union {
        struct lo_inode *inode;
        struct lo_dirp *dirp;
        int fd;
        ssize_t freelist;
    };
    bool in_use;
};
  77
/* Maps FUSE fh or ino values to internal objects */
struct lo_map {
    struct lo_map_elem *elems; /* slot array, indexed by the fh/ino value */
    size_t nelems; /* number of slots in elems */
    ssize_t freelist; /* index of the first free slot, -1 if there is none */
};
  84
/* Key used to look up a struct lo_inode in lo_data.inodes (see lo_find()) */
struct lo_key {
    ino_t ino; /* st_ino of the file */
    dev_t dev; /* st_dev of the file */
    uint64_t mnt_id; /* mount ID reported by do_statx(), 0 if unavailable */
};
  90
/* In-memory state for one file of the shared directory known to the client */
struct lo_inode {
    /* fd for the file; lo_do_lookup() opens it with O_PATH | O_NOFOLLOW */
    int fd;

    /*
     * Atomic reference count for this object.  The nlookup field holds a
     * reference and releases it when nlookup reaches 0.
     */
    gint refcount;

    struct lo_key key;

    /*
     * This counter keeps the inode alive during the FUSE session.
     * Incremented when the FUSE inode number is sent in a reply
     * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc).  Decremented when an inode is
     * released by a FUSE_FORGET request.
     *
     * Note that this value is untrusted because the client can manipulate
     * it arbitrarily using FUSE_FORGET requests.
     *
     * Protected by lo->mutex.
     */
    uint64_t nlookup;

    fuse_ino_t fuse_ino;
    pthread_mutex_t plock_mutex;
    GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */

    /* File type bits only (st_mode & S_IFMT), cached at lookup time */
    mode_t filetype;
};
 121
/*
 * A set of process credentials: effective uid/gid plus umask.
 * NOTE(review): no user is visible in this part of the file; presumably used
 * to save and restore credentials around file creation -- confirm.
 */
struct lo_cred {
    uid_t euid;
    gid_t egid;
    mode_t umask;
};
 127
/* Values of lo_data.cache, selected with the cache=none|auto|always options */
enum {
    CACHE_NONE,
    CACHE_AUTO,
    CACHE_ALWAYS,
};
 133
/* Values of lo_data.sandbox, selected with the sandbox=namespace|chroot options */
enum {
    SANDBOX_NAMESPACE,
    SANDBOX_CHROOT,
};
 138
/*
 * One entry of lo_data.xattr_map_list.
 * NOTE(review): the field semantics are not visible in this part of the file;
 * presumably key is the name (prefix) to match, prepend the string added when
 * mapping and flags the rule type/scope -- confirm against the xattrmap parser.
 */
typedef struct xattr_map_entry {
    char *key;
    char *prepend;
    unsigned int flags;
} XattrMapEntry;
 144
/*
 * Global state of the passthrough file system.  Most of the plain option
 * fields are filled in from the command line through lo_opts[].
 */
struct lo_data {
    pthread_mutex_t mutex;
    int sandbox; /* SANDBOX_NAMESPACE or SANDBOX_CHROOT */
    int debug;
    int writeback;
    int flock;
    int posix_lock;
    int xattr;
    char *xattrmap;
    /* Name security.capability is remapped to, NULL if it is not remapped */
    char *xattr_security_capability;
    char *source;
    char *modcaps;
    double timeout; /* used as attr/entry timeout in replies */
    int cache; /* CACHE_NONE, CACHE_AUTO or CACHE_ALWAYS */
    int timeout_set; /* set to 1 when a timeout= option was given */
    int readdirplus_set;
    int readdirplus_clear;
    int allow_direct_io;
    int announce_submounts;
    bool use_statx; /* cleared by do_statx() when statx() returns ENOSYS */
    struct lo_inode root;
    GHashTable *inodes; /* protected by lo->mutex */
    struct lo_map ino_map; /* protected by lo->mutex */
    struct lo_map dirp_map; /* protected by lo->mutex */
    struct lo_map fd_map; /* protected by lo->mutex */
    XattrMapEntry *xattr_map_list;
    size_t xattr_map_nentries;

    /* An O_PATH file descriptor to /proc/self/fd/ */
    int proc_self_fd;
    /*
     * user_killpriv_v2 is the user's request (1 = enable, 0 = disable,
     * -1 = not specified); killpriv_v2 is the value negotiated in lo_init().
     */
    int user_killpriv_v2, killpriv_v2;
    /* If set, virtiofsd is responsible for setting umask during creation */
    bool change_umask;
    /* user_posix_acl is the user's request; posix_acl is set in lo_init() */
    int user_posix_acl, posix_acl;
};
 180
/*
 * Command line option table.  Each entry gives the option template, the
 * offset of the struct lo_data field it writes and, for options without a
 * format specifier, the value stored in that field.
 */
static const struct fuse_opt lo_opts[] = {
    { "sandbox=namespace",
      offsetof(struct lo_data, sandbox),
      SANDBOX_NAMESPACE },
    { "sandbox=chroot",
      offsetof(struct lo_data, sandbox),
      SANDBOX_CHROOT },
    { "writeback", offsetof(struct lo_data, writeback), 1 },
    { "no_writeback", offsetof(struct lo_data, writeback), 0 },
    { "source=%s", offsetof(struct lo_data, source), 0 },
    { "flock", offsetof(struct lo_data, flock), 1 },
    { "no_flock", offsetof(struct lo_data, flock), 0 },
    { "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
    { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
    { "xattr", offsetof(struct lo_data, xattr), 1 },
    { "no_xattr", offsetof(struct lo_data, xattr), 0 },
    { "xattrmap=%s", offsetof(struct lo_data, xattrmap), 0 },
    { "modcaps=%s", offsetof(struct lo_data, modcaps), 0 },
    /* timeout= appears twice: once to parse the value, once to flag it */
    { "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
    { "timeout=", offsetof(struct lo_data, timeout_set), 1 },
    { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE },
    { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO },
    { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS },
    /* readdirplus and no_readdirplus set two separate flags */
    { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 },
    { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 },
    { "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 },
    { "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 },
    { "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 },
    { "killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 1 },
    { "no_killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 0 },
    { "posix_acl", offsetof(struct lo_data, user_posix_acl), 1 },
    { "no_posix_acl", offsetof(struct lo_data, user_posix_acl), 0 },
    FUSE_OPT_END
};
static bool use_syslog = false;
static int current_log_level;
static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
                                 uint64_t n);

/* Saved cap-ng state shared by all threads; mutex protects saved */
static struct {
    pthread_mutex_t mutex;
    void *saved;
} cap;
/* True once the current thread has loaded its cap-ng state from cap.saved */
static __thread bool cap_loaded = 0;

static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
                                uint64_t mnt_id);
static int xattr_map_client(const struct lo_data *lo, const char *client_name,
                            char **out_name);
 231
 232static bool is_dot_or_dotdot(const char *name)
 233{
 234    return name[0] == '.' &&
 235           (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'));
 236}
 237
 238/* Is `path` a single path component that is not "." or ".."? */
 239static bool is_safe_path_component(const char *path)
 240{
 241    if (strchr(path, '/')) {
 242        return false;
 243    }
 244
 245    return !is_dot_or_dotdot(path);
 246}
 247
 248static bool is_empty(const char *name)
 249{
 250    return name[0] == '\0';
 251}
 252
 253static struct lo_data *lo_data(fuse_req_t req)
 254{
 255    return (struct lo_data *)fuse_req_userdata(req);
 256}
 257
 258/*
 259 * Load capng's state from our saved state if the current thread
 260 * hadn't previously been loaded.
 261 * returns 0 on success
 262 */
 263static int load_capng(void)
 264{
 265    if (!cap_loaded) {
 266        pthread_mutex_lock(&cap.mutex);
 267        capng_restore_state(&cap.saved);
 268        /*
 269         * restore_state free's the saved copy
 270         * so make another.
 271         */
 272        cap.saved = capng_save_state();
 273        if (!cap.saved) {
 274            pthread_mutex_unlock(&cap.mutex);
 275            fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n");
 276            return -EINVAL;
 277        }
 278        pthread_mutex_unlock(&cap.mutex);
 279
 280        /*
 281         * We want to use the loaded state for our pid,
 282         * not the original
 283         */
 284        capng_setpid(syscall(SYS_gettid));
 285        cap_loaded = true;
 286    }
 287    return 0;
 288}
 289
 290/*
 291 * Helpers for dropping and regaining effective capabilities. Returns 0
 292 * on success, error otherwise
 293 */
 294static int drop_effective_cap(const char *cap_name, bool *cap_dropped)
 295{
 296    int cap, ret;
 297
 298    cap = capng_name_to_capability(cap_name);
 299    if (cap < 0) {
 300        ret = errno;
 301        fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
 302                 cap_name, strerror(errno));
 303        goto out;
 304    }
 305
 306    if (load_capng()) {
 307        ret = errno;
 308        fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
 309        goto out;
 310    }
 311
 312    /* We dont have this capability in effective set already. */
 313    if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) {
 314        ret = 0;
 315        goto out;
 316    }
 317
 318    if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) {
 319        ret = errno;
 320        fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n");
 321        goto out;
 322    }
 323
 324    if (capng_apply(CAPNG_SELECT_CAPS)) {
 325        ret = errno;
 326        fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n");
 327        goto out;
 328    }
 329
 330    ret = 0;
 331    if (cap_dropped) {
 332        *cap_dropped = true;
 333    }
 334
 335out:
 336    return ret;
 337}
 338
 339static int gain_effective_cap(const char *cap_name)
 340{
 341    int cap;
 342    int ret = 0;
 343
 344    cap = capng_name_to_capability(cap_name);
 345    if (cap < 0) {
 346        ret = errno;
 347        fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
 348                 cap_name, strerror(errno));
 349        goto out;
 350    }
 351
 352    if (load_capng()) {
 353        ret = errno;
 354        fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
 355        goto out;
 356    }
 357
 358    if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) {
 359        ret = errno;
 360        fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n");
 361        goto out;
 362    }
 363
 364    if (capng_apply(CAPNG_SELECT_CAPS)) {
 365        ret = errno;
 366        fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n");
 367        goto out;
 368    }
 369    ret = 0;
 370
 371out:
 372    return ret;
 373}
 374
 375/*
 376 * The host kernel normally drops security.capability xattr's on
 377 * any write, however if we're remapping xattr names we need to drop
 378 * whatever the clients security.capability is actually stored as.
 379 */
 380static int drop_security_capability(const struct lo_data *lo, int fd)
 381{
 382    if (!lo->xattr_security_capability) {
 383        /* We didn't remap the name, let the host kernel do it */
 384        return 0;
 385    }
 386    if (!fremovexattr(fd, lo->xattr_security_capability)) {
 387        /* All good */
 388        return 0;
 389    }
 390
 391    switch (errno) {
 392    case ENODATA:
 393        /* Attribute didn't exist, that's fine */
 394        return 0;
 395
 396    case ENOTSUP:
 397        /* FS didn't support attribute anyway, also fine */
 398        return 0;
 399
 400    default:
 401        /* Hmm other error */
 402        return errno;
 403    }
 404}
 405
 406static void lo_map_init(struct lo_map *map)
 407{
 408    map->elems = NULL;
 409    map->nelems = 0;
 410    map->freelist = -1;
 411}
 412
 413static void lo_map_destroy(struct lo_map *map)
 414{
 415    g_free(map->elems);
 416}
 417
 418static int lo_map_grow(struct lo_map *map, size_t new_nelems)
 419{
 420    struct lo_map_elem *new_elems;
 421    size_t i;
 422
 423    if (new_nelems <= map->nelems) {
 424        return 1;
 425    }
 426
 427    new_elems = g_try_realloc_n(map->elems, new_nelems, sizeof(map->elems[0]));
 428    if (!new_elems) {
 429        return 0;
 430    }
 431
 432    for (i = map->nelems; i < new_nelems; i++) {
 433        new_elems[i].freelist = i + 1;
 434        new_elems[i].in_use = false;
 435    }
 436    new_elems[new_nelems - 1].freelist = -1;
 437
 438    map->elems = new_elems;
 439    map->freelist = map->nelems;
 440    map->nelems = new_nelems;
 441    return 1;
 442}
 443
 444static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map)
 445{
 446    struct lo_map_elem *elem;
 447
 448    if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) {
 449        return NULL;
 450    }
 451
 452    elem = &map->elems[map->freelist];
 453    map->freelist = elem->freelist;
 454
 455    elem->in_use = true;
 456
 457    return elem;
 458}
 459
 460static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key)
 461{
 462    ssize_t *prev;
 463
 464    if (!lo_map_grow(map, key + 1)) {
 465        return NULL;
 466    }
 467
 468    for (prev = &map->freelist; *prev != -1;
 469         prev = &map->elems[*prev].freelist) {
 470        if (*prev == key) {
 471            struct lo_map_elem *elem = &map->elems[key];
 472
 473            *prev = elem->freelist;
 474            elem->in_use = true;
 475            return elem;
 476        }
 477    }
 478    return NULL;
 479}
 480
 481static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key)
 482{
 483    if (key >= map->nelems) {
 484        return NULL;
 485    }
 486    if (!map->elems[key].in_use) {
 487        return NULL;
 488    }
 489    return &map->elems[key];
 490}
 491
 492static void lo_map_remove(struct lo_map *map, size_t key)
 493{
 494    struct lo_map_elem *elem;
 495
 496    if (key >= map->nelems) {
 497        return;
 498    }
 499
 500    elem = &map->elems[key];
 501    if (!elem->in_use) {
 502        return;
 503    }
 504
 505    elem->in_use = false;
 506
 507    elem->freelist = map->freelist;
 508    map->freelist = key;
 509}
 510
 511/* Assumes lo->mutex is held */
 512static ssize_t lo_add_fd_mapping(struct lo_data *lo, int fd)
 513{
 514    struct lo_map_elem *elem;
 515
 516    elem = lo_map_alloc_elem(&lo->fd_map);
 517    if (!elem) {
 518        return -1;
 519    }
 520
 521    elem->fd = fd;
 522    return elem - lo->fd_map.elems;
 523}
 524
 525/* Assumes lo->mutex is held */
 526static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp)
 527{
 528    struct lo_map_elem *elem;
 529
 530    elem = lo_map_alloc_elem(&lo_data(req)->dirp_map);
 531    if (!elem) {
 532        return -1;
 533    }
 534
 535    elem->dirp = dirp;
 536    return elem - lo_data(req)->dirp_map.elems;
 537}
 538
 539/* Assumes lo->mutex is held */
 540static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode)
 541{
 542    struct lo_map_elem *elem;
 543
 544    elem = lo_map_alloc_elem(&lo_data(req)->ino_map);
 545    if (!elem) {
 546        return -1;
 547    }
 548
 549    elem->inode = inode;
 550    return elem - lo_data(req)->ino_map.elems;
 551}
 552
 553static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep)
 554{
 555    struct lo_inode *inode = *inodep;
 556
 557    if (!inode) {
 558        return;
 559    }
 560
 561    *inodep = NULL;
 562
 563    if (g_atomic_int_dec_and_test(&inode->refcount)) {
 564        close(inode->fd);
 565        free(inode);
 566    }
 567}
 568
 569/* Caller must release refcount using lo_inode_put() */
 570static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino)
 571{
 572    struct lo_data *lo = lo_data(req);
 573    struct lo_map_elem *elem;
 574
 575    pthread_mutex_lock(&lo->mutex);
 576    elem = lo_map_get(&lo->ino_map, ino);
 577    if (elem) {
 578        g_atomic_int_inc(&elem->inode->refcount);
 579    }
 580    pthread_mutex_unlock(&lo->mutex);
 581
 582    if (!elem) {
 583        return NULL;
 584    }
 585
 586    return elem->inode;
 587}
 588
 589/*
 590 * TODO Remove this helper and force callers to hold an inode refcount until
 591 * they are done with the fd.  This will be done in a later patch to make
 592 * review easier.
 593 */
 594static int lo_fd(fuse_req_t req, fuse_ino_t ino)
 595{
 596    struct lo_inode *inode = lo_inode(req, ino);
 597    int fd;
 598
 599    if (!inode) {
 600        return -1;
 601    }
 602
 603    fd = inode->fd;
 604    lo_inode_put(lo_data(req), &inode);
 605    return fd;
 606}
 607
 608/*
 609 * Open a file descriptor for an inode. Returns -EBADF if the inode is not a
 610 * regular file or a directory.
 611 *
 612 * Use this helper function instead of raw openat(2) to prevent security issues
 613 * when a malicious client opens special files such as block device nodes.
 614 * Symlink inodes are also rejected since symlinks must already have been
 615 * traversed on the client side.
 616 */
 617static int lo_inode_open(struct lo_data *lo, struct lo_inode *inode,
 618                         int open_flags)
 619{
 620    g_autofree char *fd_str = g_strdup_printf("%d", inode->fd);
 621    int fd;
 622
 623    if (!S_ISREG(inode->filetype) && !S_ISDIR(inode->filetype)) {
 624        return -EBADF;
 625    }
 626
 627    /*
 628     * The file is a symlink so O_NOFOLLOW must be ignored. We checked earlier
 629     * that the inode is not a special file but if an external process races
 630     * with us then symlinks are traversed here. It is not possible to escape
 631     * the shared directory since it is mounted as "/" though.
 632     */
 633    fd = openat(lo->proc_self_fd, fd_str, open_flags & ~O_NOFOLLOW);
 634    if (fd < 0) {
 635        return -errno;
 636    }
 637    return fd;
 638}
 639
/*
 * Negotiate connection capabilities with the client.
 *
 * @userdata: the struct lo_data of this file system
 * @conn:     connection info; conn->capable lists what the client offers,
 *            conn->want is adjusted here according to the configured options
 *
 * Besides conn->want this also updates lo->announce_submounts,
 * lo->killpriv_v2, lo->change_umask and lo->posix_acl.
 */
static void lo_init(void *userdata, struct fuse_conn_info *conn)
{
    struct lo_data *lo = (struct lo_data *)userdata;

    if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) {
        conn->want |= FUSE_CAP_EXPORT_SUPPORT;
    }

    if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
        fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n");
        conn->want |= FUSE_CAP_WRITEBACK_CACHE;
    }
    if (conn->capable & FUSE_CAP_FLOCK_LOCKS) {
        if (lo->flock) {
            fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
            conn->want |= FUSE_CAP_FLOCK_LOCKS;
        } else {
            fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n");
            conn->want &= ~FUSE_CAP_FLOCK_LOCKS;
        }
    }

    if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
        if (lo->posix_lock) {
            fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
            conn->want |= FUSE_CAP_POSIX_LOCKS;
        } else {
            fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
            conn->want &= ~FUSE_CAP_POSIX_LOCKS;
        }
    }

    /*
     * readdirplus is turned off when explicitly disabled, or with cache=none
     * unless it was explicitly requested.
     */
    if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
        lo->readdirplus_clear) {
        fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
        conn->want &= ~FUSE_CAP_READDIRPLUS;
    }

    if (!(conn->capable & FUSE_CAP_SUBMOUNTS) && lo->announce_submounts) {
        fuse_log(FUSE_LOG_WARNING, "lo_init: Cannot announce submounts, client "
                 "does not support it\n");
        lo->announce_submounts = false;
    }

    if (lo->user_killpriv_v2 == 1) {
        /*
         * User explicitly asked for this option. Enable it unconditionally.
         * If connection does not have this capability, it should fail
         * in fuse_lowlevel.c
         */
        fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n");
        conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2;
        lo->killpriv_v2 = 1;
    } else if (lo->user_killpriv_v2 == -1 &&
               conn->capable & FUSE_CAP_HANDLE_KILLPRIV_V2) {
        /*
         * User did not specify a value for killpriv_v2. By default enable it
         * if connection offers this capability
         */
        fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n");
        conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2;
        lo->killpriv_v2 = 1;
    } else {
        /*
         * Either user specified to disable killpriv_v2, or connection does
         * not offer this capability. Disable killpriv_v2 in both the cases
         */
        fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling killpriv_v2\n");
        conn->want &= ~FUSE_CAP_HANDLE_KILLPRIV_V2;
        lo->killpriv_v2 = 0;
    }

    if (lo->user_posix_acl == 1) {
        /*
         * User explicitly asked for this option. Enable it unconditionally.
         * If connection does not have this capability, print error message
         * now. It will fail later in fuse_lowlevel.c
         */
        if (!(conn->capable & FUSE_CAP_POSIX_ACL) ||
            !(conn->capable & FUSE_CAP_DONT_MASK) ||
            !(conn->capable & FUSE_CAP_SETXATTR_EXT)) {
            fuse_log(FUSE_LOG_ERR, "lo_init: Can not enable posix acl."
                     " kernel does not support FUSE_POSIX_ACL, FUSE_DONT_MASK"
                     " or FUSE_SETXATTR_EXT capability.\n");
        } else {
            fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling posix acl\n");
        }

        /* Requested even when the check above failed, see comment above */
        conn->want |= FUSE_CAP_POSIX_ACL | FUSE_CAP_DONT_MASK |
                      FUSE_CAP_SETXATTR_EXT;
        lo->change_umask = true;
        lo->posix_acl = true;
    } else {
        /* User either did not specify anything or wants it disabled */
        fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix_acl\n");
        conn->want &= ~FUSE_CAP_POSIX_ACL;
    }
}
 738
 739static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
 740                       struct fuse_file_info *fi)
 741{
 742    int res;
 743    struct stat buf;
 744    struct lo_data *lo = lo_data(req);
 745
 746    (void)fi;
 747
 748    res =
 749        fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
 750    if (res == -1) {
 751        return (void)fuse_reply_err(req, errno);
 752    }
 753
 754    fuse_reply_attr(req, &buf, lo->timeout);
 755}
 756
 757static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi)
 758{
 759    struct lo_data *lo = lo_data(req);
 760    struct lo_map_elem *elem;
 761
 762    pthread_mutex_lock(&lo->mutex);
 763    elem = lo_map_get(&lo->fd_map, fi->fh);
 764    pthread_mutex_unlock(&lo->mutex);
 765
 766    if (!elem) {
 767        return -1;
 768    }
 769
 770    return elem->fd;
 771}
 772
/*
 * Change attributes of inode @ino.
 *
 * @attr:  new attribute values
 * @valid: FUSE_SET_ATTR_* bits selecting which members of @attr to apply
 * @fi:    optional open file; when non-NULL, the fd registered for fi->fh is
 *         used for the mode, size and timestamp changes
 *
 * The changes are applied in the order mode, owner/group, size, timestamps,
 * stopping at the first failure.  On success the reply is produced by
 * lo_getattr(); on failure the errno value is sent back.
 */
static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
                       int valid, struct fuse_file_info *fi)
{
    int saverr;
    char procname[64];
    struct lo_data *lo = lo_data(req);
    struct lo_inode *inode;
    int ifd;
    int res;
    int fd = -1;

    inode = lo_inode(req, ino);
    if (!inode) {
        fuse_reply_err(req, EBADF);
        return;
    }

    ifd = inode->fd;

    /* If fi->fh is invalid we'll report EBADF later */
    if (fi) {
        fd = lo_fi_fd(req, fi);
    }

    if (valid & FUSE_SET_ATTR_MODE) {
        if (fi) {
            res = fchmod(fd, attr->st_mode);
        } else {
            /* No open file: go through the inode's entry below proc_self_fd */
            sprintf(procname, "%i", ifd);
            res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0);
        }
        if (res == -1) {
            saverr = errno;
            goto out_err;
        }
    }
    if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) {
        /* -1 leaves the respective ID unchanged */
        uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1;
        gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1;

        saverr = drop_security_capability(lo, ifd);
        if (saverr) {
            goto out_err;
        }

        res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
        if (res == -1) {
            saverr = errno;
            goto out_err;
        }
    }
    if (valid & FUSE_SET_ATTR_SIZE) {
        int truncfd;
        bool kill_suidgid;
        bool cap_fsetid_dropped = false;

        kill_suidgid = lo->killpriv_v2 && (valid & FUSE_SET_ATTR_KILL_SUIDGID);
        if (fi) {
            truncfd = fd;
        } else {
            /* Temporary fd, closed again on every path below */
            truncfd = lo_inode_open(lo, inode, O_RDWR);
            if (truncfd < 0) {
                saverr = -truncfd;
                goto out_err;
            }
        }

        saverr = drop_security_capability(lo, truncfd);
        if (saverr) {
            if (!fi) {
                close(truncfd);
            }
            goto out_err;
        }

        if (kill_suidgid) {
            /* Drop CAP_FSETID for the duration of the ftruncate() call */
            res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
            if (res != 0) {
                saverr = res;
                if (!fi) {
                    close(truncfd);
                }
                goto out_err;
            }
        }

        res = ftruncate(truncfd, attr->st_size);
        /* Capture errno before the cleanup calls below can overwrite it */
        saverr = res == -1 ? errno : 0;

        if (cap_fsetid_dropped) {
            if (gain_effective_cap("FSETID")) {
                fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
            }
        }
        if (!fi) {
            close(truncfd);
        }
        if (res == -1) {
            goto out_err;
        }
    }
    if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) {
        struct timespec tv[2];

        /* tv[0] is atime, tv[1] is mtime; both default to "leave unchanged" */
        tv[0].tv_sec = 0;
        tv[1].tv_sec = 0;
        tv[0].tv_nsec = UTIME_OMIT;
        tv[1].tv_nsec = UTIME_OMIT;

        if (valid & FUSE_SET_ATTR_ATIME_NOW) {
            tv[0].tv_nsec = UTIME_NOW;
        } else if (valid & FUSE_SET_ATTR_ATIME) {
            tv[0] = attr->st_atim;
        }

        if (valid & FUSE_SET_ATTR_MTIME_NOW) {
            tv[1].tv_nsec = UTIME_NOW;
        } else if (valid & FUSE_SET_ATTR_MTIME) {
            tv[1] = attr->st_mtim;
        }

        if (fi) {
            res = futimens(fd, tv);
        } else {
            sprintf(procname, "%i", inode->fd);
            res = utimensat(lo->proc_self_fd, procname, tv, 0);
        }
        if (res == -1) {
            saverr = errno;
            goto out_err;
        }
    }
    lo_inode_put(lo, &inode);

    /* Success: reply with the updated attributes */
    return lo_getattr(req, ino, fi);

out_err:
    lo_inode_put(lo, &inode);
    fuse_reply_err(req, saverr);
}
 913
 914static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
 915                                uint64_t mnt_id)
 916{
 917    struct lo_inode *p;
 918    struct lo_key key = {
 919        .ino = st->st_ino,
 920        .dev = st->st_dev,
 921        .mnt_id = mnt_id,
 922    };
 923
 924    pthread_mutex_lock(&lo->mutex);
 925    p = g_hash_table_lookup(lo->inodes, &key);
 926    if (p) {
 927        assert(p->nlookup > 0);
 928        p->nlookup++;
 929        g_atomic_int_inc(&p->refcount);
 930    }
 931    pthread_mutex_unlock(&lo->mutex);
 932
 933    return p;
 934}
 935
 936/* value_destroy_func for posix_locks GHashTable */
 937static void posix_locks_value_destroy(gpointer data)
 938{
 939    struct lo_inode_plock *plock = data;
 940
 941    /*
 942     * We had used open() for locks and had only one fd. So
 943     * closing this fd should release all OFD locks.
 944     */
 945    close(plock->fd);
 946    free(plock);
 947}
 948
 949static int do_statx(struct lo_data *lo, int dirfd, const char *pathname,
 950                    struct stat *statbuf, int flags, uint64_t *mnt_id)
 951{
 952    int res;
 953
 954#if defined(CONFIG_STATX) && defined(STATX_MNT_ID)
 955    if (lo->use_statx) {
 956        struct statx statxbuf;
 957
 958        res = statx(dirfd, pathname, flags, STATX_BASIC_STATS | STATX_MNT_ID,
 959                    &statxbuf);
 960        if (!res) {
 961            memset(statbuf, 0, sizeof(*statbuf));
 962            statbuf->st_dev = makedev(statxbuf.stx_dev_major,
 963                                      statxbuf.stx_dev_minor);
 964            statbuf->st_ino = statxbuf.stx_ino;
 965            statbuf->st_mode = statxbuf.stx_mode;
 966            statbuf->st_nlink = statxbuf.stx_nlink;
 967            statbuf->st_uid = statxbuf.stx_uid;
 968            statbuf->st_gid = statxbuf.stx_gid;
 969            statbuf->st_rdev = makedev(statxbuf.stx_rdev_major,
 970                                       statxbuf.stx_rdev_minor);
 971            statbuf->st_size = statxbuf.stx_size;
 972            statbuf->st_blksize = statxbuf.stx_blksize;
 973            statbuf->st_blocks = statxbuf.stx_blocks;
 974            statbuf->st_atim.tv_sec = statxbuf.stx_atime.tv_sec;
 975            statbuf->st_atim.tv_nsec = statxbuf.stx_atime.tv_nsec;
 976            statbuf->st_mtim.tv_sec = statxbuf.stx_mtime.tv_sec;
 977            statbuf->st_mtim.tv_nsec = statxbuf.stx_mtime.tv_nsec;
 978            statbuf->st_ctim.tv_sec = statxbuf.stx_ctime.tv_sec;
 979            statbuf->st_ctim.tv_nsec = statxbuf.stx_ctime.tv_nsec;
 980
 981            if (statxbuf.stx_mask & STATX_MNT_ID) {
 982                *mnt_id = statxbuf.stx_mnt_id;
 983            } else {
 984                *mnt_id = 0;
 985            }
 986            return 0;
 987        } else if (errno != ENOSYS) {
 988            return -1;
 989        }
 990        lo->use_statx = false;
 991        /* fallback */
 992    }
 993#endif
 994    res = fstatat(dirfd, pathname, statbuf, flags);
 995    if (res == -1) {
 996        return -1;
 997    }
 998    *mnt_id = 0;
 999
1000    return 0;
1001}
1002
1003/*
1004 * Increments nlookup on the inode on success. unref_inode_lolocked() must be
1005 * called eventually to decrement nlookup again. If inodep is non-NULL, the
1006 * inode pointer is stored and the caller must call lo_inode_put().
1007 */
1008static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
1009                        struct fuse_entry_param *e,
1010                        struct lo_inode **inodep)
1011{
1012    int newfd;
1013    int res;
1014    int saverr;
1015    uint64_t mnt_id;
1016    struct lo_data *lo = lo_data(req);
1017    struct lo_inode *inode = NULL;
1018    struct lo_inode *dir = lo_inode(req, parent);
1019
1020    if (inodep) {
1021        *inodep = NULL; /* in case there is an error */
1022    }
1023
1024    /*
1025     * name_to_handle_at() and open_by_handle_at() can reach here with fuse
1026     * mount point in guest, but we don't have its inode info in the
1027     * ino_map.
1028     */
1029    if (!dir) {
1030        return ENOENT;
1031    }
1032
1033    memset(e, 0, sizeof(*e));
1034    e->attr_timeout = lo->timeout;
1035    e->entry_timeout = lo->timeout;
1036
1037    /* Do not allow escaping root directory */
1038    if (dir == &lo->root && strcmp(name, "..") == 0) {
1039        name = ".";
1040    }
1041
1042    newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
1043    if (newfd == -1) {
1044        goto out_err;
1045    }
1046
1047    res = do_statx(lo, newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
1048                   &mnt_id);
1049    if (res == -1) {
1050        goto out_err;
1051    }
1052
1053    if (S_ISDIR(e->attr.st_mode) && lo->announce_submounts &&
1054        (e->attr.st_dev != dir->key.dev || mnt_id != dir->key.mnt_id)) {
1055        e->attr_flags |= FUSE_ATTR_SUBMOUNT;
1056    }
1057
1058    inode = lo_find(lo, &e->attr, mnt_id);
1059    if (inode) {
1060        close(newfd);
1061    } else {
1062        inode = calloc(1, sizeof(struct lo_inode));
1063        if (!inode) {
1064            goto out_err;
1065        }
1066
1067        /* cache only filetype */
1068        inode->filetype = (e->attr.st_mode & S_IFMT);
1069
1070        /*
1071         * One for the caller and one for nlookup (released in
1072         * unref_inode_lolocked())
1073         */
1074        g_atomic_int_set(&inode->refcount, 2);
1075
1076        inode->nlookup = 1;
1077        inode->fd = newfd;
1078        inode->key.ino = e->attr.st_ino;
1079        inode->key.dev = e->attr.st_dev;
1080        inode->key.mnt_id = mnt_id;
1081        if (lo->posix_lock) {
1082            pthread_mutex_init(&inode->plock_mutex, NULL);
1083            inode->posix_locks = g_hash_table_new_full(
1084                g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
1085        }
1086        pthread_mutex_lock(&lo->mutex);
1087        inode->fuse_ino = lo_add_inode_mapping(req, inode);
1088        g_hash_table_insert(lo->inodes, &inode->key, inode);
1089        pthread_mutex_unlock(&lo->mutex);
1090    }
1091    e->ino = inode->fuse_ino;
1092
1093    /* Transfer ownership of inode pointer to caller or drop it */
1094    if (inodep) {
1095        *inodep = inode;
1096    } else {
1097        lo_inode_put(lo, &inode);
1098    }
1099
1100    lo_inode_put(lo, &dir);
1101
1102    fuse_log(FUSE_LOG_DEBUG, "  %lli/%s -> %lli\n", (unsigned long long)parent,
1103             name, (unsigned long long)e->ino);
1104
1105    return 0;
1106
1107out_err:
1108    saverr = errno;
1109    if (newfd != -1) {
1110        close(newfd);
1111    }
1112    lo_inode_put(lo, &inode);
1113    lo_inode_put(lo, &dir);
1114    return saverr;
1115}
1116
1117static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
1118{
1119    struct fuse_entry_param e;
1120    int err;
1121
1122    fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent,
1123             name);
1124
1125    if (is_empty(name)) {
1126        fuse_reply_err(req, ENOENT);
1127        return;
1128    }
1129
1130    /*
1131     * Don't use is_safe_path_component(), allow "." and ".." for NFS export
1132     * support.
1133     */
1134    if (strchr(name, '/')) {
1135        fuse_reply_err(req, EINVAL);
1136        return;
1137    }
1138
1139    err = lo_do_lookup(req, parent, name, &e, NULL);
1140    if (err) {
1141        fuse_reply_err(req, err);
1142    } else {
1143        fuse_reply_entry(req, &e);
1144    }
1145}
1146
/*
 * On some architectures the plain setres*id syscalls are limited to 2^16
 * IDs, and separate setres*id32 variants allow 2^32. Other architectures
 * accept 2^32 through the plain syscalls. Pick the 32-bit variant whenever
 * the syscall number for it is defined.
 */
#if defined(SYS_setresgid32)
#define OURSYS_setresgid SYS_setresgid32
#else
#define OURSYS_setresgid SYS_setresgid
#endif

#if defined(SYS_setresuid32)
#define OURSYS_setresuid SYS_setresuid32
#else
#define OURSYS_setresuid SYS_setresuid
#endif
1163
1164/*
1165 * Change to uid/gid of caller so that file is created with
1166 * ownership of caller.
1167 * TODO: What about selinux context?
1168 */
1169static int lo_change_cred(fuse_req_t req, struct lo_cred *old,
1170                          bool change_umask)
1171{
1172    int res;
1173
1174    old->euid = geteuid();
1175    old->egid = getegid();
1176
1177    res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1);
1178    if (res == -1) {
1179        return errno;
1180    }
1181
1182    res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1);
1183    if (res == -1) {
1184        int errno_save = errno;
1185
1186        syscall(OURSYS_setresgid, -1, old->egid, -1);
1187        return errno_save;
1188    }
1189
1190    if (change_umask) {
1191        old->umask = umask(req->ctx.umask);
1192    }
1193    return 0;
1194}
1195
1196/* Regain Privileges */
1197static void lo_restore_cred(struct lo_cred *old, bool restore_umask)
1198{
1199    int res;
1200
1201    res = syscall(OURSYS_setresuid, -1, old->euid, -1);
1202    if (res == -1) {
1203        fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid);
1204        exit(1);
1205    }
1206
1207    res = syscall(OURSYS_setresgid, -1, old->egid, -1);
1208    if (res == -1) {
1209        fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid);
1210        exit(1);
1211    }
1212
1213    if (restore_umask)
1214        umask(old->umask);
1215}
1216
1217/*
1218 * A helper to change cred and drop capability. Returns 0 on success and
1219 * errno on error
1220 */
1221static int lo_drop_cap_change_cred(fuse_req_t req, struct lo_cred *old,
1222                                   bool change_umask, const char *cap_name,
1223                                   bool *cap_dropped)
1224{
1225    int ret;
1226    bool __cap_dropped;
1227
1228    assert(cap_name);
1229
1230    ret = drop_effective_cap(cap_name, &__cap_dropped);
1231    if (ret) {
1232        return ret;
1233    }
1234
1235    ret = lo_change_cred(req, old, change_umask);
1236    if (ret) {
1237        if (__cap_dropped) {
1238            if (gain_effective_cap(cap_name)) {
1239                fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name);
1240            }
1241        }
1242    }
1243
1244    if (cap_dropped) {
1245        *cap_dropped = __cap_dropped;
1246    }
1247    return ret;
1248}
1249
1250static void lo_restore_cred_gain_cap(struct lo_cred *old, bool restore_umask,
1251                                     const char *cap_name)
1252{
1253    assert(cap_name);
1254
1255    lo_restore_cred(old, restore_umask);
1256
1257    if (gain_effective_cap(cap_name)) {
1258        fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name);
1259    }
1260}
1261
1262static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
1263                             const char *name, mode_t mode, dev_t rdev,
1264                             const char *link)
1265{
1266    int res;
1267    int saverr;
1268    struct lo_data *lo = lo_data(req);
1269    struct lo_inode *dir;
1270    struct fuse_entry_param e;
1271    struct lo_cred old = {};
1272
1273    if (is_empty(name)) {
1274        fuse_reply_err(req, ENOENT);
1275        return;
1276    }
1277
1278    if (!is_safe_path_component(name)) {
1279        fuse_reply_err(req, EINVAL);
1280        return;
1281    }
1282
1283    dir = lo_inode(req, parent);
1284    if (!dir) {
1285        fuse_reply_err(req, EBADF);
1286        return;
1287    }
1288
1289    saverr = lo_change_cred(req, &old, lo->change_umask && !S_ISLNK(mode));
1290    if (saverr) {
1291        goto out;
1292    }
1293
1294    res = mknod_wrapper(dir->fd, name, link, mode, rdev);
1295
1296    saverr = errno;
1297
1298    lo_restore_cred(&old, lo->change_umask && !S_ISLNK(mode));
1299
1300    if (res == -1) {
1301        goto out;
1302    }
1303
1304    saverr = lo_do_lookup(req, parent, name, &e, NULL);
1305    if (saverr) {
1306        goto out;
1307    }
1308
1309    fuse_log(FUSE_LOG_DEBUG, "  %lli/%s -> %lli\n", (unsigned long long)parent,
1310             name, (unsigned long long)e.ino);
1311
1312    fuse_reply_entry(req, &e);
1313    lo_inode_put(lo, &dir);
1314    return;
1315
1316out:
1317    lo_inode_put(lo, &dir);
1318    fuse_reply_err(req, saverr);
1319}
1320
1321static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
1322                     mode_t mode, dev_t rdev)
1323{
1324    lo_mknod_symlink(req, parent, name, mode, rdev, NULL);
1325}
1326
1327static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
1328                     mode_t mode)
1329{
1330    lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL);
1331}
1332
1333static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent,
1334                       const char *name)
1335{
1336    lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link);
1337}
1338
1339static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent,
1340                    const char *name)
1341{
1342    int res;
1343    struct lo_data *lo = lo_data(req);
1344    struct lo_inode *parent_inode;
1345    struct lo_inode *inode;
1346    struct fuse_entry_param e;
1347    char procname[64];
1348    int saverr;
1349
1350    if (is_empty(name)) {
1351        fuse_reply_err(req, ENOENT);
1352        return;
1353    }
1354
1355    if (!is_safe_path_component(name)) {
1356        fuse_reply_err(req, EINVAL);
1357        return;
1358    }
1359
1360    parent_inode = lo_inode(req, parent);
1361    inode = lo_inode(req, ino);
1362    if (!parent_inode || !inode) {
1363        errno = EBADF;
1364        goto out_err;
1365    }
1366
1367    memset(&e, 0, sizeof(struct fuse_entry_param));
1368    e.attr_timeout = lo->timeout;
1369    e.entry_timeout = lo->timeout;
1370
1371    sprintf(procname, "%i", inode->fd);
1372    res = linkat(lo->proc_self_fd, procname, parent_inode->fd, name,
1373                 AT_SYMLINK_FOLLOW);
1374    if (res == -1) {
1375        goto out_err;
1376    }
1377
1378    res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1379    if (res == -1) {
1380        goto out_err;
1381    }
1382
1383    pthread_mutex_lock(&lo->mutex);
1384    inode->nlookup++;
1385    pthread_mutex_unlock(&lo->mutex);
1386    e.ino = inode->fuse_ino;
1387
1388    fuse_log(FUSE_LOG_DEBUG, "  %lli/%s -> %lli\n", (unsigned long long)parent,
1389             name, (unsigned long long)e.ino);
1390
1391    fuse_reply_entry(req, &e);
1392    lo_inode_put(lo, &parent_inode);
1393    lo_inode_put(lo, &inode);
1394    return;
1395
1396out_err:
1397    saverr = errno;
1398    lo_inode_put(lo, &parent_inode);
1399    lo_inode_put(lo, &inode);
1400    fuse_reply_err(req, saverr);
1401}
1402
1403/* Increments nlookup and caller must release refcount using lo_inode_put() */
1404static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
1405                                    const char *name)
1406{
1407    int res;
1408    uint64_t mnt_id;
1409    struct stat attr;
1410    struct lo_data *lo = lo_data(req);
1411    struct lo_inode *dir = lo_inode(req, parent);
1412
1413    if (!dir) {
1414        return NULL;
1415    }
1416
1417    res = do_statx(lo, dir->fd, name, &attr, AT_SYMLINK_NOFOLLOW, &mnt_id);
1418    lo_inode_put(lo, &dir);
1419    if (res == -1) {
1420        return NULL;
1421    }
1422
1423    return lo_find(lo, &attr, mnt_id);
1424}
1425
1426static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
1427{
1428    int res;
1429    struct lo_inode *inode;
1430    struct lo_data *lo = lo_data(req);
1431
1432    if (is_empty(name)) {
1433        fuse_reply_err(req, ENOENT);
1434        return;
1435    }
1436
1437    if (!is_safe_path_component(name)) {
1438        fuse_reply_err(req, EINVAL);
1439        return;
1440    }
1441
1442    inode = lookup_name(req, parent, name);
1443    if (!inode) {
1444        fuse_reply_err(req, EIO);
1445        return;
1446    }
1447
1448    res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR);
1449
1450    fuse_reply_err(req, res == -1 ? errno : 0);
1451    unref_inode_lolocked(lo, inode, 1);
1452    lo_inode_put(lo, &inode);
1453}
1454
1455static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
1456                      fuse_ino_t newparent, const char *newname,
1457                      unsigned int flags)
1458{
1459    int res;
1460    struct lo_inode *parent_inode;
1461    struct lo_inode *newparent_inode;
1462    struct lo_inode *oldinode = NULL;
1463    struct lo_inode *newinode = NULL;
1464    struct lo_data *lo = lo_data(req);
1465
1466    if (is_empty(name) || is_empty(newname)) {
1467        fuse_reply_err(req, ENOENT);
1468        return;
1469    }
1470
1471    if (!is_safe_path_component(name) || !is_safe_path_component(newname)) {
1472        fuse_reply_err(req, EINVAL);
1473        return;
1474    }
1475
1476    parent_inode = lo_inode(req, parent);
1477    newparent_inode = lo_inode(req, newparent);
1478    if (!parent_inode || !newparent_inode) {
1479        fuse_reply_err(req, EBADF);
1480        goto out;
1481    }
1482
1483    oldinode = lookup_name(req, parent, name);
1484    newinode = lookup_name(req, newparent, newname);
1485
1486    if (!oldinode) {
1487        fuse_reply_err(req, EIO);
1488        goto out;
1489    }
1490
1491    if (flags) {
1492#ifndef SYS_renameat2
1493        fuse_reply_err(req, EINVAL);
1494#else
1495        res = syscall(SYS_renameat2, parent_inode->fd, name,
1496                        newparent_inode->fd, newname, flags);
1497        if (res == -1 && errno == ENOSYS) {
1498            fuse_reply_err(req, EINVAL);
1499        } else {
1500            fuse_reply_err(req, res == -1 ? errno : 0);
1501        }
1502#endif
1503        goto out;
1504    }
1505
1506    res = renameat(parent_inode->fd, name, newparent_inode->fd, newname);
1507
1508    fuse_reply_err(req, res == -1 ? errno : 0);
1509out:
1510    unref_inode_lolocked(lo, oldinode, 1);
1511    unref_inode_lolocked(lo, newinode, 1);
1512    lo_inode_put(lo, &oldinode);
1513    lo_inode_put(lo, &newinode);
1514    lo_inode_put(lo, &parent_inode);
1515    lo_inode_put(lo, &newparent_inode);
1516}
1517
1518static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
1519{
1520    int res;
1521    struct lo_inode *inode;
1522    struct lo_data *lo = lo_data(req);
1523
1524    if (is_empty(name)) {
1525        fuse_reply_err(req, ENOENT);
1526        return;
1527    }
1528
1529    if (!is_safe_path_component(name)) {
1530        fuse_reply_err(req, EINVAL);
1531        return;
1532    }
1533
1534    inode = lookup_name(req, parent, name);
1535    if (!inode) {
1536        fuse_reply_err(req, EIO);
1537        return;
1538    }
1539
1540    res = unlinkat(lo_fd(req, parent), name, 0);
1541
1542    fuse_reply_err(req, res == -1 ? errno : 0);
1543    unref_inode_lolocked(lo, inode, 1);
1544    lo_inode_put(lo, &inode);
1545}
1546
1547/* To be called with lo->mutex held */
1548static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
1549{
1550    if (!inode) {
1551        return;
1552    }
1553
1554    assert(inode->nlookup >= n);
1555    inode->nlookup -= n;
1556    if (!inode->nlookup) {
1557        lo_map_remove(&lo->ino_map, inode->fuse_ino);
1558        g_hash_table_remove(lo->inodes, &inode->key);
1559        if (lo->posix_lock) {
1560            if (g_hash_table_size(inode->posix_locks)) {
1561                fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
1562            }
1563            g_hash_table_destroy(inode->posix_locks);
1564            pthread_mutex_destroy(&inode->plock_mutex);
1565        }
1566        /* Drop our refcount from lo_do_lookup() */
1567        lo_inode_put(lo, &inode);
1568    }
1569}
1570
1571static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
1572                                 uint64_t n)
1573{
1574    if (!inode) {
1575        return;
1576    }
1577
1578    pthread_mutex_lock(&lo->mutex);
1579    unref_inode(lo, inode, n);
1580    pthread_mutex_unlock(&lo->mutex);
1581}
1582
1583static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1584{
1585    struct lo_data *lo = lo_data(req);
1586    struct lo_inode *inode;
1587
1588    inode = lo_inode(req, ino);
1589    if (!inode) {
1590        return;
1591    }
1592
1593    fuse_log(FUSE_LOG_DEBUG, "  forget %lli %lli -%lli\n",
1594             (unsigned long long)ino, (unsigned long long)inode->nlookup,
1595             (unsigned long long)nlookup);
1596
1597    unref_inode_lolocked(lo, inode, nlookup);
1598    lo_inode_put(lo, &inode);
1599}
1600
1601static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1602{
1603    lo_forget_one(req, ino, nlookup);
1604    fuse_reply_none(req);
1605}
1606
1607static void lo_forget_multi(fuse_req_t req, size_t count,
1608                            struct fuse_forget_data *forgets)
1609{
1610    int i;
1611
1612    for (i = 0; i < count; i++) {
1613        lo_forget_one(req, forgets[i].ino, forgets[i].nlookup);
1614    }
1615    fuse_reply_none(req);
1616}
1617
1618static void lo_readlink(fuse_req_t req, fuse_ino_t ino)
1619{
1620    char buf[PATH_MAX + 1];
1621    int res;
1622
1623    res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf));
1624    if (res == -1) {
1625        return (void)fuse_reply_err(req, errno);
1626    }
1627
1628    if (res == sizeof(buf)) {
1629        return (void)fuse_reply_err(req, ENAMETOOLONG);
1630    }
1631
1632    buf[res] = '\0';
1633
1634    fuse_reply_readlink(req, buf);
1635}
1636
1637struct lo_dirp {
1638    gint refcount;
1639    DIR *dp;
1640    struct dirent *entry;
1641    off_t offset;
1642};
1643
1644static void lo_dirp_put(struct lo_dirp **dp)
1645{
1646    struct lo_dirp *d = *dp;
1647
1648    if (!d) {
1649        return;
1650    }
1651    *dp = NULL;
1652
1653    if (g_atomic_int_dec_and_test(&d->refcount)) {
1654        closedir(d->dp);
1655        free(d);
1656    }
1657}
1658
1659/* Call lo_dirp_put() on the return value when no longer needed */
1660static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi)
1661{
1662    struct lo_data *lo = lo_data(req);
1663    struct lo_map_elem *elem;
1664
1665    pthread_mutex_lock(&lo->mutex);
1666    elem = lo_map_get(&lo->dirp_map, fi->fh);
1667    if (elem) {
1668        g_atomic_int_inc(&elem->dirp->refcount);
1669    }
1670    pthread_mutex_unlock(&lo->mutex);
1671    if (!elem) {
1672        return NULL;
1673    }
1674
1675    return elem->dirp;
1676}
1677
1678static void lo_opendir(fuse_req_t req, fuse_ino_t ino,
1679                       struct fuse_file_info *fi)
1680{
1681    int error = ENOMEM;
1682    struct lo_data *lo = lo_data(req);
1683    struct lo_dirp *d;
1684    int fd;
1685    ssize_t fh;
1686
1687    d = calloc(1, sizeof(struct lo_dirp));
1688    if (d == NULL) {
1689        goto out_err;
1690    }
1691
1692    fd = openat(lo_fd(req, ino), ".", O_RDONLY);
1693    if (fd == -1) {
1694        goto out_errno;
1695    }
1696
1697    d->dp = fdopendir(fd);
1698    if (d->dp == NULL) {
1699        goto out_errno;
1700    }
1701
1702    d->offset = 0;
1703    d->entry = NULL;
1704
1705    g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */
1706    pthread_mutex_lock(&lo->mutex);
1707    fh = lo_add_dirp_mapping(req, d);
1708    pthread_mutex_unlock(&lo->mutex);
1709    if (fh == -1) {
1710        goto out_err;
1711    }
1712
1713    fi->fh = fh;
1714    if (lo->cache == CACHE_ALWAYS) {
1715        fi->cache_readdir = 1;
1716    }
1717    fuse_reply_open(req, fi);
1718    return;
1719
1720out_errno:
1721    error = errno;
1722out_err:
1723    if (d) {
1724        if (d->dp) {
1725            closedir(d->dp);
1726        } else if (fd != -1) {
1727            close(fd);
1728        }
1729        free(d);
1730    }
1731    fuse_reply_err(req, error);
1732}
1733
1734static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
1735                          off_t offset, struct fuse_file_info *fi, int plus)
1736{
1737    struct lo_data *lo = lo_data(req);
1738    struct lo_dirp *d = NULL;
1739    struct lo_inode *dinode;
1740    g_autofree char *buf = NULL;
1741    char *p;
1742    size_t rem = size;
1743    int err = EBADF;
1744
1745    dinode = lo_inode(req, ino);
1746    if (!dinode) {
1747        goto error;
1748    }
1749
1750    d = lo_dirp(req, fi);
1751    if (!d) {
1752        goto error;
1753    }
1754
1755    err = ENOMEM;
1756    buf = g_try_malloc0(size);
1757    if (!buf) {
1758        goto error;
1759    }
1760    p = buf;
1761
1762    if (offset != d->offset) {
1763        seekdir(d->dp, offset);
1764        d->entry = NULL;
1765        d->offset = offset;
1766    }
1767    while (1) {
1768        size_t entsize;
1769        off_t nextoff;
1770        const char *name;
1771
1772        if (!d->entry) {
1773            errno = 0;
1774            d->entry = readdir(d->dp);
1775            if (!d->entry) {
1776                if (errno) { /* Error */
1777                    err = errno;
1778                    goto error;
1779                } else { /* End of stream */
1780                    break;
1781                }
1782            }
1783        }
1784        nextoff = d->entry->d_off;
1785        name = d->entry->d_name;
1786
1787        fuse_ino_t entry_ino = 0;
1788        struct fuse_entry_param e = (struct fuse_entry_param){
1789            .attr.st_ino = d->entry->d_ino,
1790            .attr.st_mode = d->entry->d_type << 12,
1791        };
1792
1793        /* Hide root's parent directory */
1794        if (dinode == &lo->root && strcmp(name, "..") == 0) {
1795            e.attr.st_ino = lo->root.key.ino;
1796            e.attr.st_mode = DT_DIR << 12;
1797        }
1798
1799        if (plus) {
1800            if (!is_dot_or_dotdot(name)) {
1801                err = lo_do_lookup(req, ino, name, &e, NULL);
1802                if (err) {
1803                    goto error;
1804                }
1805                entry_ino = e.ino;
1806            }
1807
1808            entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff);
1809        } else {
1810            entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff);
1811        }
1812        if (entsize > rem) {
1813            if (entry_ino != 0) {
1814                lo_forget_one(req, entry_ino, 1);
1815            }
1816            break;
1817        }
1818
1819        p += entsize;
1820        rem -= entsize;
1821
1822        d->entry = NULL;
1823        d->offset = nextoff;
1824    }
1825
1826    err = 0;
1827error:
1828    lo_dirp_put(&d);
1829    lo_inode_put(lo, &dinode);
1830
1831    /*
1832     * If there's an error, we can only signal it if we haven't stored
1833     * any entries yet - otherwise we'd end up with wrong lookup
1834     * counts for the entries that are already in the buffer. So we
1835     * return what we've collected until that point.
1836     */
1837    if (err && rem == size) {
1838        fuse_reply_err(req, err);
1839    } else {
1840        fuse_reply_buf(req, buf, size - rem);
1841    }
1842}
1843
1844static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
1845                       off_t offset, struct fuse_file_info *fi)
1846{
1847    lo_do_readdir(req, ino, size, offset, fi, 0);
1848}
1849
1850static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size,
1851                           off_t offset, struct fuse_file_info *fi)
1852{
1853    lo_do_readdir(req, ino, size, offset, fi, 1);
1854}
1855
1856static void lo_releasedir(fuse_req_t req, fuse_ino_t ino,
1857                          struct fuse_file_info *fi)
1858{
1859    struct lo_data *lo = lo_data(req);
1860    struct lo_map_elem *elem;
1861    struct lo_dirp *d;
1862
1863    (void)ino;
1864
1865    pthread_mutex_lock(&lo->mutex);
1866    elem = lo_map_get(&lo->dirp_map, fi->fh);
1867    if (!elem) {
1868        pthread_mutex_unlock(&lo->mutex);
1869        fuse_reply_err(req, EBADF);
1870        return;
1871    }
1872
1873    d = elem->dirp;
1874    lo_map_remove(&lo->dirp_map, fi->fh);
1875    pthread_mutex_unlock(&lo->mutex);
1876
1877    lo_dirp_put(&d); /* paired with lo_opendir() */
1878
1879    fuse_reply_err(req, 0);
1880}
1881
1882static void update_open_flags(int writeback, int allow_direct_io,
1883                              struct fuse_file_info *fi)
1884{
1885    /*
1886     * With writeback cache, kernel may send read requests even
1887     * when userspace opened write-only
1888     */
1889    if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) {
1890        fi->flags &= ~O_ACCMODE;
1891        fi->flags |= O_RDWR;
1892    }
1893
1894    /*
1895     * With writeback cache, O_APPEND is handled by the kernel.
1896     * This breaks atomicity (since the file may change in the
1897     * underlying filesystem, so that the kernel's idea of the
1898     * end of the file isn't accurate anymore). In this example,
1899     * we just accept that. A more rigorous filesystem may want
1900     * to return an error here
1901     */
1902    if (writeback && (fi->flags & O_APPEND)) {
1903        fi->flags &= ~O_APPEND;
1904    }
1905
1906    /*
1907     * O_DIRECT in guest should not necessarily mean bypassing page
1908     * cache on host as well. Therefore, we discard it by default
1909     * ('-o no_allow_direct_io'). If somebody needs that behavior,
1910     * the '-o allow_direct_io' option should be set.
1911     */
1912    if (!allow_direct_io) {
1913        fi->flags &= ~O_DIRECT;
1914    }
1915}
1916
1917/*
1918 * Open a regular file, set up an fd mapping, and fill out the struct
1919 * fuse_file_info for it. If existing_fd is not negative, use that fd instead
1920 * opening a new one. Takes ownership of existing_fd.
1921 *
1922 * Returns 0 on success or a positive errno.
1923 */
1924static int lo_do_open(struct lo_data *lo, struct lo_inode *inode,
1925                      int existing_fd, struct fuse_file_info *fi)
1926{
1927    ssize_t fh;
1928    int fd = existing_fd;
1929    int err;
1930    bool cap_fsetid_dropped = false;
1931    bool kill_suidgid = lo->killpriv_v2 && fi->kill_priv;
1932
1933    update_open_flags(lo->writeback, lo->allow_direct_io, fi);
1934
1935    if (fd < 0) {
1936        if (kill_suidgid) {
1937            err = drop_effective_cap("FSETID", &cap_fsetid_dropped);
1938            if (err) {
1939                return err;
1940            }
1941        }
1942
1943        fd = lo_inode_open(lo, inode, fi->flags);
1944
1945        if (cap_fsetid_dropped) {
1946            if (gain_effective_cap("FSETID")) {
1947                fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
1948            }
1949        }
1950        if (fd < 0) {
1951            return -fd;
1952        }
1953        if (fi->flags & (O_TRUNC)) {
1954            int err = drop_security_capability(lo, fd);
1955            if (err) {
1956                close(fd);
1957                return err;
1958            }
1959        }
1960    }
1961
1962    pthread_mutex_lock(&lo->mutex);
1963    fh = lo_add_fd_mapping(lo, fd);
1964    pthread_mutex_unlock(&lo->mutex);
1965    if (fh == -1) {
1966        close(fd);
1967        return ENOMEM;
1968    }
1969
1970    fi->fh = fh;
1971    if (lo->cache == CACHE_NONE) {
1972        fi->direct_io = 1;
1973    } else if (lo->cache == CACHE_ALWAYS) {
1974        fi->keep_cache = 1;
1975    }
1976    return 0;
1977}
1978
1979static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
1980                      mode_t mode, struct fuse_file_info *fi)
1981{
1982    int fd = -1;
1983    struct lo_data *lo = lo_data(req);
1984    struct lo_inode *parent_inode;
1985    struct lo_inode *inode = NULL;
1986    struct fuse_entry_param e;
1987    int err;
1988    struct lo_cred old = {};
1989
1990    fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)"
1991             " kill_priv=%d\n", parent, name, fi->kill_priv);
1992
1993    if (!is_safe_path_component(name)) {
1994        fuse_reply_err(req, EINVAL);
1995        return;
1996    }
1997
1998    parent_inode = lo_inode(req, parent);
1999    if (!parent_inode) {
2000        fuse_reply_err(req, EBADF);
2001        return;
2002    }
2003
2004    err = lo_change_cred(req, &old, lo->change_umask);
2005    if (err) {
2006        goto out;
2007    }
2008
2009    update_open_flags(lo->writeback, lo->allow_direct_io, fi);
2010
2011    /* Try to create a new file but don't open existing files */
2012    fd = openat(parent_inode->fd, name, fi->flags | O_CREAT | O_EXCL, mode);
2013    err = fd == -1 ? errno : 0;
2014
2015    lo_restore_cred(&old, lo->change_umask);
2016
2017    /* Ignore the error if file exists and O_EXCL was not given */
2018    if (err && (err != EEXIST || (fi->flags & O_EXCL))) {
2019        goto out;
2020    }
2021
2022    err = lo_do_lookup(req, parent, name, &e, &inode);
2023    if (err) {
2024        goto out;
2025    }
2026
2027    err = lo_do_open(lo, inode, fd, fi);
2028    fd = -1; /* lo_do_open() takes ownership of fd */
2029    if (err) {
2030        /* Undo lo_do_lookup() nlookup ref */
2031        unref_inode_lolocked(lo, inode, 1);
2032    }
2033
2034out:
2035    lo_inode_put(lo, &inode);
2036    lo_inode_put(lo, &parent_inode);
2037
2038    if (err) {
2039        if (fd >= 0) {
2040            close(fd);
2041        }
2042
2043        fuse_reply_err(req, err);
2044    } else {
2045        fuse_reply_create(req, &e, fi);
2046    }
2047}
2048
2049/* Should be called with inode->plock_mutex held */
2050static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
2051                                                      struct lo_inode *inode,
2052                                                      uint64_t lock_owner,
2053                                                      pid_t pid, int *err)
2054{
2055    struct lo_inode_plock *plock;
2056    int fd;
2057
2058    plock =
2059        g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner));
2060
2061    if (plock) {
2062        return plock;
2063    }
2064
2065    plock = malloc(sizeof(struct lo_inode_plock));
2066    if (!plock) {
2067        *err = ENOMEM;
2068        return NULL;
2069    }
2070
2071    /* Open another instance of file which can be used for ofd locks. */
2072    /* TODO: What if file is not writable? */
2073    fd = lo_inode_open(lo, inode, O_RDWR);
2074    if (fd < 0) {
2075        *err = -fd;
2076        free(plock);
2077        return NULL;
2078    }
2079
2080    plock->lock_owner = lock_owner;
2081    plock->fd = fd;
2082    g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner),
2083                        plock);
2084    return plock;
2085}
2086
2087static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
2088                     struct flock *lock)
2089{
2090    struct lo_data *lo = lo_data(req);
2091    struct lo_inode *inode;
2092    struct lo_inode_plock *plock;
2093    int ret, saverr = 0;
2094
2095    fuse_log(FUSE_LOG_DEBUG,
2096             "lo_getlk(ino=%" PRIu64 ", flags=%d)"
2097             " owner=0x%" PRIx64 ", l_type=%d l_start=0x%" PRIx64
2098             " l_len=0x%" PRIx64 "\n",
2099             ino, fi->flags, fi->lock_owner, lock->l_type,
2100             (uint64_t)lock->l_start, (uint64_t)lock->l_len);
2101
2102    if (!lo->posix_lock) {
2103        fuse_reply_err(req, ENOSYS);
2104        return;
2105    }
2106
2107    inode = lo_inode(req, ino);
2108    if (!inode) {
2109        fuse_reply_err(req, EBADF);
2110        return;
2111    }
2112
2113    pthread_mutex_lock(&inode->plock_mutex);
2114    plock =
2115        lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
2116    if (!plock) {
2117        saverr = ret;
2118        goto out;
2119    }
2120
2121    ret = fcntl(plock->fd, F_OFD_GETLK, lock);
2122    if (ret == -1) {
2123        saverr = errno;
2124    }
2125
2126out:
2127    pthread_mutex_unlock(&inode->plock_mutex);
2128    lo_inode_put(lo, &inode);
2129
2130    if (saverr) {
2131        fuse_reply_err(req, saverr);
2132    } else {
2133        fuse_reply_lock(req, lock);
2134    }
2135}
2136
2137static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
2138                     struct flock *lock, int sleep)
2139{
2140    struct lo_data *lo = lo_data(req);
2141    struct lo_inode *inode;
2142    struct lo_inode_plock *plock;
2143    int ret, saverr = 0;
2144
2145    fuse_log(FUSE_LOG_DEBUG,
2146             "lo_setlk(ino=%" PRIu64 ", flags=%d)"
2147             " cmd=%d pid=%d owner=0x%" PRIx64 " sleep=%d l_whence=%d"
2148             " l_start=0x%" PRIx64 " l_len=0x%" PRIx64 "\n",
2149             ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep,
2150             lock->l_whence, (uint64_t)lock->l_start, (uint64_t)lock->l_len);
2151
2152    if (!lo->posix_lock) {
2153        fuse_reply_err(req, ENOSYS);
2154        return;
2155    }
2156
2157    if (sleep) {
2158        fuse_reply_err(req, EOPNOTSUPP);
2159        return;
2160    }
2161
2162    inode = lo_inode(req, ino);
2163    if (!inode) {
2164        fuse_reply_err(req, EBADF);
2165        return;
2166    }
2167
2168    pthread_mutex_lock(&inode->plock_mutex);
2169    plock =
2170        lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
2171
2172    if (!plock) {
2173        saverr = ret;
2174        goto out;
2175    }
2176
2177    /* TODO: Is it alright to modify flock? */
2178    lock->l_pid = 0;
2179    ret = fcntl(plock->fd, F_OFD_SETLK, lock);
2180    if (ret == -1) {
2181        saverr = errno;
2182    }
2183
2184out:
2185    pthread_mutex_unlock(&inode->plock_mutex);
2186    lo_inode_put(lo, &inode);
2187
2188    fuse_reply_err(req, saverr);
2189}
2190
2191static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
2192                        struct fuse_file_info *fi)
2193{
2194    int res;
2195    struct lo_dirp *d;
2196    int fd;
2197
2198    (void)ino;
2199
2200    d = lo_dirp(req, fi);
2201    if (!d) {
2202        fuse_reply_err(req, EBADF);
2203        return;
2204    }
2205
2206    fd = dirfd(d->dp);
2207    if (datasync) {
2208        res = fdatasync(fd);
2209    } else {
2210        res = fsync(fd);
2211    }
2212
2213    lo_dirp_put(&d);
2214
2215    fuse_reply_err(req, res == -1 ? errno : 0);
2216}
2217
2218static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
2219{
2220    struct lo_data *lo = lo_data(req);
2221    struct lo_inode *inode = lo_inode(req, ino);
2222    int err;
2223
2224    fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d, kill_priv=%d)"
2225             "\n", ino, fi->flags, fi->kill_priv);
2226
2227    if (!inode) {
2228        fuse_reply_err(req, EBADF);
2229        return;
2230    }
2231
2232    err = lo_do_open(lo, inode, -1, fi);
2233    lo_inode_put(lo, &inode);
2234    if (err) {
2235        fuse_reply_err(req, err);
2236    } else {
2237        fuse_reply_open(req, fi);
2238    }
2239}
2240
2241static void lo_release(fuse_req_t req, fuse_ino_t ino,
2242                       struct fuse_file_info *fi)
2243{
2244    struct lo_data *lo = lo_data(req);
2245    struct lo_map_elem *elem;
2246    int fd = -1;
2247
2248    (void)ino;
2249
2250    pthread_mutex_lock(&lo->mutex);
2251    elem = lo_map_get(&lo->fd_map, fi->fh);
2252    if (elem) {
2253        fd = elem->fd;
2254        elem = NULL;
2255        lo_map_remove(&lo->fd_map, fi->fh);
2256    }
2257    pthread_mutex_unlock(&lo->mutex);
2258
2259    close(fd);
2260    fuse_reply_err(req, 0);
2261}
2262
2263static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
2264{
2265    int res;
2266    (void)ino;
2267    struct lo_inode *inode;
2268    struct lo_data *lo = lo_data(req);
2269
2270    inode = lo_inode(req, ino);
2271    if (!inode) {
2272        fuse_reply_err(req, EBADF);
2273        return;
2274    }
2275
2276    if (!S_ISREG(inode->filetype)) {
2277        lo_inode_put(lo, &inode);
2278        fuse_reply_err(req, EBADF);
2279        return;
2280    }
2281
2282    /* An fd is going away. Cleanup associated posix locks */
2283    if (lo->posix_lock) {
2284        pthread_mutex_lock(&inode->plock_mutex);
2285        g_hash_table_remove(inode->posix_locks,
2286            GUINT_TO_POINTER(fi->lock_owner));
2287        pthread_mutex_unlock(&inode->plock_mutex);
2288    }
2289    res = close(dup(lo_fi_fd(req, fi)));
2290    lo_inode_put(lo, &inode);
2291    fuse_reply_err(req, res == -1 ? errno : 0);
2292}
2293
2294static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
2295                     struct fuse_file_info *fi)
2296{
2297    struct lo_inode *inode = lo_inode(req, ino);
2298    struct lo_data *lo = lo_data(req);
2299    int res;
2300    int fd;
2301
2302    fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino,
2303             (void *)fi);
2304
2305    if (!inode) {
2306        fuse_reply_err(req, EBADF);
2307        return;
2308    }
2309
2310    if (!fi) {
2311        fd = lo_inode_open(lo, inode, O_RDWR);
2312        if (fd < 0) {
2313            res = -fd;
2314            goto out;
2315        }
2316    } else {
2317        fd = lo_fi_fd(req, fi);
2318    }
2319
2320    if (datasync) {
2321        res = fdatasync(fd) == -1 ? errno : 0;
2322    } else {
2323        res = fsync(fd) == -1 ? errno : 0;
2324    }
2325    if (!fi) {
2326        close(fd);
2327    }
2328out:
2329    lo_inode_put(lo, &inode);
2330    fuse_reply_err(req, res);
2331}
2332
2333static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset,
2334                    struct fuse_file_info *fi)
2335{
2336    struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size);
2337
2338    fuse_log(FUSE_LOG_DEBUG,
2339             "lo_read(ino=%" PRIu64 ", size=%zd, "
2340             "off=%lu)\n",
2341             ino, size, (unsigned long)offset);
2342
2343    buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
2344    buf.buf[0].fd = lo_fi_fd(req, fi);
2345    buf.buf[0].pos = offset;
2346
2347    fuse_reply_data(req, &buf);
2348}
2349
2350static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
2351                         struct fuse_bufvec *in_buf, off_t off,
2352                         struct fuse_file_info *fi)
2353{
2354    (void)ino;
2355    ssize_t res;
2356    struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf));
2357    bool cap_fsetid_dropped = false;
2358
2359    out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
2360    out_buf.buf[0].fd = lo_fi_fd(req, fi);
2361    out_buf.buf[0].pos = off;
2362
2363    fuse_log(FUSE_LOG_DEBUG,
2364             "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu kill_priv=%d)\n",
2365             ino, out_buf.buf[0].size, (unsigned long)off, fi->kill_priv);
2366
2367    res = drop_security_capability(lo_data(req), out_buf.buf[0].fd);
2368    if (res) {
2369        fuse_reply_err(req, res);
2370        return;
2371    }
2372
2373    /*
2374     * If kill_priv is set, drop CAP_FSETID which should lead to kernel
2375     * clearing setuid/setgid on file. Note, for WRITE, we need to do
2376     * this even if killpriv_v2 is not enabled. fuse direct write path
2377     * relies on this.
2378     */
2379    if (fi->kill_priv) {
2380        res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
2381        if (res != 0) {
2382            fuse_reply_err(req, res);
2383            return;
2384        }
2385    }
2386
2387    res = fuse_buf_copy(&out_buf, in_buf);
2388    if (res < 0) {
2389        fuse_reply_err(req, -res);
2390    } else {
2391        fuse_reply_write(req, (size_t)res);
2392    }
2393
2394    if (cap_fsetid_dropped) {
2395        res = gain_effective_cap("FSETID");
2396        if (res) {
2397            fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
2398        }
2399    }
2400}
2401
2402static void lo_statfs(fuse_req_t req, fuse_ino_t ino)
2403{
2404    int res;
2405    struct statvfs stbuf;
2406
2407    res = fstatvfs(lo_fd(req, ino), &stbuf);
2408    if (res == -1) {
2409        fuse_reply_err(req, errno);
2410    } else {
2411        fuse_reply_statfs(req, &stbuf);
2412    }
2413}
2414
2415static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
2416                         off_t length, struct fuse_file_info *fi)
2417{
2418    int err = EOPNOTSUPP;
2419    (void)ino;
2420
2421#ifdef CONFIG_FALLOCATE
2422    err = fallocate(lo_fi_fd(req, fi), mode, offset, length);
2423    if (err < 0) {
2424        err = errno;
2425    }
2426
2427#elif defined(CONFIG_POSIX_FALLOCATE)
2428    if (mode) {
2429        fuse_reply_err(req, EOPNOTSUPP);
2430        return;
2431    }
2432
2433    err = posix_fallocate(lo_fi_fd(req, fi), offset, length);
2434#endif
2435
2436    fuse_reply_err(req, err);
2437}
2438
2439static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
2440                     int op)
2441{
2442    int res;
2443    (void)ino;
2444
2445    res = flock(lo_fi_fd(req, fi), op);
2446
2447    fuse_reply_err(req, res == -1 ? errno : 0);
2448}
2449
/*
 * Flag bits stored in XattrMapEntry.flags. The rule parser sets one
 * type bit and one or both scope bits per rule.
 */

/* types */
/*
 * Exit; process attribute unmodified if matched.
 * An empty key applies to all.
 */
#define XATTR_MAP_FLAG_OK      (1 <<  0)
/*
 * The attribute is unwanted;
 * EPERM on write, hidden on read.
 * (The client-side lookup returns -EPERM, the server-side lookup
 * returns -ENODATA so the name is dropped from listings.)
 */
#define XATTR_MAP_FLAG_BAD     (1 <<  1)
/*
 * For attributes that start with 'key', prepend 'prepend'.
 * 'key' may be empty to prepend for all attributes.
 * 'key' is defined from the set/remove point of view; the mapping is
 * automatically reversed on read (a matching 'prepend' is stripped).
 */
#define XATTR_MAP_FLAG_PREFIX  (1 <<  2)

/* scopes */
/* Apply rule to get/set/remove (names supplied by the client) */
#define XATTR_MAP_FLAG_CLIENT  (1 << 16)
/* Apply rule to list (names returned by the server filesystem) */
#define XATTR_MAP_FLAG_SERVER  (1 << 17)
/* Apply rule to all */
#define XATTR_MAP_FLAG_ALL   (XATTR_MAP_FLAG_SERVER | XATTR_MAP_FLAG_CLIENT)
2476
2477static void add_xattrmap_entry(struct lo_data *lo,
2478                               const XattrMapEntry *new_entry)
2479{
2480    XattrMapEntry *res = g_realloc_n(lo->xattr_map_list,
2481                                     lo->xattr_map_nentries + 1,
2482                                     sizeof(XattrMapEntry));
2483    res[lo->xattr_map_nentries++] = *new_entry;
2484
2485    lo->xattr_map_list = res;
2486}
2487
2488static void free_xattrmap(struct lo_data *lo)
2489{
2490    XattrMapEntry *map = lo->xattr_map_list;
2491    size_t i;
2492
2493    if (!map) {
2494        return;
2495    }
2496
2497    for (i = 0; i < lo->xattr_map_nentries; i++) {
2498        g_free(map[i].key);
2499        g_free(map[i].prepend);
2500    };
2501
2502    g_free(map);
2503    lo->xattr_map_list = NULL;
2504    lo->xattr_map_nentries = -1;
2505}
2506
2507/*
2508 * Handle the 'map' type, which is sugar for a set of commands
2509 * for the common case of prefixing a subset or everything,
2510 * and allowing anything not prefixed through.
2511 * It must be the last entry in the stream, although there
2512 * can be other entries before it.
2513 * The form is:
2514 *    :map:key:prefix:
2515 *
2516 * key maybe empty in which case all entries are prefixed.
2517 */
2518static void parse_xattrmap_map(struct lo_data *lo,
2519                               const char *rule, char sep)
2520{
2521    const char *tmp;
2522    char *key;
2523    char *prefix;
2524    XattrMapEntry tmp_entry;
2525
2526    if (*rule != sep) {
2527        fuse_log(FUSE_LOG_ERR,
2528                 "%s: Expecting '%c' after 'map' keyword, found '%c'\n",
2529                 __func__, sep, *rule);
2530        exit(1);
2531    }
2532
2533    rule++;
2534
2535    /* At start of 'key' field */
2536    tmp = strchr(rule, sep);
2537    if (!tmp) {
2538        fuse_log(FUSE_LOG_ERR,
2539                 "%s: Missing '%c' at end of key field in map rule\n",
2540                 __func__, sep);
2541        exit(1);
2542    }
2543
2544    key = g_strndup(rule, tmp - rule);
2545    rule = tmp + 1;
2546
2547    /* At start of prefix field */
2548    tmp = strchr(rule, sep);
2549    if (!tmp) {
2550        fuse_log(FUSE_LOG_ERR,
2551                 "%s: Missing '%c' at end of prefix field in map rule\n",
2552                 __func__, sep);
2553        exit(1);
2554    }
2555
2556    prefix = g_strndup(rule, tmp - rule);
2557    rule = tmp + 1;
2558
2559    /*
2560     * This should be the end of the string, we don't allow
2561     * any more commands after 'map'.
2562     */
2563    if (*rule) {
2564        fuse_log(FUSE_LOG_ERR,
2565                 "%s: Expecting end of command after map, found '%c'\n",
2566                 __func__, *rule);
2567        exit(1);
2568    }
2569
2570    /* 1st: Prefix matches/everything */
2571    tmp_entry.flags = XATTR_MAP_FLAG_PREFIX | XATTR_MAP_FLAG_ALL;
2572    tmp_entry.key = g_strdup(key);
2573    tmp_entry.prepend = g_strdup(prefix);
2574    add_xattrmap_entry(lo, &tmp_entry);
2575
2576    if (!*key) {
2577        /* Prefix all case */
2578
2579        /* 2nd: Hide any non-prefixed entries on the host */
2580        tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_ALL;
2581        tmp_entry.key = g_strdup("");
2582        tmp_entry.prepend = g_strdup("");
2583        add_xattrmap_entry(lo, &tmp_entry);
2584    } else {
2585        /* Prefix matching case */
2586
2587        /* 2nd: Hide non-prefixed but matching entries on the host */
2588        tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_SERVER;
2589        tmp_entry.key = g_strdup(""); /* Not used */
2590        tmp_entry.prepend = g_strdup(key);
2591        add_xattrmap_entry(lo, &tmp_entry);
2592
2593        /* 3rd: Stop the client accessing prefixed attributes directly */
2594        tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_CLIENT;
2595        tmp_entry.key = g_strdup(prefix);
2596        tmp_entry.prepend = g_strdup(""); /* Not used */
2597        add_xattrmap_entry(lo, &tmp_entry);
2598
2599        /* 4th: Everything else is OK */
2600        tmp_entry.flags = XATTR_MAP_FLAG_OK | XATTR_MAP_FLAG_ALL;
2601        tmp_entry.key = g_strdup("");
2602        tmp_entry.prepend = g_strdup("");
2603        add_xattrmap_entry(lo, &tmp_entry);
2604    }
2605
2606    g_free(key);
2607    g_free(prefix);
2608}
2609
2610static void parse_xattrmap(struct lo_data *lo)
2611{
2612    const char *map = lo->xattrmap;
2613    const char *tmp;
2614    int ret;
2615
2616    lo->xattr_map_nentries = 0;
2617    while (*map) {
2618        XattrMapEntry tmp_entry;
2619        char sep;
2620
2621        if (isspace(*map)) {
2622            map++;
2623            continue;
2624        }
2625        /* The separator is the first non-space of the rule */
2626        sep = *map++;
2627        if (!sep) {
2628            break;
2629        }
2630
2631        tmp_entry.flags = 0;
2632        /* Start of 'type' */
2633        if (strstart(map, "prefix", &map)) {
2634            tmp_entry.flags |= XATTR_MAP_FLAG_PREFIX;
2635        } else if (strstart(map, "ok", &map)) {
2636            tmp_entry.flags |= XATTR_MAP_FLAG_OK;
2637        } else if (strstart(map, "bad", &map)) {
2638            tmp_entry.flags |= XATTR_MAP_FLAG_BAD;
2639        } else if (strstart(map, "map", &map)) {
2640            /*
2641             * map is sugar that adds a number of rules, and must be
2642             * the last entry.
2643             */
2644            parse_xattrmap_map(lo, map, sep);
2645            break;
2646        } else {
2647            fuse_log(FUSE_LOG_ERR,
2648                     "%s: Unexpected type;"
2649                     "Expecting 'prefix', 'ok', 'bad' or 'map' in rule %zu\n",
2650                     __func__, lo->xattr_map_nentries);
2651            exit(1);
2652        }
2653
2654        if (*map++ != sep) {
2655            fuse_log(FUSE_LOG_ERR,
2656                     "%s: Missing '%c' at end of type field of rule %zu\n",
2657                     __func__, sep, lo->xattr_map_nentries);
2658            exit(1);
2659        }
2660
2661        /* Start of 'scope' */
2662        if (strstart(map, "client", &map)) {
2663            tmp_entry.flags |= XATTR_MAP_FLAG_CLIENT;
2664        } else if (strstart(map, "server", &map)) {
2665            tmp_entry.flags |= XATTR_MAP_FLAG_SERVER;
2666        } else if (strstart(map, "all", &map)) {
2667            tmp_entry.flags |= XATTR_MAP_FLAG_ALL;
2668        } else {
2669            fuse_log(FUSE_LOG_ERR,
2670                     "%s: Unexpected scope;"
2671                     " Expecting 'client', 'server', or 'all', in rule %zu\n",
2672                     __func__, lo->xattr_map_nentries);
2673            exit(1);
2674        }
2675
2676        if (*map++ != sep) {
2677            fuse_log(FUSE_LOG_ERR,
2678                     "%s: Expecting '%c' found '%c'"
2679                     " after scope in rule %zu\n",
2680                     __func__, sep, *map, lo->xattr_map_nentries);
2681            exit(1);
2682        }
2683
2684        /* At start of 'key' field */
2685        tmp = strchr(map, sep);
2686        if (!tmp) {
2687            fuse_log(FUSE_LOG_ERR,
2688                     "%s: Missing '%c' at end of key field of rule %zu",
2689                     __func__, sep, lo->xattr_map_nentries);
2690            exit(1);
2691        }
2692        tmp_entry.key = g_strndup(map, tmp - map);
2693        map = tmp + 1;
2694
2695        /* At start of 'prepend' field */
2696        tmp = strchr(map, sep);
2697        if (!tmp) {
2698            fuse_log(FUSE_LOG_ERR,
2699                     "%s: Missing '%c' at end of prepend field of rule %zu",
2700                     __func__, sep, lo->xattr_map_nentries);
2701            exit(1);
2702        }
2703        tmp_entry.prepend = g_strndup(map, tmp - map);
2704        map = tmp + 1;
2705
2706        add_xattrmap_entry(lo, &tmp_entry);
2707        /* End of rule - go around again for another rule */
2708    }
2709
2710    if (!lo->xattr_map_nentries) {
2711        fuse_log(FUSE_LOG_ERR, "Empty xattr map\n");
2712        exit(1);
2713    }
2714
2715    ret = xattr_map_client(lo, "security.capability",
2716                           &lo->xattr_security_capability);
2717    if (ret) {
2718        fuse_log(FUSE_LOG_ERR, "Failed to map security.capability: %s\n",
2719                strerror(ret));
2720        exit(1);
2721    }
2722    if (!lo->xattr_security_capability ||
2723        !strcmp(lo->xattr_security_capability, "security.capability")) {
2724        /* 1-1 mapping, don't need to do anything */
2725        free(lo->xattr_security_capability);
2726        lo->xattr_security_capability = NULL;
2727    }
2728}
2729
2730/*
2731 * For use with getxattr/setxattr/removexattr, where the client
2732 * gives us a name and we may need to choose a different one.
2733 * Allocates a buffer for the result placing it in *out_name.
2734 *   If there's no change then *out_name is not set.
2735 * Returns 0 on success
2736 * Can return -EPERM to indicate we block a given attribute
2737 *   (in which case out_name is not allocated)
2738 * Can return -ENOMEM to indicate out_name couldn't be allocated.
2739 */
2740static int xattr_map_client(const struct lo_data *lo, const char *client_name,
2741                            char **out_name)
2742{
2743    size_t i;
2744    for (i = 0; i < lo->xattr_map_nentries; i++) {
2745        const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
2746
2747        if ((cur_entry->flags & XATTR_MAP_FLAG_CLIENT) &&
2748            (strstart(client_name, cur_entry->key, NULL))) {
2749            if (cur_entry->flags & XATTR_MAP_FLAG_BAD) {
2750                return -EPERM;
2751            }
2752            if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
2753                /* Unmodified name */
2754                return 0;
2755            }
2756            if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
2757                *out_name = g_try_malloc(strlen(client_name) +
2758                                         strlen(cur_entry->prepend) + 1);
2759                if (!*out_name) {
2760                    return -ENOMEM;
2761                }
2762                sprintf(*out_name, "%s%s", cur_entry->prepend, client_name);
2763                return 0;
2764            }
2765        }
2766    }
2767
2768    return -EPERM;
2769}
2770
2771/*
2772 * For use with listxattr where the server fs gives us a name and we may need
2773 * to sanitize this for the client.
2774 * Returns a pointer to the result in *out_name
2775 *   This is always the original string or the current string with some prefix
2776 *   removed; no reallocation is done.
2777 * Returns 0 on success
2778 * Can return -ENODATA to indicate the name should be dropped from the list.
2779 */
2780static int xattr_map_server(const struct lo_data *lo, const char *server_name,
2781                            const char **out_name)
2782{
2783    size_t i;
2784    const char *end;
2785
2786    for (i = 0; i < lo->xattr_map_nentries; i++) {
2787        const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
2788
2789        if ((cur_entry->flags & XATTR_MAP_FLAG_SERVER) &&
2790            (strstart(server_name, cur_entry->prepend, &end))) {
2791            if (cur_entry->flags & XATTR_MAP_FLAG_BAD) {
2792                return -ENODATA;
2793            }
2794            if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
2795                *out_name = server_name;
2796                return 0;
2797            }
2798            if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
2799                /* Remove prefix */
2800                *out_name = end;
2801                return 0;
2802            }
2803        }
2804    }
2805
2806    return -ENODATA;
2807}
2808
/*
 * Change the working directory to the directory open on @fd.
 * The fchdir() result is only checked through assert(), so callers use
 * this where a failure is not expected and cannot be handled.
 */
#define FCHDIR_NOFAIL(fd) do {                         \
        int fchdir_res = fchdir(fd);                   \
        assert(fchdir_res == 0);                       \
    } while (0)
2813
2814static bool block_xattr(struct lo_data *lo, const char *name)
2815{
2816    /*
2817     * If user explicitly enabled posix_acl or did not provide any option,
2818     * do not block acl. Otherwise block system.posix_acl_access and
2819     * system.posix_acl_default xattrs.
2820     */
2821    if (lo->user_posix_acl) {
2822        return false;
2823    }
2824    if (!strcmp(name, "system.posix_acl_access") ||
2825        !strcmp(name, "system.posix_acl_default"))
2826            return true;
2827
2828    return false;
2829}
2830
2831/*
2832 * Returns number of bytes in xattr_list after filtering on success. This
2833 * could be zero as well if nothing is left after filtering.
2834 *
2835 * Returns negative error code on failure.
2836 * xattr_list is modified in place.
2837 */
2838static int remove_blocked_xattrs(struct lo_data *lo, char *xattr_list,
2839                                 unsigned in_size)
2840{
2841    size_t out_index, in_index;
2842
2843    /*
2844     * As of now we only filter out acl xattrs. If acls are enabled or
2845     * they have not been explicitly disabled, there is nothing to
2846     * filter.
2847     */
2848    if (lo->user_posix_acl) {
2849        return in_size;
2850    }
2851
2852    out_index = 0;
2853    in_index = 0;
2854    while (in_index < in_size) {
2855        char *in_ptr = xattr_list + in_index;
2856
2857        /* Length of current attribute name */
2858        size_t in_len = strlen(xattr_list + in_index) + 1;
2859
2860        if (!block_xattr(lo, in_ptr)) {
2861            if (in_index != out_index) {
2862                memmove(xattr_list + out_index, xattr_list + in_index, in_len);
2863            }
2864            out_index += in_len;
2865        }
2866        in_index += in_len;
2867     }
2868    return out_index;
2869}
2870
2871static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
2872                        size_t size)
2873{
2874    struct lo_data *lo = lo_data(req);
2875    g_autofree char *value = NULL;
2876    char procname[64];
2877    const char *name;
2878    char *mapped_name;
2879    struct lo_inode *inode;
2880    ssize_t ret;
2881    int saverr;
2882    int fd = -1;
2883
2884    if (block_xattr(lo, in_name)) {
2885        fuse_reply_err(req, EOPNOTSUPP);
2886        return;
2887    }
2888
2889    mapped_name = NULL;
2890    name = in_name;
2891    if (lo->xattrmap) {
2892        ret = xattr_map_client(lo, in_name, &mapped_name);
2893        if (ret < 0) {
2894            if (ret == -EPERM) {
2895                ret = -ENODATA;
2896            }
2897            fuse_reply_err(req, -ret);
2898            return;
2899        }
2900        if (mapped_name) {
2901            name = mapped_name;
2902        }
2903    }
2904
2905    inode = lo_inode(req, ino);
2906    if (!inode) {
2907        fuse_reply_err(req, EBADF);
2908        g_free(mapped_name);
2909        return;
2910    }
2911
2912    saverr = ENOSYS;
2913    if (!lo_data(req)->xattr) {
2914        goto out;
2915    }
2916
2917    fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n",
2918             ino, name, size);
2919
2920    if (size) {
2921        value = g_try_malloc(size);
2922        if (!value) {
2923            goto out_err;
2924        }
2925    }
2926
2927    sprintf(procname, "%i", inode->fd);
2928    /*
2929     * It is not safe to open() non-regular/non-dir files in file server
2930     * unless O_PATH is used, so use that method for regular files/dir
2931     * only (as it seems giving less performance overhead).
2932     * Otherwise, call fchdir() to avoid open().
2933     */
2934    if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2935        fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2936        if (fd < 0) {
2937            goto out_err;
2938        }
2939        ret = fgetxattr(fd, name, value, size);
2940        saverr = ret == -1 ? errno : 0;
2941    } else {
2942        /* fchdir should not fail here */
2943        FCHDIR_NOFAIL(lo->proc_self_fd);
2944        ret = getxattr(procname, name, value, size);
2945        saverr = ret == -1 ? errno : 0;
2946        FCHDIR_NOFAIL(lo->root.fd);
2947    }
2948
2949    if (ret == -1) {
2950        goto out;
2951    }
2952    if (size) {
2953        saverr = 0;
2954        if (ret == 0) {
2955            goto out;
2956        }
2957        fuse_reply_buf(req, value, ret);
2958    } else {
2959        fuse_reply_xattr(req, ret);
2960    }
2961out_free:
2962    if (fd >= 0) {
2963        close(fd);
2964    }
2965
2966    lo_inode_put(lo, &inode);
2967    return;
2968
2969out_err:
2970    saverr = errno;
2971out:
2972    fuse_reply_err(req, saverr);
2973    g_free(mapped_name);
2974    goto out_free;
2975}
2976
2977static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
2978{
2979    struct lo_data *lo = lo_data(req);
2980    g_autofree char *value = NULL;
2981    char procname[64];
2982    struct lo_inode *inode;
2983    ssize_t ret;
2984    int saverr;
2985    int fd = -1;
2986
2987    inode = lo_inode(req, ino);
2988    if (!inode) {
2989        fuse_reply_err(req, EBADF);
2990        return;
2991    }
2992
2993    saverr = ENOSYS;
2994    if (!lo_data(req)->xattr) {
2995        goto out;
2996    }
2997
2998    fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino,
2999             size);
3000
3001    if (size) {
3002        value = g_try_malloc(size);
3003        if (!value) {
3004            goto out_err;
3005        }
3006    }
3007
3008    sprintf(procname, "%i", inode->fd);
3009    if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
3010        fd = openat(lo->proc_self_fd, procname, O_RDONLY);
3011        if (fd < 0) {
3012            goto out_err;
3013        }
3014        ret = flistxattr(fd, value, size);
3015        saverr = ret == -1 ? errno : 0;
3016    } else {
3017        /* fchdir should not fail here */
3018        FCHDIR_NOFAIL(lo->proc_self_fd);
3019        ret = listxattr(procname, value, size);
3020        saverr = ret == -1 ? errno : 0;
3021        FCHDIR_NOFAIL(lo->root.fd);
3022    }
3023
3024    if (ret == -1) {
3025        goto out;
3026    }
3027    if (size) {
3028        saverr = 0;
3029        if (ret == 0) {
3030            goto out;
3031        }
3032
3033        if (lo->xattr_map_list) {
3034            /*
3035             * Map the names back, some attributes might be dropped,
3036             * some shortened, but not increased, so we shouldn't
3037             * run out of room.
3038             */
3039            size_t out_index, in_index;
3040            out_index = 0;
3041            in_index = 0;
3042            while (in_index < ret) {
3043                const char *map_out;
3044                char *in_ptr = value + in_index;
3045                /* Length of current attribute name */
3046                size_t in_len = strlen(value + in_index) + 1;
3047
3048                int mapret = xattr_map_server(lo, in_ptr, &map_out);
3049                if (mapret != -ENODATA && mapret != 0) {
3050                    /* Shouldn't happen */
3051                    saverr = -mapret;
3052                    goto out;
3053                }
3054                if (mapret == 0) {
3055                    /* Either unchanged, or truncated */
3056                    size_t out_len;
3057                    if (map_out != in_ptr) {
3058                        /* +1 copies the NIL */
3059                        out_len = strlen(map_out) + 1;
3060                    } else {
3061                        /* No change */
3062                        out_len = in_len;
3063                    }
3064                    /*
3065                     * Move result along, may still be needed for an unchanged
3066                     * entry if a previous entry was changed.
3067                     */
3068                    memmove(value + out_index, map_out, out_len);
3069
3070                    out_index += out_len;
3071                }
3072                in_index += in_len;
3073            }
3074            ret = out_index;
3075            if (ret == 0) {
3076                goto out;
3077            }
3078        }
3079
3080        ret = remove_blocked_xattrs(lo, value, ret);
3081        if (ret <= 0) {
3082            saverr = -ret;
3083            goto out;
3084        }
3085        fuse_reply_buf(req, value, ret);
3086    } else {
3087        /*
3088         * xattrmap only ever shortens the result,
3089         * so we don't need to do anything clever with the
3090         * allocation length here.
3091         */
3092        fuse_reply_xattr(req, ret);
3093    }
3094out_free:
3095    if (fd >= 0) {
3096        close(fd);
3097    }
3098
3099    lo_inode_put(lo, &inode);
3100    return;
3101
3102out_err:
3103    saverr = errno;
3104out:
3105    fuse_reply_err(req, saverr);
3106    goto out_free;
3107}
3108
3109static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
3110                        const char *value, size_t size, int flags,
3111                        uint32_t extra_flags)
3112{
3113    char procname[64];
3114    const char *name;
3115    char *mapped_name;
3116    struct lo_data *lo = lo_data(req);
3117    struct lo_inode *inode;
3118    ssize_t ret;
3119    int saverr;
3120    int fd = -1;
3121    bool switched_creds = false;
3122    bool cap_fsetid_dropped = false;
3123    struct lo_cred old = {};
3124
3125    if (block_xattr(lo, in_name)) {
3126        fuse_reply_err(req, EOPNOTSUPP);
3127        return;
3128    }
3129
3130    mapped_name = NULL;
3131    name = in_name;
3132    if (lo->xattrmap) {
3133        ret = xattr_map_client(lo, in_name, &mapped_name);
3134        if (ret < 0) {
3135            fuse_reply_err(req, -ret);
3136            return;
3137        }
3138        if (mapped_name) {
3139            name = mapped_name;
3140        }
3141    }
3142
3143    inode = lo_inode(req, ino);
3144    if (!inode) {
3145        fuse_reply_err(req, EBADF);
3146        g_free(mapped_name);
3147        return;
3148    }
3149
3150    saverr = ENOSYS;
3151    if (!lo_data(req)->xattr) {
3152        goto out;
3153    }
3154
3155    fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64
3156             ", name=%s value=%s size=%zd)\n", ino, name, value, size);
3157
3158    sprintf(procname, "%i", inode->fd);
3159    /*
3160     * If we are setting posix access acl and if SGID needs to be
3161     * cleared, then switch to caller's gid and drop CAP_FSETID
3162     * and that should make sure host kernel clears SGID.
3163     *
3164     * This probably will not work when we support idmapped mounts.
3165     * In that case we will need to find a non-root gid and switch
3166     * to it. (Instead of gid in request). Fix it when we support
3167     * idmapped mounts.
3168     */
3169    if (lo->posix_acl && !strcmp(name, "system.posix_acl_access")
3170        && (extra_flags & FUSE_SETXATTR_ACL_KILL_SGID)) {
3171        ret = lo_drop_cap_change_cred(req, &old, false, "FSETID",
3172                                      &cap_fsetid_dropped);
3173        if (ret) {
3174            saverr = ret;
3175            goto out;
3176        }
3177        switched_creds = true;
3178    }
3179    if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
3180        fd = openat(lo->proc_self_fd, procname, O_RDONLY);
3181        if (fd < 0) {
3182            saverr = errno;
3183            goto out;
3184        }
3185        ret = fsetxattr(fd, name, value, size, flags);
3186        saverr = ret == -1 ? errno : 0;
3187    } else {
3188        /* fchdir should not fail here */
3189        FCHDIR_NOFAIL(lo->proc_self_fd);
3190        ret = setxattr(procname, name, value, size, flags);
3191        saverr = ret == -1 ? errno : 0;
3192        FCHDIR_NOFAIL(lo->root.fd);
3193    }
3194    if (switched_creds) {
3195        if (cap_fsetid_dropped)
3196            lo_restore_cred_gain_cap(&old, false, "FSETID");
3197        else
3198            lo_restore_cred(&old, false);
3199    }
3200
3201out:
3202    if (fd >= 0) {
3203        close(fd);
3204    }
3205
3206    lo_inode_put(lo, &inode);
3207    g_free(mapped_name);
3208    fuse_reply_err(req, saverr);
3209}
3210
3211static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name)
3212{
3213    char procname[64];
3214    const char *name;
3215    char *mapped_name;
3216    struct lo_data *lo = lo_data(req);
3217    struct lo_inode *inode;
3218    ssize_t ret;
3219    int saverr;
3220    int fd = -1;
3221
3222    if (block_xattr(lo, in_name)) {
3223        fuse_reply_err(req, EOPNOTSUPP);
3224        return;
3225    }
3226
3227    mapped_name = NULL;
3228    name = in_name;
3229    if (lo->xattrmap) {
3230        ret = xattr_map_client(lo, in_name, &mapped_name);
3231        if (ret < 0) {
3232            fuse_reply_err(req, -ret);
3233            return;
3234        }
3235        if (mapped_name) {
3236            name = mapped_name;
3237        }
3238    }
3239
3240    inode = lo_inode(req, ino);
3241    if (!inode) {
3242        fuse_reply_err(req, EBADF);
3243        g_free(mapped_name);
3244        return;
3245    }
3246
3247    saverr = ENOSYS;
3248    if (!lo_data(req)->xattr) {
3249        goto out;
3250    }
3251
3252    fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino,
3253             name);
3254
3255    sprintf(procname, "%i", inode->fd);
3256    if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
3257        fd = openat(lo->proc_self_fd, procname, O_RDONLY);
3258        if (fd < 0) {
3259            saverr = errno;
3260            goto out;
3261        }
3262        ret = fremovexattr(fd, name);
3263        saverr = ret == -1 ? errno : 0;
3264    } else {
3265        /* fchdir should not fail here */
3266        FCHDIR_NOFAIL(lo->proc_self_fd);
3267        ret = removexattr(procname, name);
3268        saverr = ret == -1 ? errno : 0;
3269        FCHDIR_NOFAIL(lo->root.fd);
3270    }
3271
3272out:
3273    if (fd >= 0) {
3274        close(fd);
3275    }
3276
3277    lo_inode_put(lo, &inode);
3278    g_free(mapped_name);
3279    fuse_reply_err(req, saverr);
3280}
3281
#ifdef HAVE_COPY_FILE_RANGE
/*
 * FUSE copy_file_range handler: copy up to @len bytes from offset @off_in
 * of the file behind @fi_in to offset @off_out of the file behind @fi_out
 * using copy_file_range(2).
 *
 * Replies with the number of bytes copied via fuse_reply_write(), or with
 * errno via fuse_reply_err() if the syscall fails.
 */
static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
                               struct fuse_file_info *fi_in, fuse_ino_t ino_out,
                               off_t off_out, struct fuse_file_info *fi_out,
                               size_t len, int flags)
{
    int in_fd, out_fd;
    ssize_t res;

    in_fd = lo_fi_fd(req, fi_in);
    out_fd = lo_fi_fd(req, fi_out);

    /* Offsets are signed (%jd with intmax_t); the length is a size_t (%zu) */
    fuse_log(FUSE_LOG_DEBUG,
             "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, "
             "off=%jd, ino=%" PRIu64 "/fd=%d, "
             "off=%jd, size=%zu, flags=0x%x)\n",
             ino_in, in_fd, (intmax_t)off_in,
             ino_out, out_fd, (intmax_t)off_out, len, (unsigned int)flags);

    res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags);
    if (res < 0) {
        fuse_reply_err(req, errno);
    } else {
        fuse_reply_write(req, res);
    }
}
#endif
3309
3310static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
3311                     struct fuse_file_info *fi)
3312{
3313    off_t res;
3314
3315    (void)ino;
3316    res = lseek(lo_fi_fd(req, fi), off, whence);
3317    if (res != -1) {
3318        fuse_reply_lseek(req, res);
3319    } else {
3320        fuse_reply_err(req, errno);
3321    }
3322}
3323
3324static void lo_destroy(void *userdata)
3325{
3326    struct lo_data *lo = (struct lo_data *)userdata;
3327
3328    pthread_mutex_lock(&lo->mutex);
3329    while (true) {
3330        GHashTableIter iter;
3331        gpointer key, value;
3332
3333        g_hash_table_iter_init(&iter, lo->inodes);
3334        if (!g_hash_table_iter_next(&iter, &key, &value)) {
3335            break;
3336        }
3337
3338        struct lo_inode *inode = value;
3339        unref_inode(lo, inode, inode->nlookup);
3340    }
3341    pthread_mutex_unlock(&lo->mutex);
3342}
3343
3344static struct fuse_lowlevel_ops lo_oper = {
3345    .init = lo_init,
3346    .lookup = lo_lookup,
3347    .mkdir = lo_mkdir,
3348    .mknod = lo_mknod,
3349    .symlink = lo_symlink,
3350    .link = lo_link,
3351    .unlink = lo_unlink,
3352    .rmdir = lo_rmdir,
3353    .rename = lo_rename,
3354    .forget = lo_forget,
3355    .forget_multi = lo_forget_multi,
3356    .getattr = lo_getattr,
3357    .setattr = lo_setattr,
3358    .readlink = lo_readlink,
3359    .opendir = lo_opendir,
3360    .readdir = lo_readdir,
3361    .readdirplus = lo_readdirplus,
3362    .releasedir = lo_releasedir,
3363    .fsyncdir = lo_fsyncdir,
3364    .create = lo_create,
3365    .getlk = lo_getlk,
3366    .setlk = lo_setlk,
3367    .open = lo_open,
3368    .release = lo_release,
3369    .flush = lo_flush,
3370    .fsync = lo_fsync,
3371    .read = lo_read,
3372    .write_buf = lo_write_buf,
3373    .statfs = lo_statfs,
3374    .fallocate = lo_fallocate,
3375    .flock = lo_flock,
3376    .getxattr = lo_getxattr,
3377    .listxattr = lo_listxattr,
3378    .setxattr = lo_setxattr,
3379    .removexattr = lo_removexattr,
3380#ifdef HAVE_COPY_FILE_RANGE
3381    .copy_file_range = lo_copy_file_range,
3382#endif
3383    .lseek = lo_lseek,
3384    .destroy = lo_destroy,
3385};
3386
/*
 * Print the vhost-user.json backend program capabilities to stdout:
 * a JSON object whose only member is "type": "fs".
 */
static void print_capabilities(void)
{
    printf("{\n"
           "  \"type\": \"fs\"\n"
           "}\n");
}
3394
3395/*
3396 * Drop all Linux capabilities because the wait parent process only needs to
3397 * sit in waitpid(2) and terminate.
3398 */
3399static void setup_wait_parent_capabilities(void)
3400{
3401    capng_setpid(syscall(SYS_gettid));
3402    capng_clear(CAPNG_SELECT_BOTH);
3403    capng_apply(CAPNG_SELECT_BOTH);
3404}
3405
3406/*
3407 * Move to a new mount, net, and pid namespaces to isolate this process.
3408 */
3409static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
3410{
3411    pid_t child;
3412
3413    /*
3414     * Create a new pid namespace for *child* processes.  We'll have to
3415     * fork in order to enter the new pid namespace.  A new mount namespace
3416     * is also needed so that we can remount /proc for the new pid
3417     * namespace.
3418     *
3419     * Our UNIX domain sockets have been created.  Now we can move to
3420     * an empty network namespace to prevent TCP/IP and other network
3421     * activity in case this process is compromised.
3422     */
3423    if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) {
3424        fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
3425        exit(1);
3426    }
3427
3428    child = fork();
3429    if (child < 0) {
3430        fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
3431        exit(1);
3432    }
3433    if (child > 0) {
3434        pid_t waited;
3435        int wstatus;
3436
3437        setup_wait_parent_capabilities();
3438
3439        /* The parent waits for the child */
3440        do {
3441            waited = waitpid(child, &wstatus, 0);
3442        } while (waited < 0 && errno == EINTR && !se->exited);
3443
3444        /* We were terminated by a signal, see fuse_signals.c */
3445        if (se->exited) {
3446            exit(0);
3447        }
3448
3449        if (WIFEXITED(wstatus)) {
3450            exit(WEXITSTATUS(wstatus));
3451        }
3452
3453        exit(1);
3454    }
3455
3456    /* Send us SIGTERM when the parent thread terminates, see prctl(2) */
3457    prctl(PR_SET_PDEATHSIG, SIGTERM);
3458
3459    /*
3460     * If the mounts have shared propagation then we want to opt out so our
3461     * mount changes don't affect the parent mount namespace.
3462     */
3463    if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) {
3464        fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n");
3465        exit(1);
3466    }
3467
3468    /* The child must remount /proc to use the new pid namespace */
3469    if (mount("proc", "/proc", "proc",
3470              MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) {
3471        fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n");
3472        exit(1);
3473    }
3474
3475    /*
3476     * We only need /proc/self/fd. Prevent ".." from accessing parent
3477     * directories of /proc/self/fd by bind-mounting it over /proc. Since / was
3478     * previously remounted with MS_REC | MS_SLAVE this mount change only
3479     * affects our process.
3480     */
3481    if (mount("/proc/self/fd", "/proc", NULL, MS_BIND, NULL) < 0) {
3482        fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, MS_BIND): %m\n");
3483        exit(1);
3484    }
3485
3486    /* Get the /proc (actually /proc/self/fd, see above) file descriptor */
3487    lo->proc_self_fd = open("/proc", O_PATH);
3488    if (lo->proc_self_fd == -1) {
3489        fuse_log(FUSE_LOG_ERR, "open(/proc, O_PATH): %m\n");
3490        exit(1);
3491    }
3492}
3493
3494/*
3495 * Capture the capability state, we'll need to restore this for individual
3496 * threads later; see load_capng.
3497 */
3498static void setup_capng(void)
3499{
3500    /* Note this accesses /proc so has to happen before the sandbox */
3501    if (capng_get_caps_process()) {
3502        fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n");
3503        exit(1);
3504    }
3505    pthread_mutex_init(&cap.mutex, NULL);
3506    pthread_mutex_lock(&cap.mutex);
3507    cap.saved = capng_save_state();
3508    if (!cap.saved) {
3509        fuse_log(FUSE_LOG_ERR, "capng_save_state\n");
3510        exit(1);
3511    }
3512    pthread_mutex_unlock(&cap.mutex);
3513}
3514
3515static void cleanup_capng(void)
3516{
3517    free(cap.saved);
3518    cap.saved = NULL;
3519    pthread_mutex_destroy(&cap.mutex);
3520}
3521
3522
3523/*
3524 * Make the source directory our root so symlinks cannot escape and no other
3525 * files are accessible.  Assumes unshare(CLONE_NEWNS) was already called.
3526 */
3527static void setup_mounts(const char *source)
3528{
3529    int oldroot;
3530    int newroot;
3531
3532    if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) {
3533        fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
3534        exit(1);
3535    }
3536
3537    /* This magic is based on lxc's lxc_pivot_root() */
3538    oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
3539    if (oldroot < 0) {
3540        fuse_log(FUSE_LOG_ERR, "open(/): %m\n");
3541        exit(1);
3542    }
3543
3544    newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
3545    if (newroot < 0) {
3546        fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source);
3547        exit(1);
3548    }
3549
3550    if (fchdir(newroot) < 0) {
3551        fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
3552        exit(1);
3553    }
3554
3555    if (syscall(__NR_pivot_root, ".", ".") < 0) {
3556        fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n");
3557        exit(1);
3558    }
3559
3560    if (fchdir(oldroot) < 0) {
3561        fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n");
3562        exit(1);
3563    }
3564
3565    if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) {
3566        fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n");
3567        exit(1);
3568    }
3569
3570    if (umount2(".", MNT_DETACH) < 0) {
3571        fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n");
3572        exit(1);
3573    }
3574
3575    if (fchdir(newroot) < 0) {
3576        fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
3577        exit(1);
3578    }
3579
3580    close(newroot);
3581    close(oldroot);
3582}
3583
3584/*
3585 * Only keep capabilities in allowlist that are needed for file system operation
3586 * The (possibly NULL) modcaps_in string passed in is free'd before exit.
3587 */
3588static void setup_capabilities(char *modcaps_in)
3589{
3590    char *modcaps = modcaps_in;
3591    pthread_mutex_lock(&cap.mutex);
3592    capng_restore_state(&cap.saved);
3593
3594    /*
3595     * Add to allowlist file system-related capabilities that are needed for a
3596     * file server to act like root.  Drop everything else like networking and
3597     * sysadmin capabilities.
3598     *
3599     * Exclusions:
3600     * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
3601     *    and we don't support that.
3602     * 2. CAP_MAC_OVERRIDE is not included because it only seems to be
3603     *    used by the Smack LSM.  Omit it until there is demand for it.
3604     */
3605    capng_setpid(syscall(SYS_gettid));
3606    capng_clear(CAPNG_SELECT_BOTH);
3607    if (capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE,
3608            CAP_CHOWN,
3609            CAP_DAC_OVERRIDE,
3610            CAP_FOWNER,
3611            CAP_FSETID,
3612            CAP_SETGID,
3613            CAP_SETUID,
3614            CAP_MKNOD,
3615            CAP_SETFCAP,
3616            -1)) {
3617        fuse_log(FUSE_LOG_ERR, "%s: capng_updatev failed\n", __func__);
3618        exit(1);
3619    }
3620
3621    /*
3622     * The modcaps option is a colon separated list of caps,
3623     * each preceded by either + or -.
3624     */
3625    while (modcaps) {
3626        capng_act_t action;
3627        int cap;
3628
3629        char *next = strchr(modcaps, ':');
3630        if (next) {
3631            *next = '\0';
3632            next++;
3633        }
3634
3635        switch (modcaps[0]) {
3636        case '+':
3637            action = CAPNG_ADD;
3638            break;
3639
3640        case '-':
3641            action = CAPNG_DROP;
3642            break;
3643
3644        default:
3645            fuse_log(FUSE_LOG_ERR,
3646                     "%s: Expecting '+'/'-' in modcaps but found '%c'\n",
3647                     __func__, modcaps[0]);
3648            exit(1);
3649        }
3650        cap = capng_name_to_capability(modcaps + 1);
3651        if (cap < 0) {
3652            fuse_log(FUSE_LOG_ERR, "%s: Unknown capability '%s'\n", __func__,
3653                     modcaps);
3654            exit(1);
3655        }
3656        if (capng_update(action, CAPNG_PERMITTED | CAPNG_EFFECTIVE, cap)) {
3657            fuse_log(FUSE_LOG_ERR, "%s: capng_update failed for '%s'\n",
3658                     __func__, modcaps);
3659            exit(1);
3660        }
3661
3662        modcaps = next;
3663    }
3664    g_free(modcaps_in);
3665
3666    if (capng_apply(CAPNG_SELECT_BOTH)) {
3667        fuse_log(FUSE_LOG_ERR, "%s: capng_apply failed\n", __func__);
3668        exit(1);
3669    }
3670
3671    cap.saved = capng_save_state();
3672    if (!cap.saved) {
3673        fuse_log(FUSE_LOG_ERR, "%s: capng_save_state failed\n", __func__);
3674        exit(1);
3675    }
3676    pthread_mutex_unlock(&cap.mutex);
3677}
3678
3679/*
3680 * Use chroot as a weaker sandbox for environments where the process is
3681 * launched without CAP_SYS_ADMIN.
3682 */
3683static void setup_chroot(struct lo_data *lo)
3684{
3685    lo->proc_self_fd = open("/proc/self/fd", O_PATH);
3686    if (lo->proc_self_fd == -1) {
3687        fuse_log(FUSE_LOG_ERR, "open(\"/proc/self/fd\", O_PATH): %m\n");
3688        exit(1);
3689    }
3690
3691    /*
3692     * Make the shared directory the file system root so that FUSE_OPEN
3693     * (lo_open()) cannot escape the shared directory by opening a symlink.
3694     *
3695     * The chroot(2) syscall is later disabled by seccomp and the
3696     * CAP_SYS_CHROOT capability is dropped so that tampering with the chroot
3697     * is not possible.
3698     *
3699     * However, it's still possible to escape the chroot via lo->proc_self_fd
3700     * but that requires first gaining control of the process.
3701     */
3702    if (chroot(lo->source) != 0) {
3703        fuse_log(FUSE_LOG_ERR, "chroot(\"%s\"): %m\n", lo->source);
3704        exit(1);
3705    }
3706
3707    /* Move into the chroot */
3708    if (chdir("/") != 0) {
3709        fuse_log(FUSE_LOG_ERR, "chdir(\"/\"): %m\n");
3710        exit(1);
3711    }
3712}
3713
3714/*
3715 * Lock down this process to prevent access to other processes or files outside
3716 * source directory.  This reduces the impact of arbitrary code execution bugs.
3717 */
3718static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
3719                          bool enable_syslog)
3720{
3721    if (lo->sandbox == SANDBOX_NAMESPACE) {
3722        setup_namespaces(lo, se);
3723        setup_mounts(lo->source);
3724    } else {
3725        setup_chroot(lo);
3726    }
3727
3728    setup_seccomp(enable_syslog);
3729    setup_capabilities(g_strdup(lo->modcaps));
3730}
3731
3732/* Set the maximum number of open file descriptors */
3733static void setup_nofile_rlimit(unsigned long rlimit_nofile)
3734{
3735    struct rlimit rlim = {
3736        .rlim_cur = rlimit_nofile,
3737        .rlim_max = rlimit_nofile,
3738    };
3739
3740    if (rlimit_nofile == 0) {
3741        return; /* nothing to do */
3742    }
3743
3744    if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
3745        /* Ignore SELinux denials */
3746        if (errno == EPERM) {
3747            return;
3748        }
3749
3750        fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n");
3751        exit(1);
3752    }
3753}
3754
3755static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
3756{
3757    g_autofree char *localfmt = NULL;
3758
3759    if (current_log_level < level) {
3760        return;
3761    }
3762
3763    if (current_log_level == FUSE_LOG_DEBUG) {
3764        if (use_syslog) {
3765            /* no timestamp needed */
3766            localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid),
3767                                       fmt);
3768        } else {
3769            g_autoptr(GDateTime) now = g_date_time_new_now_utc();
3770            g_autofree char *nowstr = g_date_time_format(now, "%Y-%m-%d %H:%M:%S.%f%z");
3771            localfmt = g_strdup_printf("[%s] [ID: %08ld] %s",
3772                                       nowstr, syscall(__NR_gettid), fmt);
3773        }
3774        fmt = localfmt;
3775    }
3776
3777    if (use_syslog) {
3778        int priority = LOG_ERR;
3779        switch (level) {
3780        case FUSE_LOG_EMERG:
3781            priority = LOG_EMERG;
3782            break;
3783        case FUSE_LOG_ALERT:
3784            priority = LOG_ALERT;
3785            break;
3786        case FUSE_LOG_CRIT:
3787            priority = LOG_CRIT;
3788            break;
3789        case FUSE_LOG_ERR:
3790            priority = LOG_ERR;
3791            break;
3792        case FUSE_LOG_WARNING:
3793            priority = LOG_WARNING;
3794            break;
3795        case FUSE_LOG_NOTICE:
3796            priority = LOG_NOTICE;
3797            break;
3798        case FUSE_LOG_INFO:
3799            priority = LOG_INFO;
3800            break;
3801        case FUSE_LOG_DEBUG:
3802            priority = LOG_DEBUG;
3803            break;
3804        }
3805        vsyslog(priority, fmt, ap);
3806    } else {
3807        vfprintf(stderr, fmt, ap);
3808    }
3809}
3810
3811static void setup_root(struct lo_data *lo, struct lo_inode *root)
3812{
3813    int fd, res;
3814    struct stat stat;
3815    uint64_t mnt_id;
3816
3817    fd = open("/", O_PATH);
3818    if (fd == -1) {
3819        fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source);
3820        exit(1);
3821    }
3822
3823    res = do_statx(lo, fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
3824                   &mnt_id);
3825    if (res == -1) {
3826        fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source);
3827        exit(1);
3828    }
3829
3830    root->filetype = S_IFDIR;
3831    root->fd = fd;
3832    root->key.ino = stat.st_ino;
3833    root->key.dev = stat.st_dev;
3834    root->key.mnt_id = mnt_id;
3835    root->nlookup = 2;
3836    g_atomic_int_set(&root->refcount, 2);
3837    if (lo->posix_lock) {
3838        pthread_mutex_init(&root->plock_mutex, NULL);
3839        root->posix_locks = g_hash_table_new_full(
3840            g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
3841    }
3842}
3843
3844static guint lo_key_hash(gconstpointer key)
3845{
3846    const struct lo_key *lkey = key;
3847
3848    return (guint)lkey->ino + (guint)lkey->dev + (guint)lkey->mnt_id;
3849}
3850
3851static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
3852{
3853    const struct lo_key *la = a;
3854    const struct lo_key *lb = b;
3855
3856    return la->ino == lb->ino && la->dev == lb->dev && la->mnt_id == lb->mnt_id;
3857}
3858
3859static void fuse_lo_data_cleanup(struct lo_data *lo)
3860{
3861    if (lo->inodes) {
3862        g_hash_table_destroy(lo->inodes);
3863    }
3864
3865    if (lo->root.posix_locks) {
3866        g_hash_table_destroy(lo->root.posix_locks);
3867    }
3868    lo_map_destroy(&lo->fd_map);
3869    lo_map_destroy(&lo->dirp_map);
3870    lo_map_destroy(&lo->ino_map);
3871
3872    if (lo->proc_self_fd >= 0) {
3873        close(lo->proc_self_fd);
3874    }
3875
3876    if (lo->root.fd >= 0) {
3877        close(lo->root.fd);
3878    }
3879
3880    free(lo->xattrmap);
3881    free_xattrmap(lo);
3882    free(lo->xattr_security_capability);
3883    free(lo->source);
3884}
3885
3886static void qemu_version(void)
3887{
3888    printf("virtiofsd version " QEMU_FULL_VERSION "\n" QEMU_COPYRIGHT "\n");
3889}
3890
/*
 * Program entry point.
 *
 * Parses the command line, builds the lo_data state, creates and mounts the
 * FUSE session, sandboxes the process and then serves requests in
 * virtio_loop() until it returns.
 *
 * Returns 0 when ret ends up 0 (help/version/capabilities output, or
 * virtio_loop() returning 0) and 1 otherwise.  Several validation failures
 * call exit(1) directly instead of going through the cleanup labels.
 */
int main(int argc, char *argv[])
{
    struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
    struct fuse_session *se;
    struct fuse_cmdline_opts opts;
    struct lo_data lo = {
        .sandbox = SANDBOX_NAMESPACE,
        .debug = 0,
        .writeback = 0,
        .posix_lock = 0,
        .allow_direct_io = 0,
        .proc_self_fd = -1,
        .user_killpriv_v2 = -1,
        .user_posix_acl = -1,
    };
    struct lo_map_elem *root_elem;
    struct lo_map_elem *reserve_elem;
    /* Pessimistic default: only explicit success paths set ret to 0 */
    int ret = -1;

    /* Initialize time conversion information for localtime_r(). */
    tzset();

    /* Don't mask creation mode, kernel already did that */
    umask(0);

    qemu_init_exec_dir(argv[0]);

    pthread_mutex_init(&lo.mutex, NULL);
    lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
    /* -1 marks the root fd as not yet opened (see fuse_lo_data_cleanup) */
    lo.root.fd = -1;
    lo.root.fuse_ino = FUSE_ROOT_ID;
    lo.cache = CACHE_AUTO;

    /*
     * Set up the ino map like this:
     * [0] Reserved (will not be used)
     * [1] Root inode
     */
    lo_map_init(&lo.ino_map);
    reserve_elem = lo_map_reserve(&lo.ino_map, 0);
    if (!reserve_elem) {
        fuse_log(FUSE_LOG_ERR, "failed to alloc reserve_elem.\n");
        goto err_out1;
    }
    reserve_elem->in_use = false;
    root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino);
    if (!root_elem) {
        fuse_log(FUSE_LOG_ERR, "failed to alloc root_elem.\n");
        goto err_out1;
    }
    root_elem->inode = &lo.root;

    lo_map_init(&lo.dirp_map);
    lo_map_init(&lo.fd_map);

    if (fuse_parse_cmdline(&args, &opts) != 0) {
        goto err_out1;
    }
    /* From here on fuse_log() output is routed through log_func() */
    fuse_set_log_func(log_func);
    use_syslog = opts.syslog;
    if (use_syslog) {
        openlog("virtiofsd", LOG_PID, LOG_DAEMON);
    }

    /* Informational options: print, report success and skip the session */
    if (opts.show_help) {
        printf("usage: %s [options]\n\n", argv[0]);
        fuse_cmdline_help();
        printf("    -o source=PATH             shared directory tree\n");
        fuse_lowlevel_help();
        ret = 0;
        goto err_out1;
    } else if (opts.show_version) {
        qemu_version();
        fuse_lowlevel_version();
        ret = 0;
        goto err_out1;
    } else if (opts.print_capabilities) {
        print_capabilities();
        ret = 0;
        goto err_out1;
    }

    /* Parse the passthrough-specific options (lo_opts) into lo */
    if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) {
        goto err_out1;
    }

    if (opts.log_level != 0) {
        current_log_level = opts.log_level;
    } else {
        /* default log level is INFO */
        current_log_level = FUSE_LOG_INFO;
    }
    /* The debug option overrides any explicitly requested log level */
    lo.debug = opts.debug;
    if (lo.debug) {
        current_log_level = FUSE_LOG_DEBUG;
    }
    if (lo.source) {
        struct stat stat;
        int res;

        /* lstat() does not follow symlinks, so a symlink source is rejected */
        res = lstat(lo.source, &stat);
        if (res == -1) {
            fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n",
                     lo.source);
            exit(1);
        }
        if (!S_ISDIR(stat.st_mode)) {
            fuse_log(FUSE_LOG_ERR, "source is not a directory\n");
            exit(1);
        }
    } else {
        /* No source given: share the whole root file system */
        lo.source = strdup("/");
        if (!lo.source) {
            fuse_log(FUSE_LOG_ERR, "failed to strdup source\n");
            goto err_out1;
        }
    }

    /* Supplying an xattrmap implicitly turns xattr support on */
    if (lo.xattrmap) {
        lo.xattr = 1;
        parse_xattrmap(&lo);
    }

    /* Without an explicit timeout, derive it from the cache mode */
    if (!lo.timeout_set) {
        switch (lo.cache) {
        case CACHE_NONE:
            lo.timeout = 0.0;
            break;

        case CACHE_AUTO:
            lo.timeout = 1.0;
            break;

        case CACHE_ALWAYS:
            lo.timeout = 86400.0;
            break;
        }
    } else if (lo.timeout < 0) {
        fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout);
        exit(1);
    }

    if (lo.user_posix_acl == 1 && !lo.xattr) {
        fuse_log(FUSE_LOG_ERR, "Can't enable posix ACLs. xattrs are disabled."
                 "\n");
        exit(1);
    }

    lo.use_statx = true;

    se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo);
    if (se == NULL) {
        goto err_out1;
    }

    if (fuse_set_signal_handlers(se) != 0) {
        goto err_out2;
    }

    if (fuse_session_mount(se) != 0) {
        goto err_out3;
    }

    fuse_daemonize(opts.foreground);

    setup_nofile_rlimit(opts.rlimit_nofile);

    /* Must be before sandbox since it wants /proc */
    setup_capng();

    setup_sandbox(&lo, se, opts.syslog);

    /* "/" now refers to the shared directory, so the root inode can be set */
    setup_root(&lo, &lo.root);
    /* Block until ctrl+c or fusermount -u */
    ret = virtio_loop(se);

    fuse_session_unmount(se);
    cleanup_capng();
    /* Cleanup ladder: each label undoes one setup step, in reverse order */
err_out3:
    fuse_remove_signal_handlers(se);
err_out2:
    fuse_session_destroy(se);
err_out1:
    fuse_opt_free_args(&args);

    fuse_lo_data_cleanup(&lo);

    /* Collapse any non-zero ret into exit status 1 */
    return ret ? 1 : 0;
}
4080