qemu/tools/virtiofsd/fuse_virtio.c
<<
>>
Prefs
/*
 * virtio-fs glue for FUSE
 * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *   Dave Gilbert  <dgilbert@redhat.com>
 *
 * Implements the glue between libfuse and libvhost-user
 *
 * This program can be distributed under the terms of the GNU LGPLv2.
 * See the file COPYING.LIB
 */
  13
  14#include "qemu/osdep.h"
  15#include "qemu/iov.h"
  16#include "qapi/error.h"
  17#include "fuse_i.h"
  18#include "standard-headers/linux/fuse.h"
  19#include "fuse_misc.h"
  20#include "fuse_opt.h"
  21#include "fuse_virtio.h"
  22
  23#include <assert.h>
  24#include <errno.h>
  25#include <glib.h>
  26#include <stdint.h>
  27#include <stdio.h>
  28#include <stdlib.h>
  29#include <string.h>
  30#include <sys/eventfd.h>
  31#include <sys/socket.h>
  32#include <sys/types.h>
  33#include <sys/un.h>
  34#include <sys/types.h>
  35#include <grp.h>
  36#include <unistd.h>
  37
  38#include "contrib/libvhost-user/libvhost-user.h"
  39
/* Forward declaration: fv_QueueInfo points back at its owning device */
struct fv_VuDev;

/* Per-virtqueue state, allocated when a queue is started */
struct fv_QueueInfo {
    /* Thread created by fv_queue_set_started() running fv_queue_thread() */
    pthread_t thread;
    /*
     * This lock protects the VuVirtq preventing races between
     * fv_queue_thread() and fv_queue_worker().
     */
    pthread_mutex_t vq_lock;

    /* Back pointer to the device this queue belongs to */
    struct fv_VuDev *virtio_dev;

    /* Our queue index, corresponds to array position */
    int qidx;
    /* Copied from the VuVirtq's kick_fd; polled by fv_queue_thread() */
    int kick_fd;
    int kill_fd; /* For killing the thread */
};
  56
/*
 * A FUSE request
 *
 * Obtained from vu_queue_pop() in fv_queue_thread() and freed at the end of
 * fv_queue_worker().
 * NOTE(review): 'elem' presumably has to stay the first member, since the
 * vu_queue_pop() result is used directly as an FVRequest - confirm against
 * libvhost-user.
 */
typedef struct {
    VuVirtqElement elem;
    /* Channel handed to the FUSE core; used to find this request again */
    struct fuse_chan ch;

    /* Used to complete requests that involve no reply */
    bool reply_sent;
} FVRequest;
  65
/*
 * We pass the dev element into libvhost-user
 * and then use it to get back to the outer
 * container for other data.
 */
struct fv_VuDev {
    /* Embedded libvhost-user device; container_of() recovers the fv_VuDev */
    VuDev dev;
    /* The FUSE session this device serves */
    struct fuse_session *se;

    /*
     * Either handle virtqueues or vhost-user protocol messages.  Don't do
     * both at the same time since that could lead to race conditions if
     * virtqueues or memory tables change while another thread is accessing
     * them.
     *
     * The assumptions are:
     * 1. fv_queue_thread() reads/writes to virtqueues and only reads VuDev.
     * 2. virtio_loop() reads/writes virtqueues and VuDev.
     */
    pthread_rwlock_t vu_dispatch_rwlock;

    /*
     * The following pair of fields are only accessed in the main
     * virtio_loop
     */
    /* Number of slots in 'qi' */
    size_t nqueues;
    /* Array indexed by queue index; a slot is NULL while no state exists */
    struct fv_QueueInfo **qi;
};
  94
/*
 * From spec
 * NOTE(review): presumably the virtio-fs device configuration layout; it is
 * not referenced elsewhere in the visible part of this file.
 */
struct virtio_fs_config {
    char tag[36];
    uint32_t num_queues;
};
 100
 101/* Callback from libvhost-user */
 102static uint64_t fv_get_features(VuDev *dev)
 103{
 104    return 1ULL << VIRTIO_F_VERSION_1;
 105}
 106
/*
 * Callback from libvhost-user with the feature bits that were set.
 * Deliberately a no-op: nothing here depends on the negotiated features.
 */
static void fv_set_features(VuDev *dev, uint64_t features)
{
}
 111
/*
 * Callback from libvhost-user if there's a new fd we're supposed to listen
 * to, typically a queue kick?
 *
 * Not implemented: the request is only logged as a warning and no watch is
 * installed ('condition', 'cb' and 'data' are ignored).
 */
static void fv_set_watch(VuDev *dev, int fd, int condition, vu_watch_cb cb,
                         void *data)
{
    fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd);
}
 121
/*
 * Callback from libvhost-user if we're no longer supposed to listen on an fd
 *
 * Not implemented: the request is only logged as a warning.
 */
static void fv_remove_watch(VuDev *dev, int fd)
{
    fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd);
}
 129
/*
 * Callback from libvhost-user to panic
 *
 * Logs 'err' and terminates the whole process with EXIT_FAILURE; this
 * function does not return.
 */
static void fv_panic(VuDev *dev, const char *err)
{
    fuse_log(FUSE_LOG_ERR, "%s: libvhost-user: %s\n", __func__, err);
    /* TODO: Allow reconnects?? */
    exit(EXIT_FAILURE);
}
 137
 138/*
 139 * Copy from an iovec into a fuse_buf (memory only)
 140 * Caller must ensure there is space
 141 */
 142static void copy_from_iov(struct fuse_buf *buf, size_t out_num,
 143                          const struct iovec *out_sg)
 144{
 145    void *dest = buf->mem;
 146
 147    while (out_num) {
 148        size_t onelen = out_sg->iov_len;
 149        memcpy(dest, out_sg->iov_base, onelen);
 150        dest += onelen;
 151        out_sg++;
 152        out_num--;
 153    }
 154}
 155
 156/*
 157 * Copy from one iov to another, the given number of bytes
 158 * The caller must have checked sizes.
 159 */
 160static void copy_iov(struct iovec *src_iov, int src_count,
 161                     struct iovec *dst_iov, int dst_count, size_t to_copy)
 162{
 163    size_t dst_offset = 0;
 164    /* Outer loop copies 'src' elements */
 165    while (to_copy) {
 166        assert(src_count);
 167        size_t src_len = src_iov[0].iov_len;
 168        size_t src_offset = 0;
 169
 170        if (src_len > to_copy) {
 171            src_len = to_copy;
 172        }
 173        /* Inner loop copies contents of one 'src' to maybe multiple dst. */
 174        while (src_len) {
 175            assert(dst_count);
 176            size_t dst_len = dst_iov[0].iov_len - dst_offset;
 177            if (dst_len > src_len) {
 178                dst_len = src_len;
 179            }
 180
 181            memcpy(dst_iov[0].iov_base + dst_offset,
 182                   src_iov[0].iov_base + src_offset, dst_len);
 183            src_len -= dst_len;
 184            to_copy -= dst_len;
 185            src_offset += dst_len;
 186            dst_offset += dst_len;
 187
 188            assert(dst_offset <= dst_iov[0].iov_len);
 189            if (dst_offset == dst_iov[0].iov_len) {
 190                dst_offset = 0;
 191                dst_iov++;
 192                dst_count--;
 193            }
 194        }
 195        src_iov++;
 196        src_count--;
 197    }
 198}
 199
 200/*
 201 * Called back by ll whenever it wants to send a reply/message back
 202 * The 1st element of the iov starts with the fuse_out_header
 203 * 'unique'==0 means it's a notify message.
 204 */
 205int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch,
 206                    struct iovec *iov, int count)
 207{
 208    FVRequest *req = container_of(ch, FVRequest, ch);
 209    struct fv_QueueInfo *qi = ch->qi;
 210    VuDev *dev = &se->virtio_dev->dev;
 211    VuVirtq *q = vu_get_queue(dev, qi->qidx);
 212    VuVirtqElement *elem = &req->elem;
 213    int ret = 0;
 214
 215    assert(count >= 1);
 216    assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
 217
 218    struct fuse_out_header *out = iov[0].iov_base;
 219    /* TODO: Endianness! */
 220
 221    size_t tosend_len = iov_size(iov, count);
 222
 223    /* unique == 0 is notification, which we don't support */
 224    assert(out->unique);
 225    assert(!req->reply_sent);
 226
 227    /* The 'in' part of the elem is to qemu */
 228    unsigned int in_num = elem->in_num;
 229    struct iovec *in_sg = elem->in_sg;
 230    size_t in_len = iov_size(in_sg, in_num);
 231    fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
 232             __func__, elem->index, in_num, in_len);
 233
 234    /*
 235     * The elem should have room for a 'fuse_out_header' (out from fuse)
 236     * plus the data based on the len in the header.
 237     */
 238    if (in_len < sizeof(struct fuse_out_header)) {
 239        fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
 240                 __func__, elem->index);
 241        ret = -E2BIG;
 242        goto err;
 243    }
 244    if (in_len < tosend_len) {
 245        fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
 246                 __func__, elem->index, tosend_len);
 247        ret = -E2BIG;
 248        goto err;
 249    }
 250
 251    copy_iov(iov, count, in_sg, in_num, tosend_len);
 252
 253    pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
 254    pthread_mutex_lock(&qi->vq_lock);
 255    vu_queue_push(dev, q, elem, tosend_len);
 256    vu_queue_notify(dev, q);
 257    pthread_mutex_unlock(&qi->vq_lock);
 258    pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
 259
 260    req->reply_sent = true;
 261
 262err:
 263    return ret;
 264}
 265
/*
 * Callback from fuse_send_data_iov_* when it's virtio and the buffer
 * is a single FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK
 * We need to send the iov and then the buffer.
 *
 * 'iov'/'count' carry the reply header(s). 'len' is the number of data bytes
 * expected from buf->buf[0].fd, read starting at buf->buf[0].pos (which is
 * advanced whenever a read comes back short). The data is read with preadv()
 * straight into the 'in' descriptors of the request's virtqueue element.
 *
 * Return 0 on success, otherwise a positive errno value (E2BIG, EIO or the
 * errno left by preadv). Note the sign differs from virtio_send_msg(), which
 * returns -E2BIG.
 */
int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
                         struct iovec *iov, int count, struct fuse_bufvec *buf,
                         size_t len)
{
    FVRequest *req = container_of(ch, FVRequest, ch);
    struct fv_QueueInfo *qi = ch->qi;
    VuDev *dev = &se->virtio_dev->dev;
    VuVirtq *q = vu_get_queue(dev, qi->qidx);
    VuVirtqElement *elem = &req->elem;
    int ret = 0;

    assert(count >= 1);
    assert(iov[0].iov_len >= sizeof(struct fuse_out_header));

    struct fuse_out_header *out = iov[0].iov_base;
    /* TODO: Endianness! */

    size_t iov_len = iov_size(iov, count);
    size_t tosend_len = iov_len + len;

    /* Header length covers the header iov plus the data still to be read */
    out->len = tosend_len;

    fuse_log(FUSE_LOG_DEBUG, "%s: count=%d len=%zd iov_len=%zd\n", __func__,
             count, len, iov_len);

    /* unique == 0 is notification which we don't support */
    assert(out->unique);

    assert(!req->reply_sent);

    /* The 'in' part of the elem is to qemu */
    unsigned int in_num = elem->in_num;
    struct iovec *in_sg = elem->in_sg;
    size_t in_len = iov_size(in_sg, in_num);
    fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
             __func__, elem->index, in_num, in_len);

    /*
     * The elem should have room for a 'fuse_out_header' (out from fuse)
     * plus the data based on the len in the header.
     */
    if (in_len < sizeof(struct fuse_out_header)) {
        fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
                 __func__, elem->index);
        ret = E2BIG;
        goto err;
    }
    if (in_len < tosend_len) {
        fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
                 __func__, elem->index, tosend_len);
        ret = E2BIG;
        goto err;
    }

    /* TODO: Limit to 'len' */

    /* First copy the header data from iov->in_sg */
    copy_iov(iov, count, in_sg, in_num, iov_len);

    /*
     * Build a copy of the in_sg iov so we can skip bits in it,
     * including changing the offsets
     */
    struct iovec *in_sg_cpy = calloc(sizeof(struct iovec), in_num);
    assert(in_sg_cpy);
    memcpy(in_sg_cpy, in_sg, sizeof(struct iovec) * in_num);
    /* These get updated as we skip */
    struct iovec *in_sg_ptr = in_sg_cpy;
    int in_sg_cpy_count = in_num;

    /* skip over parts of in_sg that contained the header iov */
    size_t skip_size = iov_len;

    size_t in_sg_left = 0;
    do {
        /*
         * Advance past 'skip_size' bytes: whole entries are dropped, a
         * partially consumed entry has its base/len adjusted in the copy.
         */
        while (skip_size != 0 && in_sg_cpy_count) {
            if (skip_size >= in_sg_ptr[0].iov_len) {
                skip_size -= in_sg_ptr[0].iov_len;
                in_sg_ptr++;
                in_sg_cpy_count--;
            } else {
                in_sg_ptr[0].iov_len -= skip_size;
                in_sg_ptr[0].iov_base += skip_size;
                break;
            }
        }

        /* Total space remaining in the (adjusted) destination iov */
        int i;
        for (i = 0, in_sg_left = 0; i < in_sg_cpy_count; i++) {
            in_sg_left += in_sg_ptr[i].iov_len;
        }
        fuse_log(FUSE_LOG_DEBUG,
                 "%s: after skip skip_size=%zd in_sg_cpy_count=%d "
                 "in_sg_left=%zd\n",
                 __func__, skip_size, in_sg_cpy_count, in_sg_left);
        /* Read the file data directly into the element's 'in' buffers */
        ret = preadv(buf->buf[0].fd, in_sg_ptr, in_sg_cpy_count,
                     buf->buf[0].pos);

        if (ret == -1) {
            ret = errno;
            fuse_log(FUSE_LOG_DEBUG, "%s: preadv failed (%m) len=%zd\n",
                     __func__, len);
            free(in_sg_cpy);
            goto err;
        }
        fuse_log(FUSE_LOG_DEBUG, "%s: preadv ret=%d len=%zd\n", __func__,
                 ret, len);
        if (ret < len && ret) {
            fuse_log(FUSE_LOG_DEBUG, "%s: ret < len\n", __func__);
            /* Skip over this much next time around */
            skip_size = ret;
            buf->buf[0].pos += ret;
            len -= ret;

            /*
             * Lets do another read. 'continue' goes to the loop condition,
             * which sees in_sg_left as computed before this read.
             */
            continue;
        }
        if (!ret) {
            /* EOF case? */
            fuse_log(FUSE_LOG_DEBUG, "%s: !ret in_sg_left=%zd\n", __func__,
                     in_sg_left);
            break;
        }
        /*
         * Getting here means ret >= len, so a mismatch means more than 'len'
         * bytes were read (the iov is not limited to 'len', see TODO above).
         */
        if (ret != len) {
            fuse_log(FUSE_LOG_DEBUG, "%s: ret!=len\n", __func__);
            ret = EIO;
            free(in_sg_cpy);
            goto err;
        }
        in_sg_left -= ret;
        len -= ret;
    } while (in_sg_left);
    free(in_sg_cpy);

    /* Need to fix out->len on EOF */
    if (len) {
        /* Patch the header copy already placed in the element's buffer */
        struct fuse_out_header *out_sg = in_sg[0].iov_base;

        tosend_len -= len;
        out_sg->len = tosend_len;
    }

    ret = 0;

    /* Dispatch lock (read side) first, then the per-queue lock */
    pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
    pthread_mutex_lock(&qi->vq_lock);
    vu_queue_push(dev, q, elem, tosend_len);
    vu_queue_notify(dev, q);
    pthread_mutex_unlock(&qi->vq_lock);
    pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);

err:
    /* Only a successful send marks the element as returned to the guest */
    if (ret == 0) {
        req->reply_sent = true;
    }

    return ret;
}
 430
/* Per-thread flag: set once this thread has called unshare(CLONE_FS) */
static __thread bool clone_fs_called;

/*
 * Process one FVRequest in a thread pool
 *
 * 'data' is the FVRequest pushed by fv_queue_thread(); 'user_data' is the
 * fv_QueueInfo given to g_thread_pool_new(). The request is copied out of
 * the element into a private buffer, handed to
 * fuse_session_process_buf_int(), and then the request and buffer are freed.
 * If no reply was sent, the element is pushed back with a length of 0.
 */
static void fv_queue_worker(gpointer data, gpointer user_data)
{
    struct fv_QueueInfo *qi = user_data;
    struct fuse_session *se = qi->virtio_dev->se;
    struct VuDev *dev = &qi->virtio_dev->dev;
    FVRequest *req = data;
    VuVirtqElement *elem = &req->elem;
    struct fuse_buf fbuf = {};
    bool allocated_bufv = false;
    struct fuse_bufvec bufv;
    struct fuse_bufvec *pbufv;

    assert(se->bufsize > sizeof(struct fuse_in_header));

    /* Done once per worker thread, tracked by the thread-local flag */
    if (!clone_fs_called) {
        int ret;

        /* unshare FS for xattr operation */
        ret = unshare(CLONE_FS);
        /* should not fail */
        assert(ret == 0);

        clone_fs_called = true;
    }

    /*
     * An element contains one request and the space to send our response
     * They're spread over multiple descriptors in a scatter/gather set
     * and we can't trust the guest to keep them still; so copy in/out.
     */
    fbuf.mem = malloc(se->bufsize);
    assert(fbuf.mem);

    fuse_mutex_init(&req->ch.lock);
    req->ch.fd = -1;
    /* Lets virtio_send_msg()/virtio_send_data_iov() find the queue */
    req->ch.qi = qi;

    /* The 'out' part of the elem is from qemu */
    unsigned int out_num = elem->out_num;
    struct iovec *out_sg = elem->out_sg;
    size_t out_len = iov_size(out_sg, out_num);
    fuse_log(FUSE_LOG_DEBUG,
             "%s: elem %d: with %d out desc of length %zd\n",
             __func__, elem->index, out_num, out_len);

    /*
     * The elem should contain a 'fuse_in_header' (in to fuse)
     * plus the data based on the len in the header.
     */
    if (out_len < sizeof(struct fuse_in_header)) {
        fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n",
                 __func__, elem->index);
        assert(0); /* TODO */
    }
    if (out_len > se->bufsize) {
        fuse_log(FUSE_LOG_ERR, "%s: elem %d too large for buffer\n", __func__,
                 elem->index);
        assert(0); /* TODO */
    }
    /* Copy just the first element and look at it */
    copy_from_iov(&fbuf, 1, out_sg);

    pbufv = NULL; /* Compiler thinks an uninitialised path */
    /*
     * Fast path: a FUSE_WRITE whose two headers each sit in their own
     * descriptor, followed by at least one data descriptor.
     */
    if (out_num > 2 &&
        out_sg[0].iov_len == sizeof(struct fuse_in_header) &&
        ((struct fuse_in_header *)fbuf.mem)->opcode == FUSE_WRITE &&
        out_sg[1].iov_len == sizeof(struct fuse_write_in)) {
        /*
         * For a write we don't actually need to copy the
         * data, we can just do it straight out of guest memory
         * but we must still copy the headers in case the guest
         * was nasty and changed them while we were using them.
         */
        fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n", __func__);

        /* copy the fuse_write_in header after the fuse_in_header */
        fbuf.mem += out_sg->iov_len;
        copy_from_iov(&fbuf, 1, out_sg + 1);
        /* Restore fbuf.mem to the start of the buffer */
        fbuf.mem -= out_sg->iov_len;
        fbuf.size = out_sg[0].iov_len + out_sg[1].iov_len;

        /* Allocate the bufv, with space for the rest of the iov */
        pbufv = malloc(sizeof(struct fuse_bufvec) +
                       sizeof(struct fuse_buf) * (out_num - 2));
        if (!pbufv) {
            fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n",
                    __func__);
            /* Nothing was processed; 'out' recycles the element unanswered */
            goto out;
        }

        allocated_bufv = true;
        pbufv->count = 1;
        /* First fuse_buf: the private copy of both headers */
        pbufv->buf[0] = fbuf;

        size_t iovindex, pbufvindex;
        iovindex = 2; /* 2 headers, separate iovs */
        pbufvindex = 1; /* 2 headers, 1 fusebuf */

        /* Remaining fuse_bufs point straight at the guest data descriptors */
        for (; iovindex < out_num; iovindex++, pbufvindex++) {
            pbufv->count++;
            pbufv->buf[pbufvindex].pos = ~0; /* Dummy */
            pbufv->buf[pbufvindex].flags = 0;
            pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base;
            pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len;
        }
    } else {
        /* Normal (non fast write) path */

        /* Copy the rest of the buffer */
        fbuf.mem += out_sg->iov_len;
        copy_from_iov(&fbuf, out_num - 1, out_sg + 1);
        fbuf.mem -= out_sg->iov_len;
        fbuf.size = out_len;

        /* TODO! Endianness of header */

        /* TODO: Add checks for fuse_session_exited */
        bufv.buf[0] = fbuf;
        bufv.count = 1;
        pbufv = &bufv;
    }
    pbufv->idx = 0;
    pbufv->off = 0;
    fuse_session_process_buf_int(se, pbufv, &req->ch);

out:
    if (allocated_bufv) {
        free(pbufv);
    }

    /* If the request has no reply, still recycle the virtqueue element */
    if (!req->reply_sent) {
        struct VuVirtq *q = vu_get_queue(dev, qi->qidx);

        fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", __func__,
                 elem->index);

        /* Same lock order as the reply paths: dispatch lock, then vq_lock */
        pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
        pthread_mutex_lock(&qi->vq_lock);
        vu_queue_push(dev, q, elem, 0);
        vu_queue_notify(dev, q);
        pthread_mutex_unlock(&qi->vq_lock);
        pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
    }

    pthread_mutex_destroy(&req->ch.lock);
    free(fbuf.mem);
    free(req);
}
 583
/*
 * Thread function for individual queues, created when a queue is 'started'
 *
 * 'opaque' is the queue's fv_QueueInfo. The thread waits on the queue's
 * kick_fd and kill_fd; on a kick it pops every available element and pushes
 * each one to a thread pool running fv_queue_worker(). It leaves the loop on
 * a kill event or on any poll/eventfd error, then frees the pool.
 * Always returns NULL.
 */
static void *fv_queue_thread(void *opaque)
{
    struct fv_QueueInfo *qi = opaque;
    struct VuDev *dev = &qi->virtio_dev->dev;
    struct VuVirtq *q = vu_get_queue(dev, qi->qidx);
    struct fuse_session *se = qi->virtio_dev->se;
    GThreadPool *pool;

    pool = g_thread_pool_new(fv_queue_worker, qi, se->thread_pool_size, FALSE,
                             NULL);
    if (!pool) {
        fuse_log(FUSE_LOG_ERR, "%s: g_thread_pool_new failed\n", __func__);
        return NULL;
    }

    fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__,
             qi->qidx, qi->kick_fd);
    while (1) {
        struct pollfd pf[2];
        int ret;

        /* pf[0]: queue kick, pf[1]: kill request */
        pf[0].fd = qi->kick_fd;
        pf[0].events = POLLIN;
        pf[0].revents = 0;
        pf[1].fd = qi->kill_fd;
        pf[1].events = POLLIN;
        pf[1].revents = 0;

        fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for Queue %d event\n", __func__,
                 qi->qidx);
        /* No timeout and no signal mask: block until one fd has an event */
        int poll_res = ppoll(pf, 2, NULL, NULL);

        if (poll_res == -1) {
            if (errno == EINTR) {
                fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n",
                         __func__);
                continue;
            }
            fuse_log(FUSE_LOG_ERR, "fv_queue_thread ppoll: %m\n");
            break;
        }
        assert(poll_res >= 1);
        if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) {
            fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x Queue %d\n",
                     __func__, pf[0].revents, qi->qidx);
            break;
        }
        if (pf[1].revents & (POLLERR | POLLHUP | POLLNVAL)) {
            fuse_log(FUSE_LOG_ERR,
                     "%s: Unexpected poll revents %x Queue %d killfd\n",
                     __func__, pf[1].revents, qi->qidx);
            break;
        }
        /* The kill fd takes priority over a pending kick */
        if (pf[1].revents) {
            fuse_log(FUSE_LOG_INFO, "%s: kill event on queue %d - quitting\n",
                     __func__, qi->qidx);
            break;
        }
        assert(pf[0].revents & POLLIN);
        fuse_log(FUSE_LOG_DEBUG, "%s: Got queue event on Queue %d\n", __func__,
                 qi->qidx);

        /* Consume the kick */
        eventfd_t evalue;
        if (eventfd_read(qi->kick_fd, &evalue)) {
            fuse_log(FUSE_LOG_ERR, "Eventfd_read for queue: %m\n");
            break;
        }
        /* Mutual exclusion with virtio_loop() */
        ret = pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
        assert(ret == 0); /* there is no possible error case */
        pthread_mutex_lock(&qi->vq_lock);
        /* out is from guest, in is to guest */
        unsigned int in_bytes, out_bytes;
        /* The byte counts are only used for the debug message below */
        vu_queue_get_avail_bytes(dev, q, &in_bytes, &out_bytes, ~0, ~0);

        fuse_log(FUSE_LOG_DEBUG,
                 "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n",
                 __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes);

        /* Drain the queue; each request is freed by fv_queue_worker() */
        while (1) {
            FVRequest *req = vu_queue_pop(dev, q, sizeof(FVRequest));
            if (!req) {
                break;
            }

            req->reply_sent = false;

            g_thread_pool_push(pool, req, NULL);
        }

        pthread_mutex_unlock(&qi->vq_lock);
        pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
    }

    g_thread_pool_free(pool, FALSE, TRUE);

    return NULL;
}
 683
 684static void fv_queue_cleanup_thread(struct fv_VuDev *vud, int qidx)
 685{
 686    int ret;
 687    struct fv_QueueInfo *ourqi;
 688
 689    assert(qidx < vud->nqueues);
 690    ourqi = vud->qi[qidx];
 691
 692    /* Kill the thread */
 693    if (eventfd_write(ourqi->kill_fd, 1)) {
 694        fuse_log(FUSE_LOG_ERR, "Eventfd_write for queue %d: %s\n",
 695                 qidx, strerror(errno));
 696    }
 697    ret = pthread_join(ourqi->thread, NULL);
 698    if (ret) {
 699        fuse_log(FUSE_LOG_ERR, "%s: Failed to join thread idx %d err %d\n",
 700                 __func__, qidx, ret);
 701    }
 702    pthread_mutex_destroy(&ourqi->vq_lock);
 703    close(ourqi->kill_fd);
 704    ourqi->kick_fd = -1;
 705    free(vud->qi[qidx]);
 706    vud->qi[qidx] = NULL;
 707}
 708
 709/* Callback from libvhost-user on start or stop of a queue */
 710static void fv_queue_set_started(VuDev *dev, int qidx, bool started)
 711{
 712    struct fv_VuDev *vud = container_of(dev, struct fv_VuDev, dev);
 713    struct fv_QueueInfo *ourqi;
 714
 715    fuse_log(FUSE_LOG_INFO, "%s: qidx=%d started=%d\n", __func__, qidx,
 716             started);
 717    assert(qidx >= 0);
 718
 719    /*
 720     * Ignore additional request queues for now.  passthrough_ll.c must be
 721     * audited for thread-safety issues first.  It was written with a
 722     * well-behaved client in mind and may not protect against all types of
 723     * races yet.
 724     */
 725    if (qidx > 1) {
 726        fuse_log(FUSE_LOG_ERR,
 727                 "%s: multiple request queues not yet implemented, please only "
 728                 "configure 1 request queue\n",
 729                 __func__);
 730        exit(EXIT_FAILURE);
 731    }
 732
 733    if (started) {
 734        /* Fire up a thread to watch this queue */
 735        if (qidx >= vud->nqueues) {
 736            vud->qi = realloc(vud->qi, (qidx + 1) * sizeof(vud->qi[0]));
 737            assert(vud->qi);
 738            memset(vud->qi + vud->nqueues, 0,
 739                   sizeof(vud->qi[0]) * (1 + (qidx - vud->nqueues)));
 740            vud->nqueues = qidx + 1;
 741        }
 742        if (!vud->qi[qidx]) {
 743            vud->qi[qidx] = calloc(sizeof(struct fv_QueueInfo), 1);
 744            assert(vud->qi[qidx]);
 745            vud->qi[qidx]->virtio_dev = vud;
 746            vud->qi[qidx]->qidx = qidx;
 747        } else {
 748            /* Shouldn't have been started */
 749            assert(vud->qi[qidx]->kick_fd == -1);
 750        }
 751        ourqi = vud->qi[qidx];
 752        ourqi->kick_fd = dev->vq[qidx].kick_fd;
 753
 754        ourqi->kill_fd = eventfd(0, EFD_CLOEXEC | EFD_SEMAPHORE);
 755        assert(ourqi->kill_fd != -1);
 756        pthread_mutex_init(&ourqi->vq_lock, NULL);
 757
 758        if (pthread_create(&ourqi->thread, NULL, fv_queue_thread, ourqi)) {
 759            fuse_log(FUSE_LOG_ERR, "%s: Failed to create thread for queue %d\n",
 760                     __func__, qidx);
 761            assert(0);
 762        }
 763    } else {
 764        fv_queue_cleanup_thread(vud, qidx);
 765    }
 766}
 767
/*
 * Callback from libvhost-user (queue_is_processed_in_order in fv_iface):
 * always answers false, for every queue.
 * NOTE(review): presumably because requests are handed to a thread pool and
 * may complete out of order - confirm.
 */
static bool fv_queue_order(VuDev *dev, int qidx)
{
    return false;
}
 772
/* Callback table handed to libvhost-user via vu_init() */
static const VuDevIface fv_iface = {
    .get_features = fv_get_features,
    .set_features = fv_set_features,

    /* Don't need process message, we've not got any at vhost-user level */
    .queue_set_started = fv_queue_set_started,

    .queue_is_processed_in_order = fv_queue_order,
};
 782
 783/*
 784 * Main loop; this mostly deals with events on the vhost-user
 785 * socket itself, and not actual fuse data.
 786 */
 787int virtio_loop(struct fuse_session *se)
 788{
 789    fuse_log(FUSE_LOG_INFO, "%s: Entry\n", __func__);
 790
 791    while (!fuse_session_exited(se)) {
 792        struct pollfd pf[1];
 793        bool ok;
 794        int ret;
 795        pf[0].fd = se->vu_socketfd;
 796        pf[0].events = POLLIN;
 797        pf[0].revents = 0;
 798
 799        fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for VU event\n", __func__);
 800        int poll_res = ppoll(pf, 1, NULL, NULL);
 801
 802        if (poll_res == -1) {
 803            if (errno == EINTR) {
 804                fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n",
 805                         __func__);
 806                continue;
 807            }
 808            fuse_log(FUSE_LOG_ERR, "virtio_loop ppoll: %m\n");
 809            break;
 810        }
 811        assert(poll_res == 1);
 812        if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) {
 813            fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x\n", __func__,
 814                     pf[0].revents);
 815            break;
 816        }
 817        assert(pf[0].revents & POLLIN);
 818        fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__);
 819        /* Mutual exclusion with fv_queue_thread() */
 820        ret = pthread_rwlock_wrlock(&se->virtio_dev->vu_dispatch_rwlock);
 821        assert(ret == 0); /* there is no possible error case */
 822
 823        ok = vu_dispatch(&se->virtio_dev->dev);
 824
 825        pthread_rwlock_unlock(&se->virtio_dev->vu_dispatch_rwlock);
 826
 827        if (!ok) {
 828            fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__);
 829            break;
 830        }
 831    }
 832
 833    /*
 834     * Make sure all fv_queue_thread()s quit on exit, as we're about to
 835     * free virtio dev and fuse session, no one should access them anymore.
 836     */
 837    for (int i = 0; i < se->virtio_dev->nqueues; i++) {
 838        if (!se->virtio_dev->qi[i]) {
 839            continue;
 840        }
 841
 842        fuse_log(FUSE_LOG_INFO, "%s: Stopping queue %d thread\n", __func__, i);
 843        fv_queue_cleanup_thread(se->virtio_dev, i);
 844    }
 845
 846    fuse_log(FUSE_LOG_INFO, "%s: Exit\n", __func__);
 847
 848    return 0;
 849}
 850
 851static void strreplace(char *s, char old, char new)
 852{
 853    for (; *s; ++s) {
 854        if (*s == old) {
 855            *s = new;
 856        }
 857    }
 858}
 859
 860static bool fv_socket_lock(struct fuse_session *se)
 861{
 862    g_autofree gchar *sk_name = NULL;
 863    g_autofree gchar *pidfile = NULL;
 864    g_autofree gchar *dir = NULL;
 865    Error *local_err = NULL;
 866
 867    dir = qemu_get_local_state_pathname("run/virtiofsd");
 868
 869    if (g_mkdir_with_parents(dir, S_IRWXU) < 0) {
 870        fuse_log(FUSE_LOG_ERR, "%s: Failed to create directory %s: %s",
 871                 __func__, dir, strerror(errno));
 872        return false;
 873    }
 874
 875    sk_name = g_strdup(se->vu_socket_path);
 876    strreplace(sk_name, '/', '.');
 877    pidfile = g_strdup_printf("%s/%s.pid", dir, sk_name);
 878
 879    if (!qemu_write_pidfile(pidfile, &local_err)) {
 880        error_report_err(local_err);
 881        return false;
 882    }
 883
 884    return true;
 885}
 886
 887static int fv_create_listen_socket(struct fuse_session *se)
 888{
 889    struct sockaddr_un un;
 890    mode_t old_umask;
 891
 892    /* Nothing to do if fd is already initialized */
 893    if (se->vu_listen_fd >= 0) {
 894        return 0;
 895    }
 896
 897    if (strlen(se->vu_socket_path) >= sizeof(un.sun_path)) {
 898        fuse_log(FUSE_LOG_ERR, "Socket path too long\n");
 899        return -1;
 900    }
 901
 902    if (!strlen(se->vu_socket_path)) {
 903        fuse_log(FUSE_LOG_ERR, "Socket path is empty\n");
 904        return -1;
 905    }
 906
 907    /* Check the vu_socket_path is already used */
 908    if (!fv_socket_lock(se)) {
 909        return -1;
 910    }
 911
 912    /*
 913     * Create the Unix socket to communicate with qemu
 914     * based on QEMU's vhost-user-bridge
 915     */
 916    unlink(se->vu_socket_path);
 917    strcpy(un.sun_path, se->vu_socket_path);
 918    size_t addr_len = sizeof(un);
 919
 920    int listen_sock = socket(AF_UNIX, SOCK_STREAM, 0);
 921    if (listen_sock == -1) {
 922        fuse_log(FUSE_LOG_ERR, "vhost socket creation: %m\n");
 923        return -1;
 924    }
 925    un.sun_family = AF_UNIX;
 926
 927    /*
 928     * Unfortunately bind doesn't let you set the mask on the socket,
 929     * so set umask appropriately and restore it later.
 930     */
 931    if (se->vu_socket_group) {
 932        old_umask = umask(S_IROTH | S_IWOTH | S_IXOTH);
 933    } else {
 934        old_umask = umask(S_IRGRP | S_IWGRP | S_IXGRP |
 935                          S_IROTH | S_IWOTH | S_IXOTH);
 936    }
 937    if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) {
 938        fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n");
 939        close(listen_sock);
 940        umask(old_umask);
 941        return -1;
 942    }
 943    if (se->vu_socket_group) {
 944        struct group *g = getgrnam(se->vu_socket_group);
 945        if (g) {
 946            if (!chown(se->vu_socket_path, -1, g->gr_gid)) {
 947                fuse_log(FUSE_LOG_WARNING,
 948                         "vhost socket failed to set group to %s (%d)\n",
 949                         se->vu_socket_group, g->gr_gid);
 950            }
 951        }
 952    }
 953    umask(old_umask);
 954
 955    if (listen(listen_sock, 1) == -1) {
 956        fuse_log(FUSE_LOG_ERR, "vhost socket listen: %m\n");
 957        close(listen_sock);
 958        return -1;
 959    }
 960
 961    se->vu_listen_fd = listen_sock;
 962    return 0;
 963}
 964
 965int virtio_session_mount(struct fuse_session *se)
 966{
 967    int ret;
 968
 969    /*
 970     * Test that unshare(CLONE_FS) works. fv_queue_worker() will need it. It's
 971     * an unprivileged system call but some Docker/Moby versions are known to
 972     * reject it via seccomp when CAP_SYS_ADMIN is not given.
 973     *
 974     * Note that the program is single-threaded here so this syscall has no
 975     * visible effect and is safe to make.
 976     */
 977    ret = unshare(CLONE_FS);
 978    if (ret == -1 && errno == EPERM) {
 979        fuse_log(FUSE_LOG_ERR, "unshare(CLONE_FS) failed with EPERM. If "
 980                "running in a container please check that the container "
 981                "runtime seccomp policy allows unshare.\n");
 982        return -1;
 983    }
 984
 985    ret = fv_create_listen_socket(se);
 986    if (ret < 0) {
 987        return ret;
 988    }
 989
 990    se->fd = -1;
 991
 992    fuse_log(FUSE_LOG_INFO, "%s: Waiting for vhost-user socket connection...\n",
 993             __func__);
 994    int data_sock = accept(se->vu_listen_fd, NULL, NULL);
 995    if (data_sock == -1) {
 996        fuse_log(FUSE_LOG_ERR, "vhost socket accept: %m\n");
 997        close(se->vu_listen_fd);
 998        return -1;
 999    }
1000    close(se->vu_listen_fd);
1001    se->vu_listen_fd = -1;
1002    fuse_log(FUSE_LOG_INFO, "%s: Received vhost-user socket connection\n",
1003             __func__);
1004
1005    /* TODO: Some cleanup/deallocation! */
1006    se->virtio_dev = calloc(sizeof(struct fv_VuDev), 1);
1007    if (!se->virtio_dev) {
1008        fuse_log(FUSE_LOG_ERR, "%s: virtio_dev calloc failed\n", __func__);
1009        close(data_sock);
1010        return -1;
1011    }
1012
1013    se->vu_socketfd = data_sock;
1014    se->virtio_dev->se = se;
1015    pthread_rwlock_init(&se->virtio_dev->vu_dispatch_rwlock, NULL);
1016    if (!vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, NULL,
1017                 fv_set_watch, fv_remove_watch, &fv_iface)) {
1018        fuse_log(FUSE_LOG_ERR, "%s: vu_init failed\n", __func__);
1019        return -1;
1020    }
1021
1022    return 0;
1023}
1024
1025void virtio_session_close(struct fuse_session *se)
1026{
1027    close(se->vu_socketfd);
1028
1029    if (!se->virtio_dev) {
1030        return;
1031    }
1032
1033    free(se->virtio_dev->qi);
1034    pthread_rwlock_destroy(&se->virtio_dev->vu_dispatch_rwlock);
1035    free(se->virtio_dev);
1036    se->virtio_dev = NULL;
1037}
1038