qemu/aio-posix.c
/*
 * QEMU aio implementation
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu-common.h"
#include "block/block.h"
#include "qemu/queue.h"
#include "qemu/sockets.h"

struct AioHandler
{
    GPollFD pfd;                  /* fd and event mask handed to poll() */
    IOHandler *io_read;           /* called when pfd is readable, may be NULL */
    IOHandler *io_write;          /* called when pfd is writable, may be NULL */
    int deleted;                  /* removal deferred while the list is walked */
    void *opaque;                 /* argument passed to the callbacks */
    QLIST_ENTRY(AioHandler) node; /* link in ctx->aio_handlers */
};

static AioHandler *find_aio_handler(AioContext *ctx, int fd)
{
    AioHandler *node;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        if (node->pfd.fd == fd && !node->deleted) {
            return node;
        }
    }

    return NULL;
}

void aio_set_fd_handler(AioContext *ctx,
                        int fd,
                        IOHandler *io_read,
                        IOHandler *io_write,
                        void *opaque)
{
    AioHandler *node;

    node = find_aio_handler(ctx, fd);

    /* Are we deleting the fd handler? */
    if (!io_read && !io_write) {
        if (node) {
            g_source_remove_poll(&ctx->source, &node->pfd);

            /* If the handler list is being walked, just mark the node as
             * deleted; aio_dispatch() frees it once the walk is over.
             */
            if (ctx->walking_handlers) {
                node->deleted = 1;
                node->pfd.revents = 0;
            } else {
                /* Otherwise, delete it for real.  We can't just mark it as
                 * deleted: deleted nodes are only cleaned up during a handler
                 * walk, once the walking_handlers counter drops back to zero.
                 */
                QLIST_REMOVE(node, node);
                g_free(node);
            }
        }
    } else {
        if (node == NULL) {
            /* Alloc and insert if it's not already there */
            node = g_new0(AioHandler, 1);
            node->pfd.fd = fd;
            QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);

            g_source_add_poll(&ctx->source, &node->pfd);
        }
        /* Update handler with latest information */
        node->io_read = io_read;
        node->io_write = io_write;
        node->opaque = opaque;

        node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
        node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
    }

    aio_notify(ctx);
}

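/* Illustrative sketch, not part of the original file: a caller registers a
 * read handler for a file descriptor and later removes it by passing NULL
 * callbacks.  "my_read_ready" and "my_state" are hypothetical names used
 * only for this example.
 *
 *     aio_set_fd_handler(ctx, fd, my_read_ready, NULL, my_state);
 *     ...
 *     aio_set_fd_handler(ctx, fd, NULL, NULL, NULL);
 *
 * Calling it again with non-NULL callbacks for an fd that is already
 * registered simply updates the callbacks and opaque pointer in place.
 */
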
void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *notifier,
                            EventNotifierHandler *io_read)
{
    aio_set_fd_handler(ctx, event_notifier_get_fd(notifier),
                       (IOHandler *)io_read, NULL, notifier);
}

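/* Illustrative sketch with hypothetical names: because the notifier itself
 * is passed as the opaque argument above, the EventNotifierHandler receives
 * the EventNotifier and can test and clear the notification before doing
 * work.
 *
 *     static void my_notifier_cb(EventNotifier *e)
 *     {
 *         if (event_notifier_test_and_clear(e)) {
 *             // react to the event
 *         }
 *     }
 *
 *     aio_set_event_notifier(ctx, &my_notifier, my_notifier_cb);
 */
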
/* Nothing needs to happen before polling on POSIX hosts; this hook exists
 * for the benefit of other platforms' implementations.
 */
bool aio_prepare(AioContext *ctx)
{
    return false;
}

bool aio_pending(AioContext *ctx)
{
    AioHandler *node;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        int revents;

        revents = node->pfd.revents & node->pfd.events;
        if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read) {
            return true;
        }
        if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write) {
            return true;
        }
    }

    return false;
}

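/* aio_pending() is the check-phase counterpart of aio_dispatch(): it looks
 * at the revents recorded by the most recent poll and reports whether any
 * registered callback would fire.  In QEMU's glib integration the
 * AioContext GSource uses it to decide whether a dispatch pass is needed.
 */
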
bool aio_dispatch(AioContext *ctx)
{
    AioHandler *node;
    bool progress = false;

    /*
     * If there are callbacks left that have been queued, we need to call them.
     * Do not poll in this case, because it is possible that the caller does
     * not need a complete flush (as is the case for aio_poll loops).
     */
    if (aio_bh_poll(ctx)) {
        progress = true;
    }

    /*
     * We have to walk very carefully in case aio_set_fd_handler is
     * called while we're walking.
     */
    node = QLIST_FIRST(&ctx->aio_handlers);
    while (node) {
        AioHandler *tmp;
        int revents;

        ctx->walking_handlers++;

        revents = node->pfd.revents & node->pfd.events;
        node->pfd.revents = 0;

        if (!node->deleted &&
            (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
            node->io_read) {
            node->io_read(node->opaque);

            /* aio_notify() does not count as progress */
            if (node->opaque != &ctx->notifier) {
                progress = true;
            }
        }
        if (!node->deleted &&
            (revents & (G_IO_OUT | G_IO_ERR)) &&
            node->io_write) {
            node->io_write(node->opaque);
            progress = true;
        }

        tmp = node;
        node = QLIST_NEXT(node, node);

        ctx->walking_handlers--;

        /* Free nodes whose removal was deferred during the walk */
        if (!ctx->walking_handlers && tmp->deleted) {
            QLIST_REMOVE(tmp, node);
            g_free(tmp);
        }
    }

    /* Run our timers */
    progress |= timerlistgroup_run_timers(&ctx->tlg);

    return progress;
}

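/* Note that aio_poll() is not the only caller: in the main-loop (glib)
 * integration the AioContext GSource's dispatch callback also runs
 * aio_dispatch(), so fd handlers, bottom halves and timers behave the
 * same under either event loop.
 */
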
/* These thread-local variables are used only in a small part of aio_poll
 * around the call to the poll() system call.  In particular they are not
 * used while aio_poll is performing callbacks, which makes it much easier
 * to think about reentrancy!
 *
 * Stack-allocated arrays would be perfect but they have size limitations;
 * heap allocation is expensive enough that we want to reuse arrays across
 * calls to aio_poll().  And because poll() has to be called without holding
 * any lock, the arrays cannot be stored in AioContext.  Thread-local data
 * has none of the disadvantages of these three options.
 */
static __thread GPollFD *pollfds;
static __thread AioHandler **nodes;
static __thread unsigned npfd, nalloc;
static __thread Notifier pollfds_cleanup_notifier;

/* Registered with qemu_thread_atexit_add() in add_pollfd(), so that each
 * thread frees its own poll arrays on exit.
 */
static void pollfds_cleanup(Notifier *n, void *unused)
{
    g_assert(npfd == 0);
    g_free(pollfds);
    g_free(nodes);
    nalloc = 0;
}

/* Append node to this thread's poll arrays, growing them geometrically so
 * the allocation cost is amortized across aio_poll() calls.
 */
static void add_pollfd(AioHandler *node)
{
    if (npfd == nalloc) {
        if (nalloc == 0) {
            pollfds_cleanup_notifier.notify = pollfds_cleanup;
            qemu_thread_atexit_add(&pollfds_cleanup_notifier);
            nalloc = 8;
        } else {
            g_assert(nalloc <= INT_MAX);
            nalloc *= 2;
        }
        pollfds = g_renew(GPollFD, pollfds, nalloc);
        nodes = g_renew(AioHandler *, nodes, nalloc);
    }
    nodes[npfd] = node;
    pollfds[npfd] = (GPollFD) {
        .fd = node->pfd.fd,
        .events = node->pfd.events,
    };
    npfd++;
}

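/* pollfds[] and nodes[] are parallel arrays: after qemu_poll_ns() returns,
 * aio_poll() copies each pollfds[i].revents back into nodes[i]->pfd.revents
 * so that aio_pending() and aio_dispatch() can see the poll results.
 */
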
bool aio_poll(AioContext *ctx, bool blocking)
{
    AioHandler *node;
    unsigned i;
    int ret;
    bool progress;
    int64_t timeout;

    aio_context_acquire(ctx);
    progress = false;

    /* aio_notify can avoid the expensive event_notifier_set if
     * everything (file descriptors, bottom halves, timers) will
     * be re-evaluated before the next blocking poll().  This is
     * already true when aio_poll is called with blocking == false;
     * if blocking == true, it is only true after poll() returns,
     * so disable the optimization now.
     */
    if (blocking) {
        atomic_add(&ctx->notify_me, 2);
    }

    ctx->walking_handlers++;

    assert(npfd == 0);

    /* fill pollfds */
    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        if (!node->deleted && node->pfd.events) {
            add_pollfd(node);
        }
    }

    timeout = blocking ? aio_compute_timeout(ctx) : 0;

    /* wait until next event */
    if (timeout) {
        aio_context_release(ctx);
    }
    ret = qemu_poll_ns(pollfds, npfd, timeout);
    if (blocking) {
        atomic_sub(&ctx->notify_me, 2);
    }
    if (timeout) {
        aio_context_acquire(ctx);
    }

    aio_notify_accept(ctx);

    /* if we have any readable fds, dispatch event */
    if (ret > 0) {
        for (i = 0; i < npfd; i++) {
            nodes[i]->pfd.revents = pollfds[i].revents;
        }
    }

    npfd = 0;
    ctx->walking_handlers--;

    /* Run dispatch even if there were no readable fds to run timers */
    if (aio_dispatch(ctx)) {
        progress = true;
    }

    aio_context_release(ctx);

    return progress;
}
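
/* Illustrative sketch, not part of the original file: callers typically
 * spin on aio_poll() until a completion flag set by one of the registered
 * callbacks flips.  "done" is a hypothetical variable for this example.
 *
 *     while (!done) {
 *         aio_poll(ctx, true);
 *     }
 *
 * With blocking == true the thread sleeps inside qemu_poll_ns() until an
 * fd becomes ready, a timer expires, or aio_notify() wakes it up.
 */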