qemu/block/nbd.c
/*
 * QEMU Block driver for NBD
 *
 * Copyright (c) 2019 Virtuozzo International GmbH.
 * Copyright (C) 2016 Red Hat, Inc.
 * Copyright (C) 2008 Bull S.A.S.
 *     Author: Laurent Vivier <Laurent.Vivier@bull.net>
 *
 * Some parts:
 *    Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qemu/uri.h"
#include "qemu/option.h"
#include "qemu/cutils.h"
#include "qemu/main-loop.h"
#include "qemu/atomic.h"

#include "qapi/qapi-visit-sockets.h"
#include "qapi/qmp/qstring.h"
#include "qapi/clone-visitor.h"

#include "block/qdict.h"
#include "block/nbd.h"
#include "block/block_int.h"

#include "qemu/yank.h"

#define EN_OPTSTR ":exportname="
#define MAX_NBD_REQUESTS    16

#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ (uint64_t)(intptr_t)(bs))
#define INDEX_TO_HANDLE(bs, index)  ((index)  ^ (uint64_t)(intptr_t)(bs))
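
/*
 * Request handles sent to the server encode the request slot index XORed
 * with the BDS pointer.  XOR is its own inverse, so
 * HANDLE_TO_INDEX(bs, INDEX_TO_HANDLE(bs, i)) == i, and mixing in the
 * pointer makes a stale or foreign handle likely to decode to an
 * out-of-range index, which nbd_connection_entry() rejects.
 */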

typedef struct {
    Coroutine *coroutine;
    uint64_t offset;        /* original offset of the request */
    bool receiving;         /* waiting for connection_co? */
} NBDClientRequest;

typedef enum NBDClientState {
    NBD_CLIENT_CONNECTING_WAIT,
    NBD_CLIENT_CONNECTING_NOWAIT,
    NBD_CLIENT_CONNECTED,
    NBD_CLIENT_QUIT
} NBDClientState;
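
/*
 * In terms of request submission (see nbd_co_send_request()):
 * CONNECTING_WAIT queues new requests on free_sema until the reconnect
 * succeeds or the reconnect delay expires; CONNECTING_NOWAIT and QUIT
 * fail them with -EIO; only CONNECTED lets them through.
 */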

typedef enum NBDConnectThreadState {
    /* No thread, no pending results */
    CONNECT_THREAD_NONE,

    /* Thread is running, no results for now */
    CONNECT_THREAD_RUNNING,

    /*
     * Thread is running, but the requestor exited. The thread should
     * close the new socket and free the connect state on exit.
     */
    CONNECT_THREAD_RUNNING_DETACHED,

    /* Thread finished, results are stored in the state */
    CONNECT_THREAD_FAIL,
    CONNECT_THREAD_SUCCESS
} NBDConnectThreadState;
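
/*
 * Transitions (all made under NBDConnectThread.mutex): NONE/FAIL ->
 * RUNNING when a new thread is spawned; RUNNING -> FAIL/SUCCESS when the
 * thread finishes, or RUNNING -> RUNNING_DETACHED if the requestor went
 * away first; FAIL/SUCCESS -> NONE once the result has been consumed.
 */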

typedef struct NBDConnectThread {
    /* Initialization constants */
    SocketAddress *saddr; /* address to connect to */
    /*
     * Bottom half to schedule on completion. Scheduled only if bh_ctx is not
     * NULL
     */
    QEMUBHFunc *bh_func;
    void *bh_opaque;

    /*
     * Result of the last attempt. Valid in FAIL and SUCCESS states.
     * If you want to steal the error, don't forget to set the pointer
     * to NULL.
     */
    QIOChannelSocket *sioc;
    Error *err;

    /* state and bh_ctx are protected by mutex */
    QemuMutex mutex;
    NBDConnectThreadState state; /* current state of the thread */
    AioContext *bh_ctx; /* where to schedule bh (NULL means don't schedule) */
} NBDConnectThread;

typedef struct BDRVNBDState {
    QIOChannelSocket *sioc; /* The master data channel */
    QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */
    NBDExportInfo info;

    CoMutex send_mutex;
    CoQueue free_sema;
    Coroutine *connection_co;
    Coroutine *teardown_co;
    QemuCoSleepState *connection_co_sleep_ns_state;
    bool drained;
    bool wait_drained_end;
    int in_flight;
    NBDClientState state;
    int connect_status;
    Error *connect_err;
    bool wait_in_flight;

    QEMUTimer *reconnect_delay_timer;

    NBDClientRequest requests[MAX_NBD_REQUESTS];
    NBDReply reply;
    BlockDriverState *bs;

    /* Connection parameters */
    uint32_t reconnect_delay;
    SocketAddress *saddr;
    char *export, *tlscredsid;
    QCryptoTLSCreds *tlscreds;
    const char *hostname;
    char *x_dirty_bitmap;
    bool alloc_depth;

    bool wait_connect;
    NBDConnectThread *connect_thread;
} BDRVNBDState;

static int nbd_establish_connection(BlockDriverState *bs, SocketAddress *saddr,
                                    Error **errp);
static int nbd_co_establish_connection(BlockDriverState *bs, Error **errp);
static void nbd_co_establish_connection_cancel(BlockDriverState *bs,
                                               bool detach);
static int nbd_client_handshake(BlockDriverState *bs, Error **errp);
static void nbd_yank(void *opaque);

static void nbd_clear_bdrvstate(BDRVNBDState *s)
{
    object_unref(OBJECT(s->tlscreds));
    qapi_free_SocketAddress(s->saddr);
    s->saddr = NULL;
    g_free(s->export);
    s->export = NULL;
    g_free(s->tlscredsid);
    s->tlscredsid = NULL;
    g_free(s->x_dirty_bitmap);
    s->x_dirty_bitmap = NULL;
}

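/*
 * Move the client into an error state.  -EIO is treated as a recoverable
 * channel error and switches a connected client into one of the
 * CONNECTING states (WAIT if a reconnect delay was configured); any other
 * error shuts the channel down and moves the client to QUIT.
 */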
static void nbd_channel_error(BDRVNBDState *s, int ret)
{
    if (ret == -EIO) {
        if (qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTED) {
            s->state = s->reconnect_delay ? NBD_CLIENT_CONNECTING_WAIT :
                                            NBD_CLIENT_CONNECTING_NOWAIT;
        }
    } else {
        if (qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTED) {
            qio_channel_shutdown(s->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
        }
        s->state = NBD_CLIENT_QUIT;
    }
}

static void nbd_recv_coroutines_wake_all(BDRVNBDState *s)
{
    int i;

    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        NBDClientRequest *req = &s->requests[i];

        if (req->coroutine && req->receiving) {
            aio_co_wake(req->coroutine);
        }
    }
}

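/*
 * The reconnect delay timer bounds how long requests may stay queued in
 * the CONNECTING_WAIT state: when it fires, the state degrades to
 * CONNECTING_NOWAIT and all requests queued on free_sema are resumed
 * (they will then fail with -EIO in nbd_co_send_request()).
 */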
static void reconnect_delay_timer_del(BDRVNBDState *s)
{
    if (s->reconnect_delay_timer) {
        timer_free(s->reconnect_delay_timer);
        s->reconnect_delay_timer = NULL;
    }
}

static void reconnect_delay_timer_cb(void *opaque)
{
    BDRVNBDState *s = opaque;

    if (qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTING_WAIT) {
        s->state = NBD_CLIENT_CONNECTING_NOWAIT;
        while (qemu_co_enter_next(&s->free_sema, NULL)) {
            /* Resume all queued requests */
        }
    }

    reconnect_delay_timer_del(s);
}

static void reconnect_delay_timer_init(BDRVNBDState *s, uint64_t expire_time_ns)
{
    if (qatomic_load_acquire(&s->state) != NBD_CLIENT_CONNECTING_WAIT) {
        return;
    }

    assert(!s->reconnect_delay_timer);
    s->reconnect_delay_timer = aio_timer_new(bdrv_get_aio_context(s->bs),
                                             QEMU_CLOCK_REALTIME,
                                             SCALE_NS,
                                             reconnect_delay_timer_cb, s);
    timer_mod(s->reconnect_delay_timer, expire_time_ns);
}

static void nbd_client_detach_aio_context(BlockDriverState *bs)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    /* Timer is deleted in nbd_client_co_drain_begin() */
    assert(!s->reconnect_delay_timer);
    /*
     * If reconnect is in progress we may have no ->ioc.  It will be
     * re-instantiated in the proper aio context once the connection is
     * reestablished.
     */
    if (s->ioc) {
        qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc));
    }
}

static void nbd_client_attach_aio_context_bh(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    if (s->connection_co) {
        /*
         * The node is still drained, so we know the coroutine has yielded in
         * nbd_read_eof(), the only place where bs->in_flight can reach 0, or
         * it is entered for the first time. Both places are safe for entering
         * the coroutine.
         */
        qemu_aio_coroutine_enter(bs->aio_context, s->connection_co);
    }
    bdrv_dec_in_flight(bs);
}

static void nbd_client_attach_aio_context(BlockDriverState *bs,
                                          AioContext *new_context)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    /*
     * s->connection_co is either yielded from nbd_receive_reply or from
     * nbd_co_reconnect_loop()
     */
    if (qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTED) {
        qio_channel_attach_aio_context(QIO_CHANNEL(s->ioc), new_context);
    }

    bdrv_inc_in_flight(bs);

    /*
     * Need to wait here for the BH to run because the BH must run while the
     * node is still drained.
     */
    aio_wait_bh_oneshot(new_context, nbd_client_attach_aio_context_bh, bs);
}

static void coroutine_fn nbd_client_co_drain_begin(BlockDriverState *bs)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    s->drained = true;
    if (s->connection_co_sleep_ns_state) {
        qemu_co_sleep_wake(s->connection_co_sleep_ns_state);
    }

    nbd_co_establish_connection_cancel(bs, false);

    reconnect_delay_timer_del(s);

    if (qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTING_WAIT) {
        s->state = NBD_CLIENT_CONNECTING_NOWAIT;
        qemu_co_queue_restart_all(&s->free_sema);
    }
}

static void coroutine_fn nbd_client_co_drain_end(BlockDriverState *bs)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    s->drained = false;
    if (s->wait_drained_end) {
        s->wait_drained_end = false;
        aio_co_wake(s->connection_co);
    }
}

static void nbd_teardown_connection(BlockDriverState *bs)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    if (s->ioc) {
        /* finish any pending coroutines */
        qio_channel_shutdown(s->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
    } else if (s->sioc) {
        /* abort negotiation */
        qio_channel_shutdown(QIO_CHANNEL(s->sioc), QIO_CHANNEL_SHUTDOWN_BOTH,
                             NULL);
    }

    s->state = NBD_CLIENT_QUIT;
    if (s->connection_co) {
        if (s->connection_co_sleep_ns_state) {
            qemu_co_sleep_wake(s->connection_co_sleep_ns_state);
        }
        nbd_co_establish_connection_cancel(bs, true);
    }
    if (qemu_in_coroutine()) {
        s->teardown_co = qemu_coroutine_self();
        /* connection_co resumes us when it terminates */
        qemu_coroutine_yield();
        s->teardown_co = NULL;
    } else {
        BDRV_POLL_WHILE(bs, s->connection_co);
    }
    assert(!s->connection_co);
}

static bool nbd_client_connecting(BDRVNBDState *s)
{
    NBDClientState state = qatomic_load_acquire(&s->state);
    return state == NBD_CLIENT_CONNECTING_WAIT ||
        state == NBD_CLIENT_CONNECTING_NOWAIT;
}

static bool nbd_client_connecting_wait(BDRVNBDState *s)
{
    return qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTING_WAIT;
}

static void connect_bh(void *opaque)
{
    BDRVNBDState *state = opaque;

    assert(state->wait_connect);
    state->wait_connect = false;
    aio_co_wake(state->connection_co);
}

static void nbd_init_connect_thread(BDRVNBDState *s)
{
    s->connect_thread = g_new(NBDConnectThread, 1);

    *s->connect_thread = (NBDConnectThread) {
        .saddr = QAPI_CLONE(SocketAddress, s->saddr),
        .state = CONNECT_THREAD_NONE,
        .bh_func = connect_bh,
        .bh_opaque = s,
    };

    qemu_mutex_init(&s->connect_thread->mutex);
}

static void nbd_free_connect_thread(NBDConnectThread *thr)
{
    if (thr->sioc) {
        qio_channel_close(QIO_CHANNEL(thr->sioc), NULL);
    }
    error_free(thr->err);
    qapi_free_SocketAddress(thr->saddr);
    g_free(thr);
}

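/*
 * Runs in a detached QemuThread.  Performs the blocking connect and
 * publishes the result under thr->mutex; if the requestor has meanwhile
 * detached (CONNECT_THREAD_RUNNING_DETACHED), the thread owns the state
 * and frees it itself.
 */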
static void *connect_thread_func(void *opaque)
{
    NBDConnectThread *thr = opaque;
    int ret;
    bool do_free = false;

    thr->sioc = qio_channel_socket_new();

    error_free(thr->err);
    thr->err = NULL;
    ret = qio_channel_socket_connect_sync(thr->sioc, thr->saddr, &thr->err);
    if (ret < 0) {
        object_unref(OBJECT(thr->sioc));
        thr->sioc = NULL;
    }

    qemu_mutex_lock(&thr->mutex);

    switch (thr->state) {
    case CONNECT_THREAD_RUNNING:
        thr->state = ret < 0 ? CONNECT_THREAD_FAIL : CONNECT_THREAD_SUCCESS;
        if (thr->bh_ctx) {
            aio_bh_schedule_oneshot(thr->bh_ctx, thr->bh_func, thr->bh_opaque);

            /* play safe, don't reuse bh_ctx on further connection attempts */
            thr->bh_ctx = NULL;
        }
        break;
    case CONNECT_THREAD_RUNNING_DETACHED:
        do_free = true;
        break;
    default:
        abort();
    }

    qemu_mutex_unlock(&thr->mutex);

    if (do_free) {
        nbd_free_connect_thread(thr);
    }

    return NULL;
}

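/*
 * Connect in a background thread so that the coroutine is not blocked.
 * Spawn a new thread unless a previous attempt is still running or has
 * already succeeded in the background, then yield until connect_bh()
 * wakes us or nbd_co_establish_connection_cancel() aborts the wait.
 * Returns 0 with s->sioc set on success, -1 otherwise.
 */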
static int coroutine_fn
nbd_co_establish_connection(BlockDriverState *bs, Error **errp)
{
    int ret;
    QemuThread thread;
    BDRVNBDState *s = bs->opaque;
    NBDConnectThread *thr = s->connect_thread;

    if (!thr) {
        /* detached */
        return -1;
    }

    qemu_mutex_lock(&thr->mutex);

    switch (thr->state) {
    case CONNECT_THREAD_FAIL:
    case CONNECT_THREAD_NONE:
        error_free(thr->err);
        thr->err = NULL;
        thr->state = CONNECT_THREAD_RUNNING;
        qemu_thread_create(&thread, "nbd-connect",
                           connect_thread_func, thr, QEMU_THREAD_DETACHED);
        break;
    case CONNECT_THREAD_SUCCESS:
        /* Previous attempt finally succeeded in background */
        thr->state = CONNECT_THREAD_NONE;
        s->sioc = thr->sioc;
        thr->sioc = NULL;
        yank_register_function(BLOCKDEV_YANK_INSTANCE(bs->node_name),
                               nbd_yank, bs);
        qemu_mutex_unlock(&thr->mutex);
        return 0;
    case CONNECT_THREAD_RUNNING:
        /* Already running, will wait */
        break;
    default:
        abort();
    }

    thr->bh_ctx = qemu_get_current_aio_context();

    qemu_mutex_unlock(&thr->mutex);

    /*
     * We are going to wait for the connect thread to finish, but
     * nbd_client_co_drain_begin() can interrupt us.
     *
     * Note that the wait_connect variable is not visible to the connect
     * thread. It doesn't need mutex protection, as it is used only
     * inside the home AioContext of bs.
     */
    s->wait_connect = true;
    qemu_coroutine_yield();

    if (!s->connect_thread) {
        /* detached */
        return -1;
    }
    assert(thr == s->connect_thread);

    qemu_mutex_lock(&thr->mutex);

    switch (thr->state) {
    case CONNECT_THREAD_SUCCESS:
    case CONNECT_THREAD_FAIL:
        thr->state = CONNECT_THREAD_NONE;
        error_propagate(errp, thr->err);
        thr->err = NULL;
        s->sioc = thr->sioc;
        thr->sioc = NULL;
        if (s->sioc) {
            yank_register_function(BLOCKDEV_YANK_INSTANCE(bs->node_name),
                                   nbd_yank, bs);
        }
        ret = (s->sioc ? 0 : -1);
        break;
    case CONNECT_THREAD_RUNNING:
    case CONNECT_THREAD_RUNNING_DETACHED:
        /*
         * A drained section wants to start. Report the attempt as
         * failed, but the connect thread keeps executing in the
         * background, and its result may be used for the next
         * connection attempt.
         */
        ret = -1;
        error_setg(errp, "Connection attempt cancelled by other operation");
        break;

    case CONNECT_THREAD_NONE:
        /*
         * Impossible: we have seen this thread running, so it must
         * still be running or at least have produced a result.
         */
        abort();

    default:
        abort();
    }

    qemu_mutex_unlock(&thr->mutex);

    return ret;
}

/*
 * nbd_co_establish_connection_cancel
 * Cancel nbd_co_establish_connection() asynchronously: it will finish
 * soon, to allow the drained section to begin.
 *
 * If detach is true, also clean up the state (or, if the thread is
 * running, move it to the CONNECT_THREAD_RUNNING_DETACHED state).
 * s->connect_thread becomes NULL if detach is true.
 */
static void nbd_co_establish_connection_cancel(BlockDriverState *bs,
                                               bool detach)
{
    BDRVNBDState *s = bs->opaque;
    NBDConnectThread *thr = s->connect_thread;
    bool wake = false;
    bool do_free = false;

    qemu_mutex_lock(&thr->mutex);

    if (thr->state == CONNECT_THREAD_RUNNING) {
        /* We can cancel only in running state, when bh is not yet scheduled */
        thr->bh_ctx = NULL;
        if (s->wait_connect) {
            s->wait_connect = false;
            wake = true;
        }
        if (detach) {
            thr->state = CONNECT_THREAD_RUNNING_DETACHED;
            s->connect_thread = NULL;
        }
    } else if (detach) {
        do_free = true;
    }

    qemu_mutex_unlock(&thr->mutex);

    if (do_free) {
        nbd_free_connect_thread(thr);
        s->connect_thread = NULL;
    }

    if (wake) {
        aio_co_wake(s->connection_co);
    }
}

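/*
 * One reconnect attempt: wait for all in-flight requests to finish, tear
 * down the old channel if any, then establish a new connection and redo
 * the NBD handshake.  On success the state becomes CONNECTED and queued
 * requests are restarted.
 */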
static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
{
    int ret;
    Error *local_err = NULL;

    if (!nbd_client_connecting(s)) {
        return;
    }

    /* Wait for completion of all in-flight requests */

    qemu_co_mutex_lock(&s->send_mutex);

    while (s->in_flight > 0) {
        qemu_co_mutex_unlock(&s->send_mutex);
        nbd_recv_coroutines_wake_all(s);
        s->wait_in_flight = true;
        qemu_coroutine_yield();
        s->wait_in_flight = false;
        qemu_co_mutex_lock(&s->send_mutex);
    }

    qemu_co_mutex_unlock(&s->send_mutex);

    if (!nbd_client_connecting(s)) {
        return;
    }

    /*
     * Now we are sure that nobody is accessing the channel, and no one will
     * try until we set the state to CONNECTED.
     */

    /* Finalize previous connection if any */
    if (s->ioc) {
        qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc));
        yank_unregister_function(BLOCKDEV_YANK_INSTANCE(s->bs->node_name),
                                 nbd_yank, s->bs);
        object_unref(OBJECT(s->sioc));
        s->sioc = NULL;
        object_unref(OBJECT(s->ioc));
        s->ioc = NULL;
    }

    if (nbd_co_establish_connection(s->bs, &local_err) < 0) {
        ret = -ECONNREFUSED;
        goto out;
    }

    bdrv_dec_in_flight(s->bs);

    ret = nbd_client_handshake(s->bs, &local_err);

    if (s->drained) {
        s->wait_drained_end = true;
        while (s->drained) {
            /*
             * We may be entered once from nbd_client_attach_aio_context_bh
             * and then from nbd_client_co_drain_end, hence the loop.
             */
            qemu_coroutine_yield();
        }
    }
    bdrv_inc_in_flight(s->bs);

out:
    s->connect_status = ret;
    error_free(s->connect_err);
    s->connect_err = NULL;
    error_propagate(&s->connect_err, local_err);

    if (ret >= 0) {
        /* successfully connected */
        s->state = NBD_CLIENT_CONNECTED;
        qemu_co_queue_restart_all(&s->free_sema);
    }
}

static coroutine_fn void nbd_co_reconnect_loop(BDRVNBDState *s)
{
    uint64_t timeout = 1 * NANOSECONDS_PER_SECOND;
    uint64_t max_timeout = 16 * NANOSECONDS_PER_SECOND;

    if (qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTING_WAIT) {
        reconnect_delay_timer_init(s, qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
                                   s->reconnect_delay * NANOSECONDS_PER_SECOND);
    }

    nbd_reconnect_attempt(s);

    while (nbd_client_connecting(s)) {
        if (s->drained) {
            bdrv_dec_in_flight(s->bs);
            s->wait_drained_end = true;
            while (s->drained) {
                /*
                 * We may be entered once from nbd_client_attach_aio_context_bh
                 * and then from nbd_client_co_drain_end, hence the loop.
                 */
                qemu_coroutine_yield();
            }
            bdrv_inc_in_flight(s->bs);
        } else {
            qemu_co_sleep_ns_wakeable(QEMU_CLOCK_REALTIME, timeout,
                                      &s->connection_co_sleep_ns_state);
            if (s->drained) {
                continue;
            }
            if (timeout < max_timeout) {
                timeout *= 2;
            }
        }

        nbd_reconnect_attempt(s);
    }

    reconnect_delay_timer_del(s);
}

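/*
 * Per-connection receive coroutine: runs the reconnect loop while the
 * client is in a CONNECTING state, otherwise reads one reply header at a
 * time, wakes the request coroutine it belongs to, and yields until that
 * request has consumed the reply.
 */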
static coroutine_fn void nbd_connection_entry(void *opaque)
{
    BDRVNBDState *s = opaque;
    uint64_t i;
    int ret = 0;
    Error *local_err = NULL;

    while (qatomic_load_acquire(&s->state) != NBD_CLIENT_QUIT) {
        /*
         * The NBD client can only really be considered idle when it has
         * yielded from qio_channel_readv_all_eof(), waiting for data. This is
         * the point where the additional scheduled coroutine entry happens
         * after nbd_client_attach_aio_context().
         *
         * Therefore we keep an additional in_flight reference all the time and
         * only drop it temporarily here.
         */

        if (nbd_client_connecting(s)) {
            nbd_co_reconnect_loop(s);
        }

        if (qatomic_load_acquire(&s->state) != NBD_CLIENT_CONNECTED) {
            continue;
        }

        assert(s->reply.handle == 0);
        ret = nbd_receive_reply(s->bs, s->ioc, &s->reply, &local_err);

        if (local_err) {
            trace_nbd_read_reply_entry_fail(ret, error_get_pretty(local_err));
            error_free(local_err);
            local_err = NULL;
        }
        if (ret <= 0) {
            nbd_channel_error(s, ret ? ret : -EIO);
            continue;
        }

        /*
         * There's no need for a mutex on the receive side, because the
         * handler acts as a synchronization point and ensures that only
         * one coroutine is called until the reply finishes.
         */
        i = HANDLE_TO_INDEX(s, s->reply.handle);
        if (i >= MAX_NBD_REQUESTS ||
            !s->requests[i].coroutine ||
            !s->requests[i].receiving ||
            (nbd_reply_is_structured(&s->reply) && !s->info.structured_reply))
        {
            nbd_channel_error(s, -EINVAL);
            continue;
        }

        /*
         * We're woken up again by the request itself.  Note that there
         * is no race between yielding and reentering connection_co.  This
         * is because:
         *
         * - if the request runs on the same AioContext, it is only
         *   entered after we yield
         *
         * - if the request runs on a different AioContext, reentering
         *   connection_co happens through a bottom half, which can only
         *   run after we yield.
         */
        aio_co_wake(s->requests[i].coroutine);
        qemu_coroutine_yield();
    }

    qemu_co_queue_restart_all(&s->free_sema);
    nbd_recv_coroutines_wake_all(s);
    bdrv_dec_in_flight(s->bs);

    s->connection_co = NULL;
    if (s->ioc) {
        qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc));
        yank_unregister_function(BLOCKDEV_YANK_INSTANCE(s->bs->node_name),
                                 nbd_yank, s->bs);
        object_unref(OBJECT(s->sioc));
        s->sioc = NULL;
        object_unref(OBJECT(s->ioc));
        s->ioc = NULL;
    }

    if (s->teardown_co) {
        aio_co_wake(s->teardown_co);
    }
    aio_wait_kick();
}

static int nbd_co_send_request(BlockDriverState *bs,
                               NBDRequest *request,
                               QEMUIOVector *qiov)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    int rc, i = -1;

    qemu_co_mutex_lock(&s->send_mutex);
    while (s->in_flight == MAX_NBD_REQUESTS || nbd_client_connecting_wait(s)) {
        qemu_co_queue_wait(&s->free_sema, &s->send_mutex);
    }

    if (qatomic_load_acquire(&s->state) != NBD_CLIENT_CONNECTED) {
        rc = -EIO;
        goto err;
    }

    s->in_flight++;

    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        if (s->requests[i].coroutine == NULL) {
            break;
        }
    }

    g_assert(qemu_in_coroutine());
    assert(i < MAX_NBD_REQUESTS);

    s->requests[i].coroutine = qemu_coroutine_self();
    s->requests[i].offset = request->from;
    s->requests[i].receiving = false;

    request->handle = INDEX_TO_HANDLE(s, i);

    assert(s->ioc);

    if (qiov) {
        qio_channel_set_cork(s->ioc, true);
        rc = nbd_send_request(s->ioc, request);
        if (qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTED &&
            rc >= 0) {
            if (qio_channel_writev_all(s->ioc, qiov->iov, qiov->niov,
                                       NULL) < 0) {
                rc = -EIO;
            }
        } else if (rc >= 0) {
            rc = -EIO;
        }
        qio_channel_set_cork(s->ioc, false);
    } else {
        rc = nbd_send_request(s->ioc, request);
    }

err:
    if (rc < 0) {
        nbd_channel_error(s, rc);
        if (i != -1) {
            s->requests[i].coroutine = NULL;
            s->in_flight--;
        }
        if (s->in_flight == 0 && s->wait_in_flight) {
            aio_co_wake(s->connection_co);
        } else {
            qemu_co_queue_next(&s->free_sema);
        }
    }
    qemu_co_mutex_unlock(&s->send_mutex);
    return rc;
}

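/*
 * Helpers for parsing structured reply payloads: each reads one
 * big-endian field at *payload and advances the cursor past it.
 */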
static inline uint16_t payload_advance16(uint8_t **payload)
{
    *payload += 2;
    return lduw_be_p(*payload - 2);
}

static inline uint32_t payload_advance32(uint8_t **payload)
{
    *payload += 4;
    return ldl_be_p(*payload - 4);
}

static inline uint64_t payload_advance64(uint8_t **payload)
{
    *payload += 8;
    return ldq_be_p(*payload - 8);
}

static int nbd_parse_offset_hole_payload(BDRVNBDState *s,
                                         NBDStructuredReplyChunk *chunk,
                                         uint8_t *payload, uint64_t orig_offset,
                                         QEMUIOVector *qiov, Error **errp)
{
    uint64_t offset;
    uint32_t hole_size;

    if (chunk->length != sizeof(offset) + sizeof(hole_size)) {
        error_setg(errp, "Protocol error: invalid payload for "
                         "NBD_REPLY_TYPE_OFFSET_HOLE");
        return -EINVAL;
    }

    offset = payload_advance64(&payload);
    hole_size = payload_advance32(&payload);

    if (!hole_size || offset < orig_offset || hole_size > qiov->size ||
        offset > orig_offset + qiov->size - hole_size) {
        error_setg(errp, "Protocol error: server sent chunk exceeding requested"
                         " region");
        return -EINVAL;
    }
    if (s->info.min_block &&
        !QEMU_IS_ALIGNED(hole_size, s->info.min_block)) {
        trace_nbd_structured_read_compliance("hole");
    }

    qemu_iovec_memset(qiov, offset - orig_offset, 0, hole_size);

    return 0;
}

/*
 * nbd_parse_blockstatus_payload
 * Based on our request, we expect only one extent in reply, for the
 * base:allocation context.
 */
static int nbd_parse_blockstatus_payload(BDRVNBDState *s,
                                         NBDStructuredReplyChunk *chunk,
                                         uint8_t *payload, uint64_t orig_length,
                                         NBDExtent *extent, Error **errp)
{
    uint32_t context_id;

    /* The server succeeded, so it must have sent [at least] one extent */
    if (chunk->length < sizeof(context_id) + sizeof(*extent)) {
        error_setg(errp, "Protocol error: invalid payload for "
                         "NBD_REPLY_TYPE_BLOCK_STATUS");
        return -EINVAL;
    }

    context_id = payload_advance32(&payload);
    if (s->info.context_id != context_id) {
        error_setg(errp, "Protocol error: unexpected context id %d for "
                         "NBD_REPLY_TYPE_BLOCK_STATUS, when negotiated context "
                         "id is %d", context_id,
                         s->info.context_id);
        return -EINVAL;
    }

    extent->length = payload_advance32(&payload);
    extent->flags = payload_advance32(&payload);

    if (extent->length == 0) {
        error_setg(errp, "Protocol error: server sent status chunk with "
                   "zero length");
        return -EINVAL;
    }

    /*
     * A server sending unaligned block status is in violation of the
     * protocol, but as qemu-nbd 3.1 is such a server (at least for
     * POSIX files that are not a multiple of 512 bytes, since qemu
     * rounds files up to 512-byte multiples but lseek(SEEK_HOLE)
     * still sees an implicit hole beyond the real EOF), it's nicer to
     * work around the misbehaving server. If the request included
     * more than the final unaligned block, truncate it back to an
     * aligned result; if the request was only the final block, round
     * up to the full block and change the status to fully-allocated
     * (always a safe status, even if it loses information).
     */
    if (s->info.min_block && !QEMU_IS_ALIGNED(extent->length,
                                              s->info.min_block)) {
        trace_nbd_parse_blockstatus_compliance("extent length is unaligned");
        if (extent->length > s->info.min_block) {
            extent->length = QEMU_ALIGN_DOWN(extent->length,
                                             s->info.min_block);
        } else {
            extent->length = s->info.min_block;
            extent->flags = 0;
        }
    }

    /*
     * We used NBD_CMD_FLAG_REQ_ONE, so the server should not have
     * sent us any more than one extent, nor should it have included
     * status beyond our request in that extent. However, it's easy
     * enough to ignore the server's noncompliance without killing the
     * connection; just ignore trailing extents, and clamp things to
     * the length of our request.
     */
    if (chunk->length > sizeof(context_id) + sizeof(*extent)) {
        trace_nbd_parse_blockstatus_compliance("more than one extent");
    }
    if (extent->length > orig_length) {
        extent->length = orig_length;
        trace_nbd_parse_blockstatus_compliance("extent length too large");
    }

    /*
     * HACK: if we are using x-dirty-bitmaps to access
     * qemu:allocation-depth, treat all depths > 2 the same as 2,
     * since nbd_client_co_block_status is only expecting the low two
     * bits to be set.
     */
    if (s->alloc_depth && extent->flags > 2) {
        extent->flags = 2;
    }

    return 0;
}

/*
 * nbd_parse_error_payload
 * On success, @errp contains a message describing the NBD error reply.
 */
static int nbd_parse_error_payload(NBDStructuredReplyChunk *chunk,
                                   uint8_t *payload, int *request_ret,
                                   Error **errp)
{
    uint32_t error;
    uint16_t message_size;

    assert(chunk->type & (1 << 15));

    if (chunk->length < sizeof(error) + sizeof(message_size)) {
        error_setg(errp,
                   "Protocol error: invalid payload for structured error");
        return -EINVAL;
    }

    error = nbd_errno_to_system_errno(payload_advance32(&payload));
    if (error == 0) {
        error_setg(errp, "Protocol error: server sent structured error chunk "
                         "with error = 0");
        return -EINVAL;
    }

    *request_ret = -error;
    message_size = payload_advance16(&payload);

    if (message_size > chunk->length - sizeof(error) - sizeof(message_size)) {
        error_setg(errp, "Protocol error: server sent structured error chunk "
                         "with incorrect message size");
        return -EINVAL;
    }

    /* TODO: Add a trace point to mention the server complaint */

    /* TODO handle ERROR_OFFSET */

    return 0;
}

static int nbd_co_receive_offset_data_payload(BDRVNBDState *s,
                                              uint64_t orig_offset,
                                              QEMUIOVector *qiov, Error **errp)
{
    QEMUIOVector sub_qiov;
    uint64_t offset;
    size_t data_size;
    int ret;
    NBDStructuredReplyChunk *chunk = &s->reply.structured;

    assert(nbd_reply_is_structured(&s->reply));

    /* The NBD spec requires at least one byte of payload */
    if (chunk->length <= sizeof(offset)) {
        error_setg(errp, "Protocol error: invalid payload for "
                         "NBD_REPLY_TYPE_OFFSET_DATA");
        return -EINVAL;
    }

    if (nbd_read64(s->ioc, &offset, "OFFSET_DATA offset", errp) < 0) {
        return -EIO;
    }

    data_size = chunk->length - sizeof(offset);
    assert(data_size);
    if (offset < orig_offset || data_size > qiov->size ||
        offset > orig_offset + qiov->size - data_size) {
        error_setg(errp, "Protocol error: server sent chunk exceeding requested"
                         " region");
        return -EINVAL;
    }
    if (s->info.min_block && !QEMU_IS_ALIGNED(data_size, s->info.min_block)) {
        trace_nbd_structured_read_compliance("data");
    }

    qemu_iovec_init(&sub_qiov, qiov->niov);
    qemu_iovec_concat(&sub_qiov, qiov, offset - orig_offset, data_size);
    ret = qio_channel_readv_all(s->ioc, sub_qiov.iov, sub_qiov.niov, errp);
    qemu_iovec_destroy(&sub_qiov);

    return ret < 0 ? -EIO : 0;
}

#define NBD_MAX_MALLOC_PAYLOAD 1000
static coroutine_fn int nbd_co_receive_structured_payload(
        BDRVNBDState *s, void **payload, Error **errp)
{
    int ret;
    uint32_t len;

    assert(nbd_reply_is_structured(&s->reply));

    len = s->reply.structured.length;

    if (len == 0) {
        return 0;
    }

    if (payload == NULL) {
        error_setg(errp, "Unexpected structured payload");
        return -EINVAL;
    }

    if (len > NBD_MAX_MALLOC_PAYLOAD) {
        error_setg(errp, "Payload too large");
        return -EINVAL;
    }

    *payload = g_new(char, len);
    ret = nbd_read(s->ioc, *payload, len, "structured payload", errp);
    if (ret < 0) {
        g_free(*payload);
        *payload = NULL;
        return ret;
    }

    return 0;
}

/*
 * nbd_co_do_receive_one_chunk
 * for simple reply:
 *   set request_ret to received reply error
 *   if qiov is not NULL: read payload to @qiov
 * for structured reply chunk:
 *   if error chunk: read payload, set @request_ret, do not set @payload
 *   else if offset_data chunk: read payload data to @qiov, do not set @payload
 *   else: read payload to @payload
 *
 * If the function fails, @errp contains the corresponding error message,
 * and the connection with the server is suspect.  If it returns 0, then
 * the transaction succeeded (although @request_ret may be a negative
 * errno corresponding to the server's error reply), and errp is
 * unchanged.
 */
static coroutine_fn int nbd_co_do_receive_one_chunk(
        BDRVNBDState *s, uint64_t handle, bool only_structured,
        int *request_ret, QEMUIOVector *qiov, void **payload, Error **errp)
{
    int ret;
    int i = HANDLE_TO_INDEX(s, handle);
    void *local_payload = NULL;
    NBDStructuredReplyChunk *chunk;

    if (payload) {
        *payload = NULL;
    }
    *request_ret = 0;

    /* Wait until we're woken up by nbd_connection_entry.  */
    s->requests[i].receiving = true;
    qemu_coroutine_yield();
    s->requests[i].receiving = false;
    if (qatomic_load_acquire(&s->state) != NBD_CLIENT_CONNECTED) {
        error_setg(errp, "Connection closed");
        return -EIO;
    }
    assert(s->ioc);

    assert(s->reply.handle == handle);

    if (nbd_reply_is_simple(&s->reply)) {
        if (only_structured) {
            error_setg(errp, "Protocol error: simple reply when structured "
                             "reply chunk was expected");
            return -EINVAL;
        }

        *request_ret = -nbd_errno_to_system_errno(s->reply.simple.error);
        if (*request_ret < 0 || !qiov) {
            return 0;
        }

        return qio_channel_readv_all(s->ioc, qiov->iov, qiov->niov,
                                     errp) < 0 ? -EIO : 0;
    }

    /* handle structured reply chunk */
    assert(s->info.structured_reply);
    chunk = &s->reply.structured;

    if (chunk->type == NBD_REPLY_TYPE_NONE) {
        if (!(chunk->flags & NBD_REPLY_FLAG_DONE)) {
            error_setg(errp, "Protocol error: NBD_REPLY_TYPE_NONE chunk without"
                       " NBD_REPLY_FLAG_DONE flag set");
            return -EINVAL;
        }
        if (chunk->length) {
            error_setg(errp, "Protocol error: NBD_REPLY_TYPE_NONE chunk with"
                       " nonzero length");
            return -EINVAL;
        }
        return 0;
    }

    if (chunk->type == NBD_REPLY_TYPE_OFFSET_DATA) {
        if (!qiov) {
            error_setg(errp, "Unexpected NBD_REPLY_TYPE_OFFSET_DATA chunk");
            return -EINVAL;
        }

        return nbd_co_receive_offset_data_payload(s, s->requests[i].offset,
                                                  qiov, errp);
    }

    if (nbd_reply_type_is_error(chunk->type)) {
        payload = &local_payload;
    }

    ret = nbd_co_receive_structured_payload(s, payload, errp);
    if (ret < 0) {
        return ret;
    }

    if (nbd_reply_type_is_error(chunk->type)) {
        ret = nbd_parse_error_payload(chunk, local_payload, request_ret, errp);
        g_free(local_payload);
        return ret;
    }

    return 0;
}

/*
 * nbd_co_receive_one_chunk
 * Read the reply, wake up connection_co, and move the client into an
 * error state (via nbd_channel_error()) if needed.
 * Return value is a fatal error code or a normal nbd reply error code.
 */
static coroutine_fn int nbd_co_receive_one_chunk(
        BDRVNBDState *s, uint64_t handle, bool only_structured,
        int *request_ret, QEMUIOVector *qiov, NBDReply *reply, void **payload,
        Error **errp)
{
    int ret = nbd_co_do_receive_one_chunk(s, handle, only_structured,
                                          request_ret, qiov, payload, errp);

    if (ret < 0) {
        memset(reply, 0, sizeof(*reply));
        nbd_channel_error(s, ret);
    } else {
        /* For assert at loop start in nbd_connection_entry */
        *reply = s->reply;
    }
    s->reply.handle = 0;

    if (s->connection_co && !s->wait_in_flight) {
        /*
         * We must check s->wait_in_flight, because we may have been
         * entered by nbd_recv_coroutines_wake_all(); in that case we
         * must not wake connection_co here, as it will be woken by the
         * last request.
         */
        aio_co_wake(s->connection_co);
    }

    return ret;
}

typedef struct NBDReplyChunkIter {
    int ret;
    int request_ret;
    Error *err;
    bool done, only_structured;
} NBDReplyChunkIter;

static void nbd_iter_channel_error(NBDReplyChunkIter *iter,
                                   int ret, Error **local_err)
{
    assert(local_err && *local_err);
    assert(ret < 0);

    if (!iter->ret) {
        iter->ret = ret;
        error_propagate(&iter->err, *local_err);
    } else {
        error_free(*local_err);
    }

    *local_err = NULL;
}

static void nbd_iter_request_error(NBDReplyChunkIter *iter, int ret)
{
    assert(ret < 0);

    if (!iter->request_ret) {
        iter->request_ret = ret;
    }
}

/*
 * NBD_FOREACH_REPLY_CHUNK
 * The pointer stored in @payload requires g_free() to free it.
 */
#define NBD_FOREACH_REPLY_CHUNK(s, iter, handle, structured, \
                                qiov, reply, payload) \
    for (iter = (NBDReplyChunkIter) { .only_structured = structured }; \
         nbd_reply_chunk_iter_receive(s, &iter, handle, qiov, reply, payload);)
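
/*
 * Typical usage, as in nbd_co_receive_return_code() below; the loop body
 * runs once per structured reply chunk and never for a simple reply:
 *
 *     NBD_FOREACH_REPLY_CHUNK(s, iter, handle, false, NULL, NULL, NULL) {
 *         (process one chunk per iteration)
 *     }
 */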

/*
 * nbd_reply_chunk_iter_receive
 * The pointer stored in @payload requires g_free() to free it.
 */
static bool nbd_reply_chunk_iter_receive(BDRVNBDState *s,
                                         NBDReplyChunkIter *iter,
                                         uint64_t handle,
                                         QEMUIOVector *qiov, NBDReply *reply,
                                         void **payload)
{
    int ret, request_ret;
    NBDReply local_reply;
    NBDStructuredReplyChunk *chunk;
    Error *local_err = NULL;

    if (qatomic_load_acquire(&s->state) != NBD_CLIENT_CONNECTED) {
        error_setg(&local_err, "Connection closed");
        nbd_iter_channel_error(iter, -EIO, &local_err);
        goto break_loop;
    }

    if (iter->done) {
        /* Previous iteration was last. */
        goto break_loop;
    }

    if (reply == NULL) {
        reply = &local_reply;
    }

    ret = nbd_co_receive_one_chunk(s, handle, iter->only_structured,
                                   &request_ret, qiov, reply, payload,
                                   &local_err);
    if (ret < 0) {
        nbd_iter_channel_error(iter, ret, &local_err);
    } else if (request_ret < 0) {
        nbd_iter_request_error(iter, request_ret);
    }

    /* Do not execute the body of NBD_FOREACH_REPLY_CHUNK for simple reply. */
    if (nbd_reply_is_simple(reply) ||
        qatomic_load_acquire(&s->state) != NBD_CLIENT_CONNECTED) {
        goto break_loop;
    }

    chunk = &reply->structured;
    iter->only_structured = true;

    if (chunk->type == NBD_REPLY_TYPE_NONE) {
        /* NBD_REPLY_FLAG_DONE is already checked in nbd_co_receive_one_chunk */
        assert(chunk->flags & NBD_REPLY_FLAG_DONE);
        goto break_loop;
    }

    if (chunk->flags & NBD_REPLY_FLAG_DONE) {
        /* This iteration is last. */
        iter->done = true;
    }

    /* Execute the loop body */
    return true;

break_loop:
    s->requests[HANDLE_TO_INDEX(s, handle)].coroutine = NULL;

    qemu_co_mutex_lock(&s->send_mutex);
    s->in_flight--;
    if (s->in_flight == 0 && s->wait_in_flight) {
        aio_co_wake(s->connection_co);
    } else {
        qemu_co_queue_next(&s->free_sema);
    }
    qemu_co_mutex_unlock(&s->send_mutex);

    return false;
}

static int nbd_co_receive_return_code(BDRVNBDState *s, uint64_t handle,
                                      int *request_ret, Error **errp)
{
    NBDReplyChunkIter iter;

    NBD_FOREACH_REPLY_CHUNK(s, iter, handle, false, NULL, NULL, NULL) {
        /* nbd_reply_chunk_iter_receive does all the work */
    }

    error_propagate(errp, iter.err);
    *request_ret = iter.request_ret;
    return iter.ret;
}

static int nbd_co_receive_cmdread_reply(BDRVNBDState *s, uint64_t handle,
                                        uint64_t offset, QEMUIOVector *qiov,
                                        int *request_ret, Error **errp)
{
    NBDReplyChunkIter iter;
    NBDReply reply;
    void *payload = NULL;
    Error *local_err = NULL;

    NBD_FOREACH_REPLY_CHUNK(s, iter, handle, s->info.structured_reply,
                            qiov, &reply, &payload)
    {
        int ret;
        NBDStructuredReplyChunk *chunk = &reply.structured;

        assert(nbd_reply_is_structured(&reply));

        switch (chunk->type) {
        case NBD_REPLY_TYPE_OFFSET_DATA:
            /*
             * special cased in nbd_co_receive_one_chunk, data is already
             * in qiov
             */
            break;
        case NBD_REPLY_TYPE_OFFSET_HOLE:
            ret = nbd_parse_offset_hole_payload(s, &reply.structured, payload,
                                                offset, qiov, &local_err);
            if (ret < 0) {
                nbd_channel_error(s, ret);
                nbd_iter_channel_error(&iter, ret, &local_err);
            }
            break;
        default:
            if (!nbd_reply_type_is_error(chunk->type)) {
                /* not allowed reply type */
                nbd_channel_error(s, -EINVAL);
                error_setg(&local_err,
                           "Unexpected reply type: %d (%s) for CMD_READ",
                           chunk->type, nbd_reply_type_lookup(chunk->type));
                nbd_iter_channel_error(&iter, -EINVAL, &local_err);
            }
        }

        g_free(payload);
        payload = NULL;
    }

    error_propagate(errp, iter.err);
    *request_ret = iter.request_ret;
    return iter.ret;
}

static int nbd_co_receive_blockstatus_reply(BDRVNBDState *s,
                                            uint64_t handle, uint64_t length,
                                            NBDExtent *extent,
                                            int *request_ret, Error **errp)
{
    NBDReplyChunkIter iter;
    NBDReply reply;
    void *payload = NULL;
    Error *local_err = NULL;
    bool received = false;

    assert(!extent->length);
    NBD_FOREACH_REPLY_CHUNK(s, iter, handle, false, NULL, &reply, &payload) {
        int ret;
        NBDStructuredReplyChunk *chunk = &reply.structured;

        assert(nbd_reply_is_structured(&reply));

        switch (chunk->type) {
        case NBD_REPLY_TYPE_BLOCK_STATUS:
            if (received) {
                nbd_channel_error(s, -EINVAL);
                error_setg(&local_err, "Several BLOCK_STATUS chunks in reply");
                nbd_iter_channel_error(&iter, -EINVAL, &local_err);
            }
            received = true;

            ret = nbd_parse_blockstatus_payload(s, &reply.structured,
                                                payload, length, extent,
                                                &local_err);
            if (ret < 0) {
                nbd_channel_error(s, ret);
                nbd_iter_channel_error(&iter, ret, &local_err);
            }
            break;
        default:
            if (!nbd_reply_type_is_error(chunk->type)) {
                nbd_channel_error(s, -EINVAL);
                error_setg(&local_err,
                           "Unexpected reply type: %d (%s) "
                           "for CMD_BLOCK_STATUS",
                           chunk->type, nbd_reply_type_lookup(chunk->type));
                nbd_iter_channel_error(&iter, -EINVAL, &local_err);
            }
        }

        g_free(payload);
        payload = NULL;
    }

    if (!extent->length && !iter.request_ret) {
        error_setg(&local_err, "Server did not reply with any status extents");
        nbd_iter_channel_error(&iter, -EIO, &local_err);
    }

    error_propagate(errp, iter.err);
    *request_ret = iter.request_ret;
    return iter.ret;
}

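/*
 * Send one request and wait for its reply, retrying the whole transaction
 * for as long as the client is in CONNECTING_WAIT after a failure.
 * Returns the transport error, or the server's errno reply if the
 * transport succeeded.
 */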
1511static int nbd_co_request(BlockDriverState *bs, NBDRequest *request,
1512                          QEMUIOVector *write_qiov)
1513{
1514    int ret, request_ret;
1515    Error *local_err = NULL;
1516    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
1517
1518    assert(request->type != NBD_CMD_READ);
1519    if (write_qiov) {
1520        assert(request->type == NBD_CMD_WRITE);
1521        assert(request->len == iov_size(write_qiov->iov, write_qiov->niov));
1522    } else {
1523        assert(request->type != NBD_CMD_WRITE);
1524    }
1525
1526    do {
1527        ret = nbd_co_send_request(bs, request, write_qiov);
1528        if (ret < 0) {
1529            continue;
1530        }
1531
1532        ret = nbd_co_receive_return_code(s, request->handle,
1533                                         &request_ret, &local_err);
1534        if (local_err) {
1535            trace_nbd_co_request_fail(request->from, request->len,
1536                                      request->handle, request->flags,
1537                                      request->type,
1538                                      nbd_cmd_lookup(request->type),
1539                                      ret, error_get_pretty(local_err));
1540            error_free(local_err);
1541            local_err = NULL;
1542        }
1543    } while (ret < 0 && nbd_client_connecting_wait(s));
1544
1545    return ret ? ret : request_ret;
1546}
1547
1548static int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
1549                                uint64_t bytes, QEMUIOVector *qiov, int flags)
1550{
1551    int ret, request_ret;
1552    Error *local_err = NULL;
1553    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
1554    NBDRequest request = {
1555        .type = NBD_CMD_READ,
1556        .from = offset,
1557        .len = bytes,
1558    };
1559
1560    assert(bytes <= NBD_MAX_BUFFER_SIZE);
1561    assert(!flags);
1562
1563    if (!bytes) {
1564        return 0;
1565    }
1566    /*
1567     * Work around the fact that the block layer doesn't do
1568     * byte-accurate sizing yet - if the read exceeds the server's
1569     * advertised size because the block layer rounded size up, then
1570     * truncate the request to the server and tail-pad with zero.
1571     */
1572    if (offset >= s->info.size) {
1573        assert(bytes < BDRV_SECTOR_SIZE);
1574        qemu_iovec_memset(qiov, 0, 0, bytes);
1575        return 0;
1576    }
1577    if (offset + bytes > s->info.size) {
1578        uint64_t slop = offset + bytes - s->info.size;
1579
1580        assert(slop < BDRV_SECTOR_SIZE);
1581        qemu_iovec_memset(qiov, bytes - slop, 0, slop);
1582        request.len -= slop;
1583    }
1584
1585    do {
1586        ret = nbd_co_send_request(bs, &request, NULL);
1587        if (ret < 0) {
1588            continue;
1589        }
1590
1591        ret = nbd_co_receive_cmdread_reply(s, request.handle, offset, qiov,
1592                                           &request_ret, &local_err);
1593        if (local_err) {
1594            trace_nbd_co_request_fail(request.from, request.len, request.handle,
1595                                      request.flags, request.type,
1596                                      nbd_cmd_lookup(request.type),
1597                                      ret, error_get_pretty(local_err));
1598            error_free(local_err);
1599            local_err = NULL;
1600        }
1601    } while (ret < 0 && nbd_client_connecting_wait(s));
1602
1603    return ret ? ret : request_ret;
1604}
1605
1606static int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
1607                                 uint64_t bytes, QEMUIOVector *qiov, int flags)
1608{
1609    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
1610    NBDRequest request = {
1611        .type = NBD_CMD_WRITE,
1612        .from = offset,
1613        .len = bytes,
1614    };
1615
1616    assert(!(s->info.flags & NBD_FLAG_READ_ONLY));
1617    if (flags & BDRV_REQ_FUA) {
1618        assert(s->info.flags & NBD_FLAG_SEND_FUA);
1619        request.flags |= NBD_CMD_FLAG_FUA;
1620    }
1621
1622    assert(bytes <= NBD_MAX_BUFFER_SIZE);
1623
1624    if (!bytes) {
1625        return 0;
1626    }
1627    return nbd_co_request(bs, &request, qiov);
1628}
1629
1630static int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
1631                                       int bytes, BdrvRequestFlags flags)
1632{
1633    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
1634    NBDRequest request = {
1635        .type = NBD_CMD_WRITE_ZEROES,
1636        .from = offset,
1637        .len = bytes,
1638    };
1639
1640    assert(!(s->info.flags & NBD_FLAG_READ_ONLY));
1641    if (!(s->info.flags & NBD_FLAG_SEND_WRITE_ZEROES)) {
1642        return -ENOTSUP;
1643    }
1644
1645    if (flags & BDRV_REQ_FUA) {
1646        assert(s->info.flags & NBD_FLAG_SEND_FUA);
1647        request.flags |= NBD_CMD_FLAG_FUA;
1648    }
1649    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
1650        request.flags |= NBD_CMD_FLAG_NO_HOLE;
1651    }
1652    if (flags & BDRV_REQ_NO_FALLBACK) {
1653        assert(s->info.flags & NBD_FLAG_SEND_FAST_ZERO);
1654        request.flags |= NBD_CMD_FLAG_FAST_ZERO;
1655    }
1656
1657    if (!bytes) {
1658        return 0;
1659    }
1660    return nbd_co_request(bs, &request, NULL);
1661}
1662
1663static int nbd_client_co_flush(BlockDriverState *bs)
1664{
1665    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
1666    NBDRequest request = { .type = NBD_CMD_FLUSH };
1667
1668    if (!(s->info.flags & NBD_FLAG_SEND_FLUSH)) {
1669        return 0;
1670    }
1671
1672    request.from = 0;
1673    request.len = 0;
1674
1675    return nbd_co_request(bs, &request, NULL);
1676}
1677
1678static int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset,
1679                                  int bytes)
1680{
1681    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
1682    NBDRequest request = {
1683        .type = NBD_CMD_TRIM,
1684        .from = offset,
1685        .len = bytes,
1686    };
1687
1688    assert(!(s->info.flags & NBD_FLAG_READ_ONLY));
1689    if (!(s->info.flags & NBD_FLAG_SEND_TRIM) || !bytes) {
1690        return 0;
1691    }
1692
1693    return nbd_co_request(bs, &request, NULL);
1694}
1695
1696static int coroutine_fn nbd_client_co_block_status(
1697        BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
1698        int64_t *pnum, int64_t *map, BlockDriverState **file)
1699{
1700    int ret, request_ret;
1701    NBDExtent extent = { 0 };
1702    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
1703    Error *local_err = NULL;
1704
1705    NBDRequest request = {
1706        .type = NBD_CMD_BLOCK_STATUS,
1707        .from = offset,
1708        .len = MIN(QEMU_ALIGN_DOWN(INT_MAX, bs->bl.request_alignment),
1709                   MIN(bytes, s->info.size - offset)),
1710        .flags = NBD_CMD_FLAG_REQ_ONE,
1711    };
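        /*
         * The length above is clamped twice: to the bytes remaining
         * before the server's advertised end (s->info.size - offset),
         * and to the largest request_alignment-aligned value that fits
         * in a signed 32-bit length; e.g. with a 512-byte alignment,
         * QEMU_ALIGN_DOWN(INT_MAX, 512) == 0x7ffffe00.
         */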
1712
1713    if (!s->info.base_allocation) {
1714        *pnum = bytes;
1715        *map = offset;
1716        *file = bs;
1717        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
1718    }
1719
1720    /*
1721     * Work around the fact that the block layer doesn't do
1722     * byte-accurate sizing yet - if the status request exceeds the
1723     * server's advertised size because the block layer rounded the
1724     * size up, we either truncated the request to the server's size
1725     * (in the initializer above) or are being called on just the hole.
1726     */
1727    if (offset >= s->info.size) {
1728        *pnum = bytes;
1729        assert(bytes < BDRV_SECTOR_SIZE);
1730        /* Intentionally don't report offset_valid for the hole */
1731        return BDRV_BLOCK_ZERO;
1732    }
1733
1734    if (s->info.min_block) {
1735        assert(QEMU_IS_ALIGNED(request.len, s->info.min_block));
1736    }
1737    do {
1738        ret = nbd_co_send_request(bs, &request, NULL);
1739        if (ret < 0) {
1740            continue;
1741        }
1742
1743        ret = nbd_co_receive_blockstatus_reply(s, request.handle, bytes,
1744                                               &extent, &request_ret,
1745                                               &local_err);
1746        if (local_err) {
1747            trace_nbd_co_request_fail(request.from, request.len, request.handle,
1748                                      request.flags, request.type,
1749                                      nbd_cmd_lookup(request.type),
1750                                      ret, error_get_pretty(local_err));
1751            error_free(local_err);
1752            local_err = NULL;
1753        }
1754    } while (ret < 0 && nbd_client_connecting_wait(s));
1755
1756    if (ret < 0 || request_ret < 0) {
1757        return ret ? ret : request_ret;
1758    }
1759
1760    assert(extent.length);
1761    *pnum = extent.length;
1762    *map = offset;
1763    *file = bs;
1764    return (extent.flags & NBD_STATE_HOLE ? 0 : BDRV_BLOCK_DATA) |
1765        (extent.flags & NBD_STATE_ZERO ? BDRV_BLOCK_ZERO : 0) |
1766        BDRV_BLOCK_OFFSET_VALID;
1767}
1768
1769static int nbd_client_reopen_prepare(BDRVReopenState *state,
1770                                     BlockReopenQueue *queue, Error **errp)
1771{
1772    BDRVNBDState *s = (BDRVNBDState *)state->bs->opaque;
1773
1774    if ((state->flags & BDRV_O_RDWR) && (s->info.flags & NBD_FLAG_READ_ONLY)) {
1775        error_setg(errp, "Can't reopen read-only NBD mount as read/write");
1776        return -EACCES;
1777    }
1778    return 0;
1779}
1780
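    /*
     * Yank callback, invoked out of band (e.g. via the QMP 'yank' command)
     * to force-abort a wedged connection: it moves the client to the QUIT
     * state and shuts down the socket so that in-flight requests fail
     * promptly instead of hanging.
     */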
1781static void nbd_yank(void *opaque)
1782{
1783    BlockDriverState *bs = opaque;
1784    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
1785
1786    qatomic_store_release(&s->state, NBD_CLIENT_QUIT);
1787    qio_channel_shutdown(QIO_CHANNEL(s->sioc), QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1788}
1789
1790static void nbd_client_close(BlockDriverState *bs)
1791{
1792    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
1793    NBDRequest request = { .type = NBD_CMD_DISC };
1794
1795    if (s->ioc) {
1796        nbd_send_request(s->ioc, &request);
1797    }
1798
1799    nbd_teardown_connection(bs);
1800}
1801
1802static int nbd_establish_connection(BlockDriverState *bs,
1803                                    SocketAddress *saddr,
1804                                    Error **errp)
1805{
1806    ERRP_GUARD();
1807    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
1808
1809    s->sioc = qio_channel_socket_new();
1810    qio_channel_set_name(QIO_CHANNEL(s->sioc), "nbd-client");
1811
1812    qio_channel_socket_connect_sync(s->sioc, saddr, errp);
1813    if (*errp) {
1814        object_unref(OBJECT(s->sioc));
1815        s->sioc = NULL;
1816        return -1;
1817    }
1818
1819    yank_register_function(BLOCKDEV_YANK_INSTANCE(bs->node_name), nbd_yank, bs);
1820    qio_channel_set_delay(QIO_CHANNEL(s->sioc), false);
1821
1822    return 0;
1823}
1824
1825/* nbd_client_handshake takes ownership of s->sioc; on failure it is unref'ed. */
1826static int nbd_client_handshake(BlockDriverState *bs, Error **errp)
1827{
1828    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
1829    AioContext *aio_context = bdrv_get_aio_context(bs);
1830    int ret;
1831
1832    trace_nbd_client_handshake(s->export);
1833    qio_channel_set_blocking(QIO_CHANNEL(s->sioc), false, NULL);
1834    qio_channel_attach_aio_context(QIO_CHANNEL(s->sioc), aio_context);
1835
1836    s->info.request_sizes = true;
1837    s->info.structured_reply = true;
1838    s->info.base_allocation = true;
1839    s->info.x_dirty_bitmap = g_strdup(s->x_dirty_bitmap);
1840    s->info.name = g_strdup(s->export ?: "");
1841    ret = nbd_receive_negotiate(aio_context, QIO_CHANNEL(s->sioc), s->tlscreds,
1842                                s->hostname, &s->ioc, &s->info, errp);
1843    g_free(s->info.x_dirty_bitmap);
1844    g_free(s->info.name);
1845    if (ret < 0) {
1846        yank_unregister_function(BLOCKDEV_YANK_INSTANCE(bs->node_name),
1847                                 nbd_yank, bs);
1848        object_unref(OBJECT(s->sioc));
1849        s->sioc = NULL;
1850        return ret;
1851    }
1852    if (s->x_dirty_bitmap) {
1853        if (!s->info.base_allocation) {
1854            error_setg(errp, "requested x-dirty-bitmap %s not found",
1855                       s->x_dirty_bitmap);
1856            ret = -EINVAL;
1857            goto fail;
1858        }
1859        if (strcmp(s->x_dirty_bitmap, "qemu:allocation-depth") == 0) {
1860            s->alloc_depth = true;
1861        }
1862    }
1863    if (s->info.flags & NBD_FLAG_READ_ONLY) {
1864        ret = bdrv_apply_auto_read_only(bs, "NBD export is read-only", errp);
1865        if (ret < 0) {
1866            goto fail;
1867        }
1868    }
1869    if (s->info.flags & NBD_FLAG_SEND_FUA) {
1870        bs->supported_write_flags = BDRV_REQ_FUA;
1871        bs->supported_zero_flags |= BDRV_REQ_FUA;
1872    }
1873    if (s->info.flags & NBD_FLAG_SEND_WRITE_ZEROES) {
1874        bs->supported_zero_flags |= BDRV_REQ_MAY_UNMAP;
1875        if (s->info.flags & NBD_FLAG_SEND_FAST_ZERO) {
1876            bs->supported_zero_flags |= BDRV_REQ_NO_FALLBACK;
1877        }
1878    }
1879
1880    if (!s->ioc) {
1881        s->ioc = QIO_CHANNEL(s->sioc);
1882        object_ref(OBJECT(s->ioc));
1883    }
1884
1885    trace_nbd_client_handshake_success(s->export);
1886
1887    return 0;
1888
1889 fail:
1890    /*
1891     * We have connected, but must fail for other reasons.
1892     * Send NBD_CMD_DISC as a courtesy to the server.
1893     */
1894    {
1895        NBDRequest request = { .type = NBD_CMD_DISC };
1896
1897        nbd_send_request(s->ioc ?: QIO_CHANNEL(s->sioc), &request);
1898
1899        yank_unregister_function(BLOCKDEV_YANK_INSTANCE(bs->node_name),
1900                                 nbd_yank, bs);
1901        object_unref(OBJECT(s->sioc));
1902        s->sioc = NULL;
1903
1904        return ret;
1905    }
1906}
1907
1908/*
1909 * Parse nbd_open options
1910 */
1911
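    /*
     * Examples of the URI syntax accepted below (illustrative values):
     *
     *   nbd://example.com:10809/my-export
     *     -> server.type=inet, server.host=example.com,
     *        server.port=10809, export=my-export
     *   nbd+unix:///my-export?socket=/tmp/nbd.sock
     *     -> server.type=unix, server.path=/tmp/nbd.sock,
     *        export=my-export
     */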
1912static int nbd_parse_uri(const char *filename, QDict *options)
1913{
1914    URI *uri;
1915    const char *p;
1916    QueryParams *qp = NULL;
1917    int ret = 0;
1918    bool is_unix;
1919
1920    uri = uri_parse(filename);
1921    if (!uri) {
1922        return -EINVAL;
1923    }
1924
1925    /* transport */
1926    if (!g_strcmp0(uri->scheme, "nbd")) {
1927        is_unix = false;
1928    } else if (!g_strcmp0(uri->scheme, "nbd+tcp")) {
1929        is_unix = false;
1930    } else if (!g_strcmp0(uri->scheme, "nbd+unix")) {
1931        is_unix = true;
1932    } else {
1933        ret = -EINVAL;
1934        goto out;
1935    }
1936
1937    p = uri->path ? uri->path : "";
1938    if (p[0] == '/') {
1939        p++;
1940    }
1941    if (p[0]) {
1942        qdict_put_str(options, "export", p);
1943    }
1944
1945    qp = query_params_parse(uri->query);
1946    if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) {
1947        ret = -EINVAL;
1948        goto out;
1949    }
1950
1951    if (is_unix) {
1952        /* nbd+unix:///export?socket=path */
1953        if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) {
1954            ret = -EINVAL;
1955            goto out;
1956        }
1957        qdict_put_str(options, "server.type", "unix");
1958        qdict_put_str(options, "server.path", qp->p[0].value);
1959    } else {
1960        QString *host;
1961        char *port_str;
1962
1963        /* nbd[+tcp]://host[:port]/export */
1964        if (!uri->server) {
1965            ret = -EINVAL;
1966            goto out;
1967        }
1968
1969        /* strip braces from literal IPv6 address */
1970        if (uri->server[0] == '[') {
1971            host = qstring_from_substr(uri->server, 1,
1972                                       strlen(uri->server) - 1);
1973        } else {
1974            host = qstring_from_str(uri->server);
1975        }
1976
1977        qdict_put_str(options, "server.type", "inet");
1978        qdict_put(options, "server.host", host);
1979
1980        port_str = g_strdup_printf("%d", uri->port ?: NBD_DEFAULT_PORT);
1981        qdict_put_str(options, "server.port", port_str);
1982        g_free(port_str);
1983    }
1984
1985out:
1986    if (qp) {
1987        query_params_free(qp);
1988    }
1989    uri_free(uri);
1990    return ret;
1991}
1992
1993static bool nbd_has_filename_options_conflict(QDict *options, Error **errp)
1994{
1995    const QDictEntry *e;
1996
1997    for (e = qdict_first(options); e; e = qdict_next(options, e)) {
1998        if (!strcmp(e->key, "host") ||
1999            !strcmp(e->key, "port") ||
2000            !strcmp(e->key, "path") ||
2001            !strcmp(e->key, "export") ||
2002            strstart(e->key, "server.", NULL))
2003        {
2004            error_setg(errp, "Option '%s' cannot be used with a file name",
2005                       e->key);
2006            return true;
2007        }
2008    }
2009
2010    return false;
2011}
2012
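    /*
     * Examples of the legacy pseudo-filename syntax handled below
     * (illustrative values; names containing "://" are diverted to
     * nbd_parse_uri() instead):
     *
     *   nbd:unix:/tmp/nbd.sock:exportname=my-export
     *   nbd:example.com:10809:exportname=my-export
     */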
2013static void nbd_parse_filename(const char *filename, QDict *options,
2014                               Error **errp)
2015{
2016    g_autofree char *file = NULL;
2017    char *export_name;
2018    const char *host_spec;
2019    const char *unixpath;
2020
2021    if (nbd_has_filename_options_conflict(options, errp)) {
2022        return;
2023    }
2024
2025    if (strstr(filename, "://")) {
2026        int ret = nbd_parse_uri(filename, options);
2027        if (ret < 0) {
2028            error_setg(errp, "No valid URL specified");
2029        }
2030        return;
2031    }
2032
2033    file = g_strdup(filename);
2034
2035    export_name = strstr(file, EN_OPTSTR);
2036    if (export_name) {
2037        if (export_name[strlen(EN_OPTSTR)] == 0) {
2038            return;
2039        }
2040        export_name[0] = 0; /* truncate 'file' */
2041        export_name += strlen(EN_OPTSTR);
2042
2043        qdict_put_str(options, "export", export_name);
2044    }
2045
2046    /* extract the host_spec - fail if it's not nbd:... */
2047    if (!strstart(file, "nbd:", &host_spec)) {
2048        error_setg(errp, "File name string for NBD must start with 'nbd:'");
2049        return;
2050    }
2051
2052    if (!*host_spec) {
2053        return;
2054    }
2055
2056    /* are we a UNIX or TCP socket? */
2057    if (strstart(host_spec, "unix:", &unixpath)) {
2058        qdict_put_str(options, "server.type", "unix");
2059        qdict_put_str(options, "server.path", unixpath);
2060    } else {
2061        InetSocketAddress *addr = g_new(InetSocketAddress, 1);
2062
2063        if (inet_parse(addr, host_spec, errp)) {
2064            goto out_inet;
2065        }
2066
2067        qdict_put_str(options, "server.type", "inet");
2068        qdict_put_str(options, "server.host", addr->host);
2069        qdict_put_str(options, "server.port", addr->port);
2070    out_inet:
2071        qapi_free_InetSocketAddress(addr);
2072    }
2073}
2074
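    /*
     * Mapping of the legacy flat options to the structured server.* keys,
     * e.g. (illustrative values):
     *   path=/tmp/nbd.sock -> server.type=unix, server.path=/tmp/nbd.sock
     *   host=example.com   -> server.type=inet, server.host=example.com,
     *                         server.port=10809 (NBD_DEFAULT_PORT)
     */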
2075static bool nbd_process_legacy_socket_options(QDict *output_options,
2076                                              QemuOpts *legacy_opts,
2077                                              Error **errp)
2078{
2079    const char *path = qemu_opt_get(legacy_opts, "path");
2080    const char *host = qemu_opt_get(legacy_opts, "host");
2081    const char *port = qemu_opt_get(legacy_opts, "port");
2082    const QDictEntry *e;
2083
2084    if (!path && !host && !port) {
2085        return true;
2086    }
2087
2088    for (e = qdict_first(output_options); e; e = qdict_next(output_options, e))
2089    {
2090        if (strstart(e->key, "server.", NULL)) {
2091            error_setg(errp, "Cannot use 'server' and path/host/port at the "
2092                       "same time");
2093            return false;
2094        }
2095    }
2096
2097    if (path && host) {
2098        error_setg(errp, "path and host may not be used at the same time");
2099        return false;
2100    } else if (path) {
2101        if (port) {
2102            error_setg(errp, "port may not be used without host");
2103            return false;
2104        }
2105
2106        qdict_put_str(output_options, "server.type", "unix");
2107        qdict_put_str(output_options, "server.path", path);
2108    } else if (host) {
2109        qdict_put_str(output_options, "server.type", "inet");
2110        qdict_put_str(output_options, "server.host", host);
2111        qdict_put_str(output_options, "server.port",
2112                      port ?: stringify(NBD_DEFAULT_PORT));
2113    }
2114
2115    return true;
2116}
2117
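    /*
     * Turns the flat "server." sub-options extracted below (e.g.
     * server.type=inet, server.host=..., server.port=...) into a
     * SocketAddress by running them through a QObject input visitor.
     */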
2118static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options,
2119                                 Error **errp)
2120{
2121    SocketAddress *saddr = NULL;
2122    QDict *addr = NULL;
2123    Visitor *iv = NULL;
2124
2125    qdict_extract_subqdict(options, &addr, "server.");
2126    if (!qdict_size(addr)) {
2127        error_setg(errp, "NBD server address missing");
2128        goto done;
2129    }
2130
2131    iv = qobject_input_visitor_new_flat_confused(addr, errp);
2132    if (!iv) {
2133        goto done;
2134    }
2135
2136    if (!visit_type_SocketAddress(iv, NULL, &saddr, errp)) {
2137        goto done;
2138    }
2139
2140done:
2141    qobject_unref(addr);
2142    visit_free(iv);
2143    return saddr;
2144}
2145
2146static QCryptoTLSCreds *nbd_get_tls_creds(const char *id, Error **errp)
2147{
2148    Object *obj;
2149    QCryptoTLSCreds *creds;
2150
2151    obj = object_resolve_path_component(
2152        object_get_objects_root(), id);
2153    if (!obj) {
2154        error_setg(errp, "No TLS credentials with id '%s'",
2155                   id);
2156        return NULL;
2157    }
2158    creds = (QCryptoTLSCreds *)
2159        object_dynamic_cast(obj, TYPE_QCRYPTO_TLS_CREDS);
2160    if (!creds) {
2161        error_setg(errp, "Object with id '%s' is not TLS credentials",
2162                   id);
2163        return NULL;
2164    }
2165
2166    if (creds->endpoint != QCRYPTO_TLS_CREDS_ENDPOINT_CLIENT) {
2167        error_setg(errp,
2168                   "Expecting TLS credentials with a client endpoint");
2169        return NULL;
2170    }
2171    object_ref(obj);
2172    return creds;
2173}
2174
2175
2176static QemuOptsList nbd_runtime_opts = {
2177    .name = "nbd",
2178    .head = QTAILQ_HEAD_INITIALIZER(nbd_runtime_opts.head),
2179    .desc = {
2180        {
2181            .name = "host",
2182            .type = QEMU_OPT_STRING,
2183            .help = "TCP host to connect to",
2184        },
2185        {
2186            .name = "port",
2187            .type = QEMU_OPT_STRING,
2188            .help = "TCP port to connect to",
2189        },
2190        {
2191            .name = "path",
2192            .type = QEMU_OPT_STRING,
2193            .help = "Unix socket path to connect to",
2194        },
2195        {
2196            .name = "export",
2197            .type = QEMU_OPT_STRING,
2198            .help = "Name of the NBD export to open",
2199        },
2200        {
2201            .name = "tls-creds",
2202            .type = QEMU_OPT_STRING,
2203            .help = "ID of the TLS credentials to use",
2204        },
2205        {
2206            .name = "x-dirty-bitmap",
2207            .type = QEMU_OPT_STRING,
2208            .help = "experimental: expose named dirty bitmap in place of "
2209                    "block status",
2210        },
2211        {
2212            .name = "reconnect-delay",
2213            .type = QEMU_OPT_NUMBER,
2214            .help = "On an unexpected disconnect, the nbd client tries to "
2215                    "connect again until succeeding or encountering a serious "
2216                    "error.  During the first @reconnect-delay seconds, all "
2217                    "requests are paused and will be rerun on a successful "
2218                    "reconnect. After that time, any delayed requests and all "
2219                    "future requests before a successful reconnect will "
2220                    "immediately fail. Default 0",
2221        },
2222        { /* end of list */ }
2223    },
2224};
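    /*
     * Example (illustrative) of these options on a command line:
     *
     *   -blockdev driver=nbd,node-name=nbd0,server.type=inet,\
     *       server.host=example.com,server.port=10809,\
     *       export=my-export,reconnect-delay=30
     */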
2225
2226static int nbd_process_options(BlockDriverState *bs, QDict *options,
2227                               Error **errp)
2228{
2229    BDRVNBDState *s = bs->opaque;
2230    QemuOpts *opts;
2231    int ret = -EINVAL;
2232
2233    opts = qemu_opts_create(&nbd_runtime_opts, NULL, 0, &error_abort);
2234    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
2235        goto error;
2236    }
2237
2238    /* Translate @host, @port, and @path to a SocketAddress */
2239    if (!nbd_process_legacy_socket_options(options, opts, errp)) {
2240        goto error;
2241    }
2242
2243    /* Pop the config into our state object. Exit if invalid. */
2244    s->saddr = nbd_config(s, options, errp);
2245    if (!s->saddr) {
2246        goto error;
2247    }
2248
2249    s->export = g_strdup(qemu_opt_get(opts, "export"));
2250    if (s->export && strlen(s->export) > NBD_MAX_STRING_SIZE) {
2251        error_setg(errp, "export name too long to send to server");
2252        goto error;
2253    }
2254
2255    s->tlscredsid = g_strdup(qemu_opt_get(opts, "tls-creds"));
2256    if (s->tlscredsid) {
2257        s->tlscreds = nbd_get_tls_creds(s->tlscredsid, errp);
2258        if (!s->tlscreds) {
2259            goto error;
2260        }
2261
2262        /* TODO SOCKET_ADDRESS_KIND_FD where fd has AF_INET or AF_INET6 */
2263        if (s->saddr->type != SOCKET_ADDRESS_TYPE_INET) {
2264            error_setg(errp, "TLS only supported over IP sockets");
2265            goto error;
2266        }
2267        s->hostname = s->saddr->u.inet.host;
2268    }
2269
2270    s->x_dirty_bitmap = g_strdup(qemu_opt_get(opts, "x-dirty-bitmap"));
2271    if (s->x_dirty_bitmap && strlen(s->x_dirty_bitmap) > NBD_MAX_STRING_SIZE) {
2272        error_setg(errp, "x-dirty-bitmap query too long to send to server");
2273        goto error;
2274    }
2275
2276    s->reconnect_delay = qemu_opt_get_number(opts, "reconnect-delay", 0);
2277
2278    ret = 0;
2279
2280 error:
2281    if (ret < 0) {
2282        nbd_clear_bdrvstate(s);
2283    }
2284    qemu_opts_del(opts);
2285    return ret;
2286}
2287
2288static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
2289                    Error **errp)
2290{
2291    int ret;
2292    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
2293
2294    ret = nbd_process_options(bs, options, errp);
2295    if (ret < 0) {
2296        return ret;
2297    }
2298
2299    s->bs = bs;
2300    qemu_co_mutex_init(&s->send_mutex);
2301    qemu_co_queue_init(&s->free_sema);
2302
2303    if (!yank_register_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name), errp)) {
2304        return -EEXIST;
2305    }
2306
2307    /*
2308     * Establish the socket connection; return an error if it fails.
2309     * TODO: Configurable retry-until-timeout behaviour.
2310     */
2311    if (nbd_establish_connection(bs, s->saddr, errp) < 0) {
2312        yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name));
2313        return -ECONNREFUSED;
2314    }
2315
2316    ret = nbd_client_handshake(bs, errp);
2317    if (ret < 0) {
2318        yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name));
2319        nbd_clear_bdrvstate(s);
2320        return ret;
2321    }
2322    /* successfully connected */
2323    s->state = NBD_CLIENT_CONNECTED;
2324
2325    nbd_init_connect_thread(s);
2326
2327    s->connection_co = qemu_coroutine_create(nbd_connection_entry, s);
2328    bdrv_inc_in_flight(bs);
2329    aio_co_schedule(bdrv_get_aio_context(bs), s->connection_co);
2330
2331    return 0;
2332}
2333
2334static int nbd_co_flush(BlockDriverState *bs)
2335{
2336    return nbd_client_co_flush(bs);
2337}
2338
2339static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
2340{
2341    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
2342    uint32_t min = s->info.min_block;
2343    uint32_t max = MIN_NON_ZERO(NBD_MAX_BUFFER_SIZE, s->info.max_block);
2344
2345    /*
2346     * If the server did not advertise an alignment:
2347     * - a size that is not sector-aligned implies that an alignment
2348     *   of 1 can be used to access those tail bytes
2349     * - advertisement of block status requires an alignment of 1, so
2350     *   that we don't violate block layer constraints that block
2351     *   status is always aligned (as we can't control whether the
2352     *   server will report sub-sector extents, such as a hole at EOF
2353     *   on an unaligned POSIX file)
2354     * - otherwise, assume the server is so old that we are safer avoiding
2355     *   sub-sector requests
2356     */
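        /*
         * For example (illustrative sizes): a 1000-byte export with no
         * advertised min_block gets request_alignment 1, while a
         * sector-aligned export from a server without base:allocation
         * support keeps the conservative BDRV_SECTOR_SIZE alignment.
         */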
2357    if (!min) {
2358        min = (!QEMU_IS_ALIGNED(s->info.size, BDRV_SECTOR_SIZE) ||
2359               s->info.base_allocation) ? 1 : BDRV_SECTOR_SIZE;
2360    }
2361
2362    bs->bl.request_alignment = min;
2363    bs->bl.max_pdiscard = QEMU_ALIGN_DOWN(INT_MAX, min);
2364    bs->bl.max_pwrite_zeroes = max;
2365    bs->bl.max_transfer = max;
2366
2367    if (s->info.opt_block &&
2368        s->info.opt_block > bs->bl.opt_transfer) {
2369        bs->bl.opt_transfer = s->info.opt_block;
2370    }
2371}
2372
2373static void nbd_close(BlockDriverState *bs)
2374{
2375    BDRVNBDState *s = bs->opaque;
2376
2377    nbd_client_close(bs);
2378    yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name));
2379    nbd_clear_bdrvstate(s);
2380}
2381
2382/*
2383 * NBD cannot truncate, but if the caller asks to truncate to the same size, or
2384 * to a smaller size with exact=false, there is no reason to fail the
2385 * operation.
2386 *
2387 * Preallocation mode is ignored since it does not seem useful to fail when
2388 * we never change anything.
2389 */
2390static int coroutine_fn nbd_co_truncate(BlockDriverState *bs, int64_t offset,
2391                                        bool exact, PreallocMode prealloc,
2392                                        BdrvRequestFlags flags, Error **errp)
2393{
2394    BDRVNBDState *s = bs->opaque;
2395
2396    if (offset != s->info.size && exact) {
2397        error_setg(errp, "Cannot resize NBD nodes");
2398        return -ENOTSUP;
2399    }
2400
2401    if (offset > s->info.size) {
2402        error_setg(errp, "Cannot grow NBD nodes");
2403        return -EINVAL;
2404    }
2405
2406    return 0;
2407}
2408
2409static int64_t nbd_getlength(BlockDriverState *bs)
2410{
2411    BDRVNBDState *s = bs->opaque;
2412
2413    return s->info.size;
2414}
2415
2416static void nbd_refresh_filename(BlockDriverState *bs)
2417{
2418    BDRVNBDState *s = bs->opaque;
2419    const char *host = NULL, *port = NULL, *path = NULL;
2420    size_t len = 0;
2421
2422    if (s->saddr->type == SOCKET_ADDRESS_TYPE_INET) {
2423        const InetSocketAddress *inet = &s->saddr->u.inet;
2424        if (!inet->has_ipv4 && !inet->has_ipv6 && !inet->has_to) {
2425            host = inet->host;
2426            port = inet->port;
2427        }
2428    } else if (s->saddr->type == SOCKET_ADDRESS_TYPE_UNIX) {
2429        path = s->saddr->u.q_unix.path;
2430    } /* else can't represent as pseudo-filename */
2431
2432    if (path && s->export) {
2433        len = snprintf(bs->exact_filename, sizeof(bs->exact_filename),
2434                       "nbd+unix:///%s?socket=%s", s->export, path);
2435    } else if (path && !s->export) {
2436        len = snprintf(bs->exact_filename, sizeof(bs->exact_filename),
2437                       "nbd+unix://?socket=%s", path);
2438    } else if (host && s->export) {
2439        len = snprintf(bs->exact_filename, sizeof(bs->exact_filename),
2440                       "nbd://%s:%s/%s", host, port, s->export);
2441    } else if (host && !s->export) {
2442        len = snprintf(bs->exact_filename, sizeof(bs->exact_filename),
2443                       "nbd://%s:%s", host, port);
2444    }
2445    if (len >= sizeof(bs->exact_filename)) {
2446        /* Name is too long to represent exactly, so leave it empty. */
2447        bs->exact_filename[0] = '\0';
2448    }
2449}
2450
2451static char *nbd_dirname(BlockDriverState *bs, Error **errp)
2452{
2453    /* The generic bdrv_dirname() implementation is able to work out some
2454     * directory name for NBD nodes, but that would be wrong. So far there is no
2455     * specification for how "export paths" would work, so NBD does not have
2456     * directory names. */
2457    error_setg(errp, "Cannot generate a base directory for NBD nodes");
2458    return NULL;
2459}
2460
2461static const char *const nbd_strong_runtime_opts[] = {
2462    "path",
2463    "host",
2464    "port",
2465    "export",
2466    "tls-creds",
2467    "server.",
2468
2469    NULL
2470};
2471
2472static void nbd_cancel_in_flight(BlockDriverState *bs)
2473{
2474    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
2475
2476    reconnect_delay_timer_del(s);
2477
2478    if (s->state == NBD_CLIENT_CONNECTING_WAIT) {
2479        s->state = NBD_CLIENT_CONNECTING_NOWAIT;
2480        qemu_co_queue_restart_all(&s->free_sema);
2481    }
2482}
2483
2484static BlockDriver bdrv_nbd = {
2485    .format_name                = "nbd",
2486    .protocol_name              = "nbd",
2487    .instance_size              = sizeof(BDRVNBDState),
2488    .bdrv_parse_filename        = nbd_parse_filename,
2489    .bdrv_co_create_opts        = bdrv_co_create_opts_simple,
2490    .create_opts                = &bdrv_create_opts_simple,
2491    .bdrv_file_open             = nbd_open,
2492    .bdrv_reopen_prepare        = nbd_client_reopen_prepare,
2493    .bdrv_co_preadv             = nbd_client_co_preadv,
2494    .bdrv_co_pwritev            = nbd_client_co_pwritev,
2495    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
2496    .bdrv_close                 = nbd_close,
2497    .bdrv_co_flush_to_os        = nbd_co_flush,
2498    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
2499    .bdrv_refresh_limits        = nbd_refresh_limits,
2500    .bdrv_co_truncate           = nbd_co_truncate,
2501    .bdrv_getlength             = nbd_getlength,
2502    .bdrv_detach_aio_context    = nbd_client_detach_aio_context,
2503    .bdrv_attach_aio_context    = nbd_client_attach_aio_context,
2504    .bdrv_co_drain_begin        = nbd_client_co_drain_begin,
2505    .bdrv_co_drain_end          = nbd_client_co_drain_end,
2506    .bdrv_refresh_filename      = nbd_refresh_filename,
2507    .bdrv_co_block_status       = nbd_client_co_block_status,
2508    .bdrv_dirname               = nbd_dirname,
2509    .strong_runtime_opts        = nbd_strong_runtime_opts,
2510    .bdrv_cancel_in_flight      = nbd_cancel_in_flight,
2511};
2512
2513static BlockDriver bdrv_nbd_tcp = {
2514    .format_name                = "nbd",
2515    .protocol_name              = "nbd+tcp",
2516    .instance_size              = sizeof(BDRVNBDState),
2517    .bdrv_parse_filename        = nbd_parse_filename,
2518    .bdrv_co_create_opts        = bdrv_co_create_opts_simple,
2519    .create_opts                = &bdrv_create_opts_simple,
2520    .bdrv_file_open             = nbd_open,
2521    .bdrv_reopen_prepare        = nbd_client_reopen_prepare,
2522    .bdrv_co_preadv             = nbd_client_co_preadv,
2523    .bdrv_co_pwritev            = nbd_client_co_pwritev,
2524    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
2525    .bdrv_close                 = nbd_close,
2526    .bdrv_co_flush_to_os        = nbd_co_flush,
2527    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
2528    .bdrv_refresh_limits        = nbd_refresh_limits,
2529    .bdrv_co_truncate           = nbd_co_truncate,
2530    .bdrv_getlength             = nbd_getlength,
2531    .bdrv_detach_aio_context    = nbd_client_detach_aio_context,
2532    .bdrv_attach_aio_context    = nbd_client_attach_aio_context,
2533    .bdrv_co_drain_begin        = nbd_client_co_drain_begin,
2534    .bdrv_co_drain_end          = nbd_client_co_drain_end,
2535    .bdrv_refresh_filename      = nbd_refresh_filename,
2536    .bdrv_co_block_status       = nbd_client_co_block_status,
2537    .bdrv_dirname               = nbd_dirname,
2538    .strong_runtime_opts        = nbd_strong_runtime_opts,
2539    .bdrv_cancel_in_flight      = nbd_cancel_in_flight,
2540};
2541
2542static BlockDriver bdrv_nbd_unix = {
2543    .format_name                = "nbd",
2544    .protocol_name              = "nbd+unix",
2545    .instance_size              = sizeof(BDRVNBDState),
2546    .bdrv_parse_filename        = nbd_parse_filename,
2547    .bdrv_co_create_opts        = bdrv_co_create_opts_simple,
2548    .create_opts                = &bdrv_create_opts_simple,
2549    .bdrv_file_open             = nbd_open,
2550    .bdrv_reopen_prepare        = nbd_client_reopen_prepare,
2551    .bdrv_co_preadv             = nbd_client_co_preadv,
2552    .bdrv_co_pwritev            = nbd_client_co_pwritev,
2553    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
2554    .bdrv_close                 = nbd_close,
2555    .bdrv_co_flush_to_os        = nbd_co_flush,
2556    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
2557    .bdrv_refresh_limits        = nbd_refresh_limits,
2558    .bdrv_co_truncate           = nbd_co_truncate,
2559    .bdrv_getlength             = nbd_getlength,
2560    .bdrv_detach_aio_context    = nbd_client_detach_aio_context,
2561    .bdrv_attach_aio_context    = nbd_client_attach_aio_context,
2562    .bdrv_co_drain_begin        = nbd_client_co_drain_begin,
2563    .bdrv_co_drain_end          = nbd_client_co_drain_end,
2564    .bdrv_refresh_filename      = nbd_refresh_filename,
2565    .bdrv_co_block_status       = nbd_client_co_block_status,
2566    .bdrv_dirname               = nbd_dirname,
2567    .strong_runtime_opts        = nbd_strong_runtime_opts,
2568    .bdrv_cancel_in_flight      = nbd_cancel_in_flight,
2569};
2570
2571static void bdrv_nbd_init(void)
2572{
2573    bdrv_register(&bdrv_nbd);
2574    bdrv_register(&bdrv_nbd_tcp);
2575    bdrv_register(&bdrv_nbd_unix);
2576}
2577
2578block_init(bdrv_nbd_init);
2579