qemu/nbd/server.c
<<
>>
Prefs
   1/*
   2 *  Copyright (C) 2016-2018 Red Hat, Inc.
   3 *  Copyright (C) 2005  Anthony Liguori <anthony@codemonkey.ws>
   4 *
   5 *  Network Block Device Server Side
   6 *
   7 *  This program is free software; you can redistribute it and/or modify
   8 *  it under the terms of the GNU General Public License as published by
   9 *  the Free Software Foundation; under version 2 of the License.
  10 *
  11 *  This program is distributed in the hope that it will be useful,
  12 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 *  GNU General Public License for more details.
  15 *
  16 *  You should have received a copy of the GNU General Public License
  17 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "qapi/error.h"
  22#include "qemu/queue.h"
  23#include "trace.h"
  24#include "nbd-internal.h"
  25#include "qemu/units.h"
  26
/*
 * NOTE(review): presumably the context ids reported to the client for the
 * base:allocation and qemu:dirty-bitmap meta contexts; their users lie
 * outside this section -- confirm.
 */
#define NBD_META_ID_BASE_ALLOCATION 0
#define NBD_META_ID_DIRTY_BITMAP 1

/*
 * NBD_MAX_BLOCK_STATUS_EXTENTS: 1 MiB of extents data. An empirical
 * constant. If an increase is needed, note that the NBD protocol
 * recommends no larger than 32 MiB, so that the client won't consider
 * the reply as a denial of service attack.
 *
 * NOTE(review): the value is a count of extents; the divisor 8 is
 * presumably the wire size of one extent in bytes -- confirm.
 */
#define NBD_MAX_BLOCK_STATUS_EXTENTS (1 * MiB / 8)
  37
  38static int system_errno_to_nbd_errno(int err)
  39{
  40    switch (err) {
  41    case 0:
  42        return NBD_SUCCESS;
  43    case EPERM:
  44    case EROFS:
  45        return NBD_EPERM;
  46    case EIO:
  47        return NBD_EIO;
  48    case ENOMEM:
  49        return NBD_ENOMEM;
  50#ifdef EDQUOT
  51    case EDQUOT:
  52#endif
  53    case EFBIG:
  54    case ENOSPC:
  55        return NBD_ENOSPC;
  56    case EOVERFLOW:
  57        return NBD_EOVERFLOW;
  58    case ENOTSUP:
  59#if ENOTSUP != EOPNOTSUPP
  60    case EOPNOTSUPP:
  61#endif
  62        return NBD_ENOTSUP;
  63    case ESHUTDOWN:
  64        return NBD_ESHUTDOWN;
  65    case EINVAL:
  66    default:
  67        return NBD_EINVAL;
  68    }
  69}
  70
  71/* Definitions for opaque data types */
  72
typedef struct NBDRequestData NBDRequestData;

/*
 * Per-request bookkeeping.
 * NOTE(review): the code that allocates and consumes these objects lies
 * outside this section; the field notes below are inferred from the
 * declarations only -- confirm against the users.
 */
struct NBDRequestData {
    QSIMPLEQ_ENTRY(NBDRequestData) entry; /* simple-queue linkage */
    NBDClient *client;                    /* client associated with the request */
    uint8_t *data;                        /* presumably the request payload buffer */
    bool complete;                        /* presumably set once handling finished */
};
  81
/* One block device made available to NBD clients. */
struct NBDExport {
    int refcount; /* clients take a reference via nbd_export_get() */
    void (*close)(NBDExport *exp);

    BlockBackend *blk; /* backend queried for alignment and max transfer */
    char *name; /* name advertised by NBD_OPT_LIST; may be NULL */
    char *description; /* optional description; may be NULL */
    uint64_t dev_offset;
    uint64_t size; /* size reported to the client during negotiation */
    uint16_t nbdflags; /* transmission flags reported to the client */
    QTAILQ_HEAD(, NBDClient) clients; /* clients that selected this export */
    QTAILQ_ENTRY(NBDExport) next; /* linkage in the global exports list */

    AioContext *ctx;

    BlockBackend *eject_notifier_blk;
    Notifier eject_notifier;

    /* Dirty bitmap offered under the qemu: meta namespace, or NULL */
    BdrvDirtyBitmap *export_bitmap;
    /* Full context name used when matching qemu:dirty-bitmap: queries */
    char *export_bitmap_context;
};

/* All exports, walked when answering NBD_OPT_LIST. */
static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
 105
/* NBDExportMetaContexts represents a list of contexts to be exported,
 * as selected by NBD_OPT_SET_META_CONTEXT. Also used for
 * NBD_OPT_LIST_META_CONTEXT.
 *
 * nbd_check_meta_export() clears @valid when @exp differs from the export
 * the client finally selected. */
typedef struct NBDExportMetaContexts {
    NBDExport *exp; /* export the contexts were negotiated for */
    bool valid; /* means that negotiation of the option finished without
                   errors */
    bool base_allocation; /* export base:allocation context (block status) */
    bool bitmap; /* export qemu:dirty-bitmap:<export bitmap name> */
} NBDExportMetaContexts;
 116
/* State of one client connection, during negotiation and afterwards. */
struct NBDClient {
    int refcount;
    void (*close_fn)(NBDClient *client, bool negotiated);

    NBDExport *exp; /* export selected by NBD_OPT_EXPORT_NAME or NBD_OPT_GO */
    QCryptoTLSCreds *tlscreds; /* passed to the TLS channel on NBD_OPT_STARTTLS */
    char *tlsauthz; /* passed to the TLS channel on NBD_OPT_STARTTLS */
    QIOChannelSocket *sioc; /* The underlying data channel */
    QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */

    Coroutine *recv_coroutine;

    CoMutex send_lock;
    Coroutine *send_coroutine;

    QTAILQ_ENTRY(NBDClient) next; /* linkage in NBDExport.clients */
    int nb_requests;
    bool closing;

    uint32_t check_align; /* If non-zero, check for aligned client requests */

    /* When set, NBD_FLAG_SEND_DF is added to the advertised export flags */
    bool structured_reply;
    NBDExportMetaContexts export_meta;

    uint32_t opt; /* Current option being negotiated */
    uint32_t optlen; /* remaining length of data in ioc for the option being
                        negotiated now */
};
 145
 146static void nbd_client_receive_next_request(NBDClient *client);
 147
 148/* Basic flow for negotiation
 149
 150   Server         Client
 151   Negotiate
 152
 153   or
 154
 155   Server         Client
 156   Negotiate #1
 157                  Option
 158   Negotiate #2
 159
 160   ----
 161
 162   followed by
 163
 164   Server         Client
 165                  Request
 166   Response
 167                  Request
 168   Response
 169                  ...
 170   ...
 171                  Request (type == 2)
 172
 173*/
 174
 175static inline void set_be_option_rep(NBDOptionReply *rep, uint32_t option,
 176                                     uint32_t type, uint32_t length)
 177{
 178    stq_be_p(&rep->magic, NBD_REP_MAGIC);
 179    stl_be_p(&rep->option, option);
 180    stl_be_p(&rep->type, type);
 181    stl_be_p(&rep->length, length);
 182}
 183
 184/* Send a reply header, including length, but no payload.
 185 * Return -errno on error, 0 on success. */
 186static int nbd_negotiate_send_rep_len(NBDClient *client, uint32_t type,
 187                                      uint32_t len, Error **errp)
 188{
 189    NBDOptionReply rep;
 190
 191    trace_nbd_negotiate_send_rep_len(client->opt, nbd_opt_lookup(client->opt),
 192                                     type, nbd_rep_lookup(type), len);
 193
 194    assert(len < NBD_MAX_BUFFER_SIZE);
 195
 196    set_be_option_rep(&rep, client->opt, type, len);
 197    return nbd_write(client->ioc, &rep, sizeof(rep), errp);
 198}
 199
 200/* Send a reply header with default 0 length.
 201 * Return -errno on error, 0 on success. */
 202static int nbd_negotiate_send_rep(NBDClient *client, uint32_t type,
 203                                  Error **errp)
 204{
 205    return nbd_negotiate_send_rep_len(client, type, 0, errp);
 206}
 207
 208/* Send an error reply.
 209 * Return -errno on error, 0 on success. */
 210static int GCC_FMT_ATTR(4, 0)
 211nbd_negotiate_send_rep_verr(NBDClient *client, uint32_t type,
 212                            Error **errp, const char *fmt, va_list va)
 213{
 214    g_autofree char *msg = NULL;
 215    int ret;
 216    size_t len;
 217
 218    msg = g_strdup_vprintf(fmt, va);
 219    len = strlen(msg);
 220    assert(len < NBD_MAX_STRING_SIZE);
 221    trace_nbd_negotiate_send_rep_err(msg);
 222    ret = nbd_negotiate_send_rep_len(client, type, len, errp);
 223    if (ret < 0) {
 224        return ret;
 225    }
 226    if (nbd_write(client->ioc, msg, len, errp) < 0) {
 227        error_prepend(errp, "write failed (error message): ");
 228        return -EIO;
 229    }
 230
 231    return 0;
 232}
 233
 234/*
 235 * Return a malloc'd copy of @name suitable for use in an error reply.
 236 */
 237static char *
 238nbd_sanitize_name(const char *name)
 239{
 240    if (strnlen(name, 80) < 80) {
 241        return g_strdup(name);
 242    }
 243    /* XXX Should we also try to sanitize any control characters? */
 244    return g_strdup_printf("%.80s...", name);
 245}
 246
 247/* Send an error reply.
 248 * Return -errno on error, 0 on success. */
 249static int GCC_FMT_ATTR(4, 5)
 250nbd_negotiate_send_rep_err(NBDClient *client, uint32_t type,
 251                           Error **errp, const char *fmt, ...)
 252{
 253    va_list va;
 254    int ret;
 255
 256    va_start(va, fmt);
 257    ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va);
 258    va_end(va);
 259    return ret;
 260}
 261
 262/* Drop remainder of the current option, and send a reply with the
 263 * given error type and message. Return -errno on read or write
 264 * failure; or 0 if connection is still live. */
 265static int GCC_FMT_ATTR(4, 0)
 266nbd_opt_vdrop(NBDClient *client, uint32_t type, Error **errp,
 267              const char *fmt, va_list va)
 268{
 269    int ret = nbd_drop(client->ioc, client->optlen, errp);
 270
 271    client->optlen = 0;
 272    if (!ret) {
 273        ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va);
 274    }
 275    return ret;
 276}
 277
 278static int GCC_FMT_ATTR(4, 5)
 279nbd_opt_drop(NBDClient *client, uint32_t type, Error **errp,
 280             const char *fmt, ...)
 281{
 282    int ret;
 283    va_list va;
 284
 285    va_start(va, fmt);
 286    ret = nbd_opt_vdrop(client, type, errp, fmt, va);
 287    va_end(va);
 288
 289    return ret;
 290}
 291
 292static int GCC_FMT_ATTR(3, 4)
 293nbd_opt_invalid(NBDClient *client, Error **errp, const char *fmt, ...)
 294{
 295    int ret;
 296    va_list va;
 297
 298    va_start(va, fmt);
 299    ret = nbd_opt_vdrop(client, NBD_REP_ERR_INVALID, errp, fmt, va);
 300    va_end(va);
 301
 302    return ret;
 303}
 304
 305/* Read size bytes from the unparsed payload of the current option.
 306 * Return -errno on I/O error, 0 if option was completely handled by
 307 * sending a reply about inconsistent lengths, or 1 on success. */
 308static int nbd_opt_read(NBDClient *client, void *buffer, size_t size,
 309                        Error **errp)
 310{
 311    if (size > client->optlen) {
 312        return nbd_opt_invalid(client, errp,
 313                               "Inconsistent lengths in option %s",
 314                               nbd_opt_lookup(client->opt));
 315    }
 316    client->optlen -= size;
 317    return qio_channel_read_all(client->ioc, buffer, size, errp) < 0 ? -EIO : 1;
 318}
 319
 320/* Drop size bytes from the unparsed payload of the current option.
 321 * Return -errno on I/O error, 0 if option was completely handled by
 322 * sending a reply about inconsistent lengths, or 1 on success. */
 323static int nbd_opt_skip(NBDClient *client, size_t size, Error **errp)
 324{
 325    if (size > client->optlen) {
 326        return nbd_opt_invalid(client, errp,
 327                               "Inconsistent lengths in option %s",
 328                               nbd_opt_lookup(client->opt));
 329    }
 330    client->optlen -= size;
 331    return nbd_drop(client->ioc, size, errp) < 0 ? -EIO : 1;
 332}
 333
 334/* nbd_opt_read_name
 335 *
 336 * Read a string with the format:
 337 *   uint32_t len     (<= NBD_MAX_STRING_SIZE)
 338 *   len bytes string (not 0-terminated)
 339 *
 340 * On success, @name will be allocated.
 341 * If @length is non-null, it will be set to the actual string length.
 342 *
 343 * Return -errno on I/O error, 0 if option was completely handled by
 344 * sending a reply about inconsistent lengths, or 1 on success.
 345 */
 346static int nbd_opt_read_name(NBDClient *client, char **name, uint32_t *length,
 347                             Error **errp)
 348{
 349    int ret;
 350    uint32_t len;
 351    g_autofree char *local_name = NULL;
 352
 353    *name = NULL;
 354    ret = nbd_opt_read(client, &len, sizeof(len), errp);
 355    if (ret <= 0) {
 356        return ret;
 357    }
 358    len = cpu_to_be32(len);
 359
 360    if (len > NBD_MAX_STRING_SIZE) {
 361        return nbd_opt_invalid(client, errp,
 362                               "Invalid name length: %" PRIu32, len);
 363    }
 364
 365    local_name = g_malloc(len + 1);
 366    ret = nbd_opt_read(client, local_name, len, errp);
 367    if (ret <= 0) {
 368        return ret;
 369    }
 370    local_name[len] = '\0';
 371
 372    if (length) {
 373        *length = len;
 374    }
 375    *name = g_steal_pointer(&local_name);
 376
 377    return 1;
 378}
 379
 380/* Send a single NBD_REP_SERVER reply to NBD_OPT_LIST, including payload.
 381 * Return -errno on error, 0 on success. */
 382static int nbd_negotiate_send_rep_list(NBDClient *client, NBDExport *exp,
 383                                       Error **errp)
 384{
 385    size_t name_len, desc_len;
 386    uint32_t len;
 387    const char *name = exp->name ? exp->name : "";
 388    const char *desc = exp->description ? exp->description : "";
 389    QIOChannel *ioc = client->ioc;
 390    int ret;
 391
 392    trace_nbd_negotiate_send_rep_list(name, desc);
 393    name_len = strlen(name);
 394    desc_len = strlen(desc);
 395    assert(name_len <= NBD_MAX_STRING_SIZE && desc_len <= NBD_MAX_STRING_SIZE);
 396    len = name_len + desc_len + sizeof(len);
 397    ret = nbd_negotiate_send_rep_len(client, NBD_REP_SERVER, len, errp);
 398    if (ret < 0) {
 399        return ret;
 400    }
 401
 402    len = cpu_to_be32(name_len);
 403    if (nbd_write(ioc, &len, sizeof(len), errp) < 0) {
 404        error_prepend(errp, "write failed (name length): ");
 405        return -EINVAL;
 406    }
 407
 408    if (nbd_write(ioc, name, name_len, errp) < 0) {
 409        error_prepend(errp, "write failed (name buffer): ");
 410        return -EINVAL;
 411    }
 412
 413    if (nbd_write(ioc, desc, desc_len, errp) < 0) {
 414        error_prepend(errp, "write failed (description buffer): ");
 415        return -EINVAL;
 416    }
 417
 418    return 0;
 419}
 420
 421/* Process the NBD_OPT_LIST command, with a potential series of replies.
 422 * Return -errno on error, 0 on success. */
 423static int nbd_negotiate_handle_list(NBDClient *client, Error **errp)
 424{
 425    NBDExport *exp;
 426    assert(client->opt == NBD_OPT_LIST);
 427
 428    /* For each export, send a NBD_REP_SERVER reply. */
 429    QTAILQ_FOREACH(exp, &exports, next) {
 430        if (nbd_negotiate_send_rep_list(client, exp, errp)) {
 431            return -EINVAL;
 432        }
 433    }
 434    /* Finish with a NBD_REP_ACK. */
 435    return nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
 436}
 437
 438static void nbd_check_meta_export(NBDClient *client)
 439{
 440    client->export_meta.valid &= client->exp == client->export_meta.exp;
 441}
 442
 443/* Send a reply to NBD_OPT_EXPORT_NAME.
 444 * Return -errno on error, 0 on success. */
 445static int nbd_negotiate_handle_export_name(NBDClient *client, bool no_zeroes,
 446                                            Error **errp)
 447{
 448    g_autofree char *name = NULL;
 449    char buf[NBD_REPLY_EXPORT_NAME_SIZE] = "";
 450    size_t len;
 451    int ret;
 452    uint16_t myflags;
 453
 454    /* Client sends:
 455        [20 ..  xx]   export name (length bytes)
 456       Server replies:
 457        [ 0 ..   7]   size
 458        [ 8 ..   9]   export flags
 459        [10 .. 133]   reserved     (0) [unless no_zeroes]
 460     */
 461    trace_nbd_negotiate_handle_export_name();
 462    if (client->optlen > NBD_MAX_STRING_SIZE) {
 463        error_setg(errp, "Bad length received");
 464        return -EINVAL;
 465    }
 466    name = g_malloc(client->optlen + 1);
 467    if (nbd_read(client->ioc, name, client->optlen, "export name", errp) < 0) {
 468        return -EIO;
 469    }
 470    name[client->optlen] = '\0';
 471    client->optlen = 0;
 472
 473    trace_nbd_negotiate_handle_export_name_request(name);
 474
 475    client->exp = nbd_export_find(name);
 476    if (!client->exp) {
 477        error_setg(errp, "export not found");
 478        return -EINVAL;
 479    }
 480
 481    myflags = client->exp->nbdflags;
 482    if (client->structured_reply) {
 483        myflags |= NBD_FLAG_SEND_DF;
 484    }
 485    trace_nbd_negotiate_new_style_size_flags(client->exp->size, myflags);
 486    stq_be_p(buf, client->exp->size);
 487    stw_be_p(buf + 8, myflags);
 488    len = no_zeroes ? 10 : sizeof(buf);
 489    ret = nbd_write(client->ioc, buf, len, errp);
 490    if (ret < 0) {
 491        error_prepend(errp, "write failed: ");
 492        return ret;
 493    }
 494
 495    QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
 496    nbd_export_get(client->exp);
 497    nbd_check_meta_export(client);
 498
 499    return 0;
 500}
 501
 502/* Send a single NBD_REP_INFO, with a buffer @buf of @length bytes.
 503 * The buffer does NOT include the info type prefix.
 504 * Return -errno on error, 0 if ready to send more. */
 505static int nbd_negotiate_send_info(NBDClient *client,
 506                                   uint16_t info, uint32_t length, void *buf,
 507                                   Error **errp)
 508{
 509    int rc;
 510
 511    trace_nbd_negotiate_send_info(info, nbd_info_lookup(info), length);
 512    rc = nbd_negotiate_send_rep_len(client, NBD_REP_INFO,
 513                                    sizeof(info) + length, errp);
 514    if (rc < 0) {
 515        return rc;
 516    }
 517    info = cpu_to_be16(info);
 518    if (nbd_write(client->ioc, &info, sizeof(info), errp) < 0) {
 519        return -EIO;
 520    }
 521    if (nbd_write(client->ioc, buf, length, errp) < 0) {
 522        return -EIO;
 523    }
 524    return 0;
 525}
 526
 527/* nbd_reject_length: Handle any unexpected payload.
 528 * @fatal requests that we quit talking to the client, even if we are able
 529 * to successfully send an error reply.
 530 * Return:
 531 * -errno  transmission error occurred or @fatal was requested, errp is set
 532 * 0       error message successfully sent to client, errp is not set
 533 */
 534static int nbd_reject_length(NBDClient *client, bool fatal, Error **errp)
 535{
 536    int ret;
 537
 538    assert(client->optlen);
 539    ret = nbd_opt_invalid(client, errp, "option '%s' has unexpected length",
 540                          nbd_opt_lookup(client->opt));
 541    if (fatal && !ret) {
 542        error_setg(errp, "option '%s' has unexpected length",
 543                   nbd_opt_lookup(client->opt));
 544        return -EINVAL;
 545    }
 546    return ret;
 547}
 548
/* Handle NBD_OPT_INFO and NBD_OPT_GO.
 *
 * Parses the export name and the list of requested info types, then
 * replies with a series of NBD_REP_INFO chunks (name if requested,
 * description if the export has one, block sizes, export size/flags)
 * followed by NBD_REP_ACK.  Only for NBD_OPT_GO is the client attached
 * to the export.
 *
 * Return -errno on error, 0 if ready for next option, and 1 to move
 * into transmission phase.  */
static int nbd_negotiate_handle_info(NBDClient *client, Error **errp)
{
    int rc;
    g_autofree char *name = NULL;
    NBDExport *exp;
    uint16_t requests;
    uint16_t request;
    uint32_t namelen;
    bool sendname = false;
    bool blocksize = false;
    uint32_t sizes[3];
    char buf[sizeof(uint64_t) + sizeof(uint16_t)];
    uint32_t check_align = 0;
    uint16_t myflags;

    /* Client sends:
        4 bytes: L, name length (can be 0)
        L bytes: export name
        2 bytes: N, number of requests (can be 0)
        N * 2 bytes: N requests
    */
    rc = nbd_opt_read_name(client, &name, &namelen, errp);
    if (rc <= 0) {
        return rc;
    }
    trace_nbd_negotiate_handle_export_name_request(name);

    rc = nbd_opt_read(client, &requests, sizeof(requests), errp);
    if (rc <= 0) {
        return rc;
    }
    requests = be16_to_cpu(requests);
    trace_nbd_negotiate_handle_info_requests(requests);
    while (requests--) {
        rc = nbd_opt_read(client, &request, sizeof(request), errp);
        if (rc <= 0) {
            return rc;
        }
        request = be16_to_cpu(request);
        trace_nbd_negotiate_handle_info_request(request,
                                                nbd_info_lookup(request));
        /* We care about NBD_INFO_NAME and NBD_INFO_BLOCK_SIZE;
         * everything else is either a request we don't know or
         * something we send regardless of request */
        switch (request) {
        case NBD_INFO_NAME:
            sendname = true;
            break;
        case NBD_INFO_BLOCK_SIZE:
            blocksize = true;
            break;
        }
    }
    /* Anything left over means the client's lengths do not add up */
    if (client->optlen) {
        return nbd_reject_length(client, false, errp);
    }

    exp = nbd_export_find(name);
    if (!exp) {
        /* Bound the length of the client-supplied name echoed back */
        g_autofree char *sane_name = nbd_sanitize_name(name);

        return nbd_negotiate_send_rep_err(client, NBD_REP_ERR_UNKNOWN,
                                          errp, "export '%s' not present",
                                          sane_name);
    }

    /* Don't bother sending NBD_INFO_NAME unless client requested it */
    if (sendname) {
        rc = nbd_negotiate_send_info(client, NBD_INFO_NAME, namelen, name,
                                     errp);
        if (rc < 0) {
            return rc;
        }
    }

    /* Send NBD_INFO_DESCRIPTION only if available, regardless of
     * client request */
    if (exp->description) {
        size_t len = strlen(exp->description);

        assert(len <= NBD_MAX_STRING_SIZE);
        rc = nbd_negotiate_send_info(client, NBD_INFO_DESCRIPTION,
                                     len, exp->description, errp);
        if (rc < 0) {
            return rc;
        }
    }

    /* Send NBD_INFO_BLOCK_SIZE always, but tweak the minimum size
     * according to whether the client requested it, and according to
     * whether this is OPT_INFO or OPT_GO. */
    /* minimum - 1 for back-compat, or actual if client will obey it. */
    if (client->opt == NBD_OPT_INFO || blocksize) {
        check_align = sizes[0] = blk_get_request_alignment(exp->blk);
    } else {
        sizes[0] = 1;
    }
    assert(sizes[0] <= NBD_MAX_BUFFER_SIZE);
    /* preferred - Hard-code to 4096 for now.
     * TODO: is blk_bs(blk)->bl.opt_transfer appropriate? */
    sizes[1] = MAX(4096, sizes[0]);
    /* maximum - At most 32M, but smaller as appropriate. */
    sizes[2] = MIN(blk_get_max_transfer(exp->blk), NBD_MAX_BUFFER_SIZE);
    trace_nbd_negotiate_handle_info_block_size(sizes[0], sizes[1], sizes[2]);
    /* Convert in place to wire order; sizes[] is sent as-is below */
    sizes[0] = cpu_to_be32(sizes[0]);
    sizes[1] = cpu_to_be32(sizes[1]);
    sizes[2] = cpu_to_be32(sizes[2]);
    rc = nbd_negotiate_send_info(client, NBD_INFO_BLOCK_SIZE,
                                 sizeof(sizes), sizes, errp);
    if (rc < 0) {
        return rc;
    }

    /* Send NBD_INFO_EXPORT always */
    myflags = exp->nbdflags;
    if (client->structured_reply) {
        myflags |= NBD_FLAG_SEND_DF;
    }
    trace_nbd_negotiate_new_style_size_flags(exp->size, myflags);
    /* 8 bytes of size followed by 2 bytes of flags, both big-endian */
    stq_be_p(buf, exp->size);
    stw_be_p(buf + 8, myflags);
    rc = nbd_negotiate_send_info(client, NBD_INFO_EXPORT,
                                 sizeof(buf), buf, errp);
    if (rc < 0) {
        return rc;
    }

    /*
     * If the client is just asking for NBD_OPT_INFO, but forgot to
     * request block sizes in a situation that would impact
     * performance, then return an error. But for NBD_OPT_GO, we
     * tolerate all clients, regardless of alignments.
     */
    if (client->opt == NBD_OPT_INFO && !blocksize &&
        blk_get_request_alignment(exp->blk) > 1) {
        return nbd_negotiate_send_rep_err(client,
                                          NBD_REP_ERR_BLOCK_SIZE_REQD,
                                          errp,
                                          "request NBD_INFO_BLOCK_SIZE to "
                                          "use this export");
    }

    /* Final reply */
    rc = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
    if (rc < 0) {
        return rc;
    }

    /* Only NBD_OPT_GO binds the client to the export; NBD_OPT_INFO
     * leaves the client state untouched and returns 0. */
    if (client->opt == NBD_OPT_GO) {
        client->exp = exp;
        client->check_align = check_align;
        QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
        nbd_export_get(client->exp);
        nbd_check_meta_export(client);
        rc = 1;
    }
    return rc;
}
 710
 711
 712/* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the
 713 * new channel for all further (now-encrypted) communication. */
 714static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
 715                                                 Error **errp)
 716{
 717    QIOChannel *ioc;
 718    QIOChannelTLS *tioc;
 719    struct NBDTLSHandshakeData data = { 0 };
 720
 721    assert(client->opt == NBD_OPT_STARTTLS);
 722
 723    trace_nbd_negotiate_handle_starttls();
 724    ioc = client->ioc;
 725
 726    if (nbd_negotiate_send_rep(client, NBD_REP_ACK, errp) < 0) {
 727        return NULL;
 728    }
 729
 730    tioc = qio_channel_tls_new_server(ioc,
 731                                      client->tlscreds,
 732                                      client->tlsauthz,
 733                                      errp);
 734    if (!tioc) {
 735        return NULL;
 736    }
 737
 738    qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-server-tls");
 739    trace_nbd_negotiate_handle_starttls_handshake();
 740    data.loop = g_main_loop_new(g_main_context_default(), FALSE);
 741    qio_channel_tls_handshake(tioc,
 742                              nbd_tls_handshake,
 743                              &data,
 744                              NULL,
 745                              NULL);
 746
 747    if (!data.complete) {
 748        g_main_loop_run(data.loop);
 749    }
 750    g_main_loop_unref(data.loop);
 751    if (data.error) {
 752        object_unref(OBJECT(tioc));
 753        error_propagate(errp, data.error);
 754        return NULL;
 755    }
 756
 757    return QIO_CHANNEL(tioc);
 758}
 759
 760/* nbd_negotiate_send_meta_context
 761 *
 762 * Send one chunk of reply to NBD_OPT_{LIST,SET}_META_CONTEXT
 763 *
 764 * For NBD_OPT_LIST_META_CONTEXT @context_id is ignored, 0 is used instead.
 765 */
 766static int nbd_negotiate_send_meta_context(NBDClient *client,
 767                                           const char *context,
 768                                           uint32_t context_id,
 769                                           Error **errp)
 770{
 771    NBDOptionReplyMetaContext opt;
 772    struct iovec iov[] = {
 773        {.iov_base = &opt, .iov_len = sizeof(opt)},
 774        {.iov_base = (void *)context, .iov_len = strlen(context)}
 775    };
 776
 777    assert(iov[1].iov_len <= NBD_MAX_STRING_SIZE);
 778    if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
 779        context_id = 0;
 780    }
 781
 782    trace_nbd_negotiate_meta_query_reply(context, context_id);
 783    set_be_option_rep(&opt.h, client->opt, NBD_REP_META_CONTEXT,
 784                      sizeof(opt) - sizeof(opt.h) + iov[1].iov_len);
 785    stl_be_p(&opt.context_id, context_id);
 786
 787    return qio_channel_writev_all(client->ioc, iov, 2, errp) < 0 ? -EIO : 0;
 788}
 789
 790/* Read strlen(@pattern) bytes, and set @match to true if they match @pattern.
 791 * @match is never set to false.
 792 *
 793 * Return -errno on I/O error, 0 if option was completely handled by
 794 * sending a reply about inconsistent lengths, or 1 on success.
 795 *
 796 * Note: return code = 1 doesn't mean that we've read exactly @pattern.
 797 * It only means that there are no errors.
 798 */
 799static int nbd_meta_pattern(NBDClient *client, const char *pattern, bool *match,
 800                            Error **errp)
 801{
 802    int ret;
 803    char *query;
 804    size_t len = strlen(pattern);
 805
 806    assert(len);
 807
 808    query = g_malloc(len);
 809    ret = nbd_opt_read(client, query, len, errp);
 810    if (ret <= 0) {
 811        g_free(query);
 812        return ret;
 813    }
 814
 815    if (strncmp(query, pattern, len) == 0) {
 816        trace_nbd_negotiate_meta_query_parse(pattern);
 817        *match = true;
 818    } else {
 819        trace_nbd_negotiate_meta_query_skip("pattern not matched");
 820    }
 821    g_free(query);
 822
 823    return 1;
 824}
 825
 826/*
 827 * Read @len bytes, and set @match to true if they match @pattern, or if @len
 828 * is 0 and the client is performing _LIST_. @match is never set to false.
 829 *
 830 * Return -errno on I/O error, 0 if option was completely handled by
 831 * sending a reply about inconsistent lengths, or 1 on success.
 832 *
 833 * Note: return code = 1 doesn't mean that we've read exactly @pattern.
 834 * It only means that there are no errors.
 835 */
 836static int nbd_meta_empty_or_pattern(NBDClient *client, const char *pattern,
 837                                     uint32_t len, bool *match, Error **errp)
 838{
 839    if (len == 0) {
 840        if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
 841            *match = true;
 842        }
 843        trace_nbd_negotiate_meta_query_parse("empty");
 844        return 1;
 845    }
 846
 847    if (len != strlen(pattern)) {
 848        trace_nbd_negotiate_meta_query_skip("different lengths");
 849        return nbd_opt_skip(client, len, errp);
 850    }
 851
 852    return nbd_meta_pattern(client, pattern, match, errp);
 853}
 854
 855/* nbd_meta_base_query
 856 *
 857 * Handle queries to 'base' namespace. For now, only the base:allocation
 858 * context is available.  'len' is the amount of text remaining to be read from
 859 * the current name, after the 'base:' portion has been stripped.
 860 *
 861 * Return -errno on I/O error, 0 if option was completely handled by
 862 * sending a reply about inconsistent lengths, or 1 on success.
 863 */
 864static int nbd_meta_base_query(NBDClient *client, NBDExportMetaContexts *meta,
 865                               uint32_t len, Error **errp)
 866{
 867    return nbd_meta_empty_or_pattern(client, "allocation", len,
 868                                     &meta->base_allocation, errp);
 869}
 870
 871/* nbd_meta_bitmap_query
 872 *
 873 * Handle query to 'qemu:' namespace.
 874 * @len is the amount of text remaining to be read from the current name, after
 875 * the 'qemu:' portion has been stripped.
 876 *
 877 * Return -errno on I/O error, 0 if option was completely handled by
 878 * sending a reply about inconsistent lengths, or 1 on success. */
 879static int nbd_meta_qemu_query(NBDClient *client, NBDExportMetaContexts *meta,
 880                               uint32_t len, Error **errp)
 881{
 882    bool dirty_bitmap = false;
 883    size_t dirty_bitmap_len = strlen("dirty-bitmap:");
 884    int ret;
 885
 886    if (!meta->exp->export_bitmap) {
 887        trace_nbd_negotiate_meta_query_skip("no dirty-bitmap exported");
 888        return nbd_opt_skip(client, len, errp);
 889    }
 890
 891    if (len == 0) {
 892        if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
 893            meta->bitmap = true;
 894        }
 895        trace_nbd_negotiate_meta_query_parse("empty");
 896        return 1;
 897    }
 898
 899    if (len < dirty_bitmap_len) {
 900        trace_nbd_negotiate_meta_query_skip("not dirty-bitmap:");
 901        return nbd_opt_skip(client, len, errp);
 902    }
 903
 904    len -= dirty_bitmap_len;
 905    ret = nbd_meta_pattern(client, "dirty-bitmap:", &dirty_bitmap, errp);
 906    if (ret <= 0) {
 907        return ret;
 908    }
 909    if (!dirty_bitmap) {
 910        trace_nbd_negotiate_meta_query_skip("not dirty-bitmap:");
 911        return nbd_opt_skip(client, len, errp);
 912    }
 913
 914    trace_nbd_negotiate_meta_query_parse("dirty-bitmap:");
 915
 916    return nbd_meta_empty_or_pattern(
 917            client, meta->exp->export_bitmap_context +
 918            strlen("qemu:dirty_bitmap:"), len, &meta->bitmap, errp);
 919}
 920
 921/* nbd_negotiate_meta_query
 922 *
 923 * Parse namespace name and call corresponding function to parse body of the
 924 * query.
 925 *
 926 * The only supported namespaces are 'base' and 'qemu'.
 927 *
 928 * The function aims not wasting time and memory to read long unknown namespace
 929 * names.
 930 *
 931 * Return -errno on I/O error, 0 if option was completely handled by
 932 * sending a reply about inconsistent lengths, or 1 on success. */
 933static int nbd_negotiate_meta_query(NBDClient *client,
 934                                    NBDExportMetaContexts *meta, Error **errp)
 935{
 936    /*
 937     * Both 'qemu' and 'base' namespaces have length = 5 including a
 938     * colon. If another length namespace is later introduced, this
 939     * should certainly be refactored.
 940     */
 941    int ret;
 942    size_t ns_len = 5;
 943    char ns[5];
 944    uint32_t len;
 945
 946    ret = nbd_opt_read(client, &len, sizeof(len), errp);
 947    if (ret <= 0) {
 948        return ret;
 949    }
 950    len = cpu_to_be32(len);
 951
 952    if (len > NBD_MAX_STRING_SIZE) {
 953        trace_nbd_negotiate_meta_query_skip("length too long");
 954        return nbd_opt_skip(client, len, errp);
 955    }
 956    if (len < ns_len) {
 957        trace_nbd_negotiate_meta_query_skip("length too short");
 958        return nbd_opt_skip(client, len, errp);
 959    }
 960
 961    len -= ns_len;
 962    ret = nbd_opt_read(client, ns, ns_len, errp);
 963    if (ret <= 0) {
 964        return ret;
 965    }
 966
 967    if (!strncmp(ns, "base:", ns_len)) {
 968        trace_nbd_negotiate_meta_query_parse("base:");
 969        return nbd_meta_base_query(client, meta, len, errp);
 970    } else if (!strncmp(ns, "qemu:", ns_len)) {
 971        trace_nbd_negotiate_meta_query_parse("qemu:");
 972        return nbd_meta_qemu_query(client, meta, len, errp);
 973    }
 974
 975    trace_nbd_negotiate_meta_query_skip("unknown namespace");
 976    return nbd_opt_skip(client, len, errp);
 977}
 978
 979/* nbd_negotiate_meta_queries
 980 * Handle NBD_OPT_LIST_META_CONTEXT and NBD_OPT_SET_META_CONTEXT
 981 *
 982 * Return -errno on I/O error, or 0 if option was completely handled. */
 983static int nbd_negotiate_meta_queries(NBDClient *client,
 984                                      NBDExportMetaContexts *meta, Error **errp)
 985{
 986    int ret;
 987    g_autofree char *export_name = NULL;
 988    NBDExportMetaContexts local_meta;
 989    uint32_t nb_queries;
 990    int i;
 991
 992    if (!client->structured_reply) {
 993        return nbd_opt_invalid(client, errp,
 994                               "request option '%s' when structured reply "
 995                               "is not negotiated",
 996                               nbd_opt_lookup(client->opt));
 997    }
 998
 999    if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
1000        /* Only change the caller's meta on SET. */
1001        meta = &local_meta;
1002    }
1003
1004    memset(meta, 0, sizeof(*meta));
1005
1006    ret = nbd_opt_read_name(client, &export_name, NULL, errp);
1007    if (ret <= 0) {
1008        return ret;
1009    }
1010
1011    meta->exp = nbd_export_find(export_name);
1012    if (meta->exp == NULL) {
1013        g_autofree char *sane_name = nbd_sanitize_name(export_name);
1014
1015        return nbd_opt_drop(client, NBD_REP_ERR_UNKNOWN, errp,
1016                            "export '%s' not present", sane_name);
1017    }
1018
1019    ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), errp);
1020    if (ret <= 0) {
1021        return ret;
1022    }
1023    nb_queries = cpu_to_be32(nb_queries);
1024    trace_nbd_negotiate_meta_context(nbd_opt_lookup(client->opt),
1025                                     export_name, nb_queries);
1026
1027    if (client->opt == NBD_OPT_LIST_META_CONTEXT && !nb_queries) {
1028        /* enable all known contexts */
1029        meta->base_allocation = true;
1030        meta->bitmap = !!meta->exp->export_bitmap;
1031    } else {
1032        for (i = 0; i < nb_queries; ++i) {
1033            ret = nbd_negotiate_meta_query(client, meta, errp);
1034            if (ret <= 0) {
1035                return ret;
1036            }
1037        }
1038    }
1039
1040    if (meta->base_allocation) {
1041        ret = nbd_negotiate_send_meta_context(client, "base:allocation",
1042                                              NBD_META_ID_BASE_ALLOCATION,
1043                                              errp);
1044        if (ret < 0) {
1045            return ret;
1046        }
1047    }
1048
1049    if (meta->bitmap) {
1050        ret = nbd_negotiate_send_meta_context(client,
1051                                              meta->exp->export_bitmap_context,
1052                                              NBD_META_ID_DIRTY_BITMAP,
1053                                              errp);
1054        if (ret < 0) {
1055            return ret;
1056        }
1057    }
1058
1059    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
1060    if (ret == 0) {
1061        meta->valid = true;
1062    }
1063
1064    return ret;
1065}
1066
1067/* nbd_negotiate_options
1068 * Process all NBD_OPT_* client option commands, during fixed newstyle
1069 * negotiation.
1070 * Return:
1071 * -errno  on error, errp is set
1072 * 0       on successful negotiation, errp is not set
1073 * 1       if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
1074 *         errp is not set
1075 */
1076static int nbd_negotiate_options(NBDClient *client, Error **errp)
1077{
1078    uint32_t flags;
1079    bool fixedNewstyle = false;
1080    bool no_zeroes = false;
1081
1082    /* Client sends:
1083        [ 0 ..   3]   client flags
1084
1085       Then we loop until NBD_OPT_EXPORT_NAME or NBD_OPT_GO:
1086        [ 0 ..   7]   NBD_OPTS_MAGIC
1087        [ 8 ..  11]   NBD option
1088        [12 ..  15]   Data length
1089        ...           Rest of request
1090
1091        [ 0 ..   7]   NBD_OPTS_MAGIC
1092        [ 8 ..  11]   Second NBD option
1093        [12 ..  15]   Data length
1094        ...           Rest of request
1095    */
1096
1097    if (nbd_read32(client->ioc, &flags, "flags", errp) < 0) {
1098        return -EIO;
1099    }
1100    trace_nbd_negotiate_options_flags(flags);
1101    if (flags & NBD_FLAG_C_FIXED_NEWSTYLE) {
1102        fixedNewstyle = true;
1103        flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE;
1104    }
1105    if (flags & NBD_FLAG_C_NO_ZEROES) {
1106        no_zeroes = true;
1107        flags &= ~NBD_FLAG_C_NO_ZEROES;
1108    }
1109    if (flags != 0) {
1110        error_setg(errp, "Unknown client flags 0x%" PRIx32 " received", flags);
1111        return -EINVAL;
1112    }
1113
1114    while (1) {
1115        int ret;
1116        uint32_t option, length;
1117        uint64_t magic;
1118
1119        if (nbd_read64(client->ioc, &magic, "opts magic", errp) < 0) {
1120            return -EINVAL;
1121        }
1122        trace_nbd_negotiate_options_check_magic(magic);
1123        if (magic != NBD_OPTS_MAGIC) {
1124            error_setg(errp, "Bad magic received");
1125            return -EINVAL;
1126        }
1127
1128        if (nbd_read32(client->ioc, &option, "option", errp) < 0) {
1129            return -EINVAL;
1130        }
1131        client->opt = option;
1132
1133        if (nbd_read32(client->ioc, &length, "option length", errp) < 0) {
1134            return -EINVAL;
1135        }
1136        assert(!client->optlen);
1137        client->optlen = length;
1138
1139        if (length > NBD_MAX_BUFFER_SIZE) {
1140            error_setg(errp, "len (%" PRIu32" ) is larger than max len (%u)",
1141                       length, NBD_MAX_BUFFER_SIZE);
1142            return -EINVAL;
1143        }
1144
1145        trace_nbd_negotiate_options_check_option(option,
1146                                                 nbd_opt_lookup(option));
1147        if (client->tlscreds &&
1148            client->ioc == (QIOChannel *)client->sioc) {
1149            QIOChannel *tioc;
1150            if (!fixedNewstyle) {
1151                error_setg(errp, "Unsupported option 0x%" PRIx32, option);
1152                return -EINVAL;
1153            }
1154            switch (option) {
1155            case NBD_OPT_STARTTLS:
1156                if (length) {
1157                    /* Unconditionally drop the connection if the client
1158                     * can't start a TLS negotiation correctly */
1159                    return nbd_reject_length(client, true, errp);
1160                }
1161                tioc = nbd_negotiate_handle_starttls(client, errp);
1162                if (!tioc) {
1163                    return -EIO;
1164                }
1165                ret = 0;
1166                object_unref(OBJECT(client->ioc));
1167                client->ioc = QIO_CHANNEL(tioc);
1168                break;
1169
1170            case NBD_OPT_EXPORT_NAME:
1171                /* No way to return an error to client, so drop connection */
1172                error_setg(errp, "Option 0x%x not permitted before TLS",
1173                           option);
1174                return -EINVAL;
1175
1176            default:
1177                /* Let the client keep trying, unless they asked to
1178                 * quit. Always try to give an error back to the
1179                 * client; but when replying to OPT_ABORT, be aware
1180                 * that the client may hang up before receiving the
1181                 * error, in which case we are fine ignoring the
1182                 * resulting EPIPE. */
1183                ret = nbd_opt_drop(client, NBD_REP_ERR_TLS_REQD,
1184                                   option == NBD_OPT_ABORT ? NULL : errp,
1185                                   "Option 0x%" PRIx32
1186                                   " not permitted before TLS", option);
1187                if (option == NBD_OPT_ABORT) {
1188                    return 1;
1189                }
1190                break;
1191            }
1192        } else if (fixedNewstyle) {
1193            switch (option) {
1194            case NBD_OPT_LIST:
1195                if (length) {
1196                    ret = nbd_reject_length(client, false, errp);
1197                } else {
1198                    ret = nbd_negotiate_handle_list(client, errp);
1199                }
1200                break;
1201
1202            case NBD_OPT_ABORT:
1203                /* NBD spec says we must try to reply before
1204                 * disconnecting, but that we must also tolerate
1205                 * guests that don't wait for our reply. */
1206                nbd_negotiate_send_rep(client, NBD_REP_ACK, NULL);
1207                return 1;
1208
1209            case NBD_OPT_EXPORT_NAME:
1210                return nbd_negotiate_handle_export_name(client, no_zeroes,
1211                                                        errp);
1212
1213            case NBD_OPT_INFO:
1214            case NBD_OPT_GO:
1215                ret = nbd_negotiate_handle_info(client, errp);
1216                if (ret == 1) {
1217                    assert(option == NBD_OPT_GO);
1218                    return 0;
1219                }
1220                break;
1221
1222            case NBD_OPT_STARTTLS:
1223                if (length) {
1224                    ret = nbd_reject_length(client, false, errp);
1225                } else if (client->tlscreds) {
1226                    ret = nbd_negotiate_send_rep_err(client,
1227                                                     NBD_REP_ERR_INVALID, errp,
1228                                                     "TLS already enabled");
1229                } else {
1230                    ret = nbd_negotiate_send_rep_err(client,
1231                                                     NBD_REP_ERR_POLICY, errp,
1232                                                     "TLS not configured");
1233                }
1234                break;
1235
1236            case NBD_OPT_STRUCTURED_REPLY:
1237                if (length) {
1238                    ret = nbd_reject_length(client, false, errp);
1239                } else if (client->structured_reply) {
1240                    ret = nbd_negotiate_send_rep_err(
1241                        client, NBD_REP_ERR_INVALID, errp,
1242                        "structured reply already negotiated");
1243                } else {
1244                    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
1245                    client->structured_reply = true;
1246                }
1247                break;
1248
1249            case NBD_OPT_LIST_META_CONTEXT:
1250            case NBD_OPT_SET_META_CONTEXT:
1251                ret = nbd_negotiate_meta_queries(client, &client->export_meta,
1252                                                 errp);
1253                break;
1254
1255            default:
1256                ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
1257                                   "Unsupported option %" PRIu32 " (%s)",
1258                                   option, nbd_opt_lookup(option));
1259                break;
1260            }
1261        } else {
1262            /*
1263             * If broken new-style we should drop the connection
1264             * for anything except NBD_OPT_EXPORT_NAME
1265             */
1266            switch (option) {
1267            case NBD_OPT_EXPORT_NAME:
1268                return nbd_negotiate_handle_export_name(client, no_zeroes,
1269                                                        errp);
1270
1271            default:
1272                error_setg(errp, "Unsupported option %" PRIu32 " (%s)",
1273                           option, nbd_opt_lookup(option));
1274                return -EINVAL;
1275            }
1276        }
1277        if (ret < 0) {
1278            return ret;
1279        }
1280    }
1281}
1282
1283/* nbd_negotiate
1284 * Return:
1285 * -errno  on error, errp is set
1286 * 0       on successful negotiation, errp is not set
1287 * 1       if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
1288 *         errp is not set
1289 */
1290static coroutine_fn int nbd_negotiate(NBDClient *client, Error **errp)
1291{
1292    char buf[NBD_OLDSTYLE_NEGOTIATE_SIZE] = "";
1293    int ret;
1294
1295    /* Old style negotiation header, no room for options
1296        [ 0 ..   7]   passwd       ("NBDMAGIC")
1297        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
1298        [16 ..  23]   size
1299        [24 ..  27]   export flags (zero-extended)
1300        [28 .. 151]   reserved     (0)
1301
1302       New style negotiation header, client can send options
1303        [ 0 ..   7]   passwd       ("NBDMAGIC")
1304        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
1305        [16 ..  17]   server flags (0)
1306        ....options sent, ending in NBD_OPT_EXPORT_NAME or NBD_OPT_GO....
1307     */
1308
1309    qio_channel_set_blocking(client->ioc, false, NULL);
1310
1311    trace_nbd_negotiate_begin();
1312    memcpy(buf, "NBDMAGIC", 8);
1313
1314    stq_be_p(buf + 8, NBD_OPTS_MAGIC);
1315    stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES);
1316
1317    if (nbd_write(client->ioc, buf, 18, errp) < 0) {
1318        error_prepend(errp, "write failed: ");
1319        return -EINVAL;
1320    }
1321    ret = nbd_negotiate_options(client, errp);
1322    if (ret != 0) {
1323        if (ret < 0) {
1324            error_prepend(errp, "option negotiation failed: ");
1325        }
1326        return ret;
1327    }
1328
1329    /* Attach the channel to the same AioContext as the export */
1330    if (client->exp && client->exp->ctx) {
1331        qio_channel_attach_aio_context(client->ioc, client->exp->ctx);
1332    }
1333
1334    assert(!client->optlen);
1335    trace_nbd_negotiate_success();
1336
1337    return 0;
1338}
1339
1340static int nbd_receive_request(QIOChannel *ioc, NBDRequest *request,
1341                               Error **errp)
1342{
1343    uint8_t buf[NBD_REQUEST_SIZE];
1344    uint32_t magic;
1345    int ret;
1346
1347    ret = nbd_read(ioc, buf, sizeof(buf), "request", errp);
1348    if (ret < 0) {
1349        return ret;
1350    }
1351
1352    /* Request
1353       [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
1354       [ 4 ..  5]   flags   (NBD_CMD_FLAG_FUA, ...)
1355       [ 6 ..  7]   type    (NBD_CMD_READ, ...)
1356       [ 8 .. 15]   handle
1357       [16 .. 23]   from
1358       [24 .. 27]   len
1359     */
1360
1361    magic = ldl_be_p(buf);
1362    request->flags  = lduw_be_p(buf + 4);
1363    request->type   = lduw_be_p(buf + 6);
1364    request->handle = ldq_be_p(buf + 8);
1365    request->from   = ldq_be_p(buf + 16);
1366    request->len    = ldl_be_p(buf + 24);
1367
1368    trace_nbd_receive_request(magic, request->flags, request->type,
1369                              request->from, request->len);
1370
1371    if (magic != NBD_REQUEST_MAGIC) {
1372        error_setg(errp, "invalid magic (got 0x%" PRIx32 ")", magic);
1373        return -EINVAL;
1374    }
1375    return 0;
1376}
1377
1378#define MAX_NBD_REQUESTS 16
1379
1380void nbd_client_get(NBDClient *client)
1381{
1382    client->refcount++;
1383}
1384
1385void nbd_client_put(NBDClient *client)
1386{
1387    if (--client->refcount == 0) {
1388        /* The last reference should be dropped by client->close,
1389         * which is called by client_close.
1390         */
1391        assert(client->closing);
1392
1393        qio_channel_detach_aio_context(client->ioc);
1394        object_unref(OBJECT(client->sioc));
1395        object_unref(OBJECT(client->ioc));
1396        if (client->tlscreds) {
1397            object_unref(OBJECT(client->tlscreds));
1398        }
1399        g_free(client->tlsauthz);
1400        if (client->exp) {
1401            QTAILQ_REMOVE(&client->exp->clients, client, next);
1402            nbd_export_put(client->exp);
1403        }
1404        g_free(client);
1405    }
1406}
1407
1408static void client_close(NBDClient *client, bool negotiated)
1409{
1410    if (client->closing) {
1411        return;
1412    }
1413
1414    client->closing = true;
1415
1416    /* Force requests to finish.  They will drop their own references,
1417     * then we'll close the socket and free the NBDClient.
1418     */
1419    qio_channel_shutdown(client->ioc, QIO_CHANNEL_SHUTDOWN_BOTH,
1420                         NULL);
1421
1422    /* Also tell the client, so that they release their reference.  */
1423    if (client->close_fn) {
1424        client->close_fn(client, negotiated);
1425    }
1426}
1427
1428static NBDRequestData *nbd_request_get(NBDClient *client)
1429{
1430    NBDRequestData *req;
1431
1432    assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
1433    client->nb_requests++;
1434
1435    req = g_new0(NBDRequestData, 1);
1436    nbd_client_get(client);
1437    req->client = client;
1438    return req;
1439}
1440
1441static void nbd_request_put(NBDRequestData *req)
1442{
1443    NBDClient *client = req->client;
1444
1445    if (req->data) {
1446        qemu_vfree(req->data);
1447    }
1448    g_free(req);
1449
1450    client->nb_requests--;
1451    nbd_client_receive_next_request(client);
1452
1453    nbd_client_put(client);
1454}
1455
1456static void blk_aio_attached(AioContext *ctx, void *opaque)
1457{
1458    NBDExport *exp = opaque;
1459    NBDClient *client;
1460
1461    trace_nbd_blk_aio_attached(exp->name, ctx);
1462
1463    exp->ctx = ctx;
1464
1465    QTAILQ_FOREACH(client, &exp->clients, next) {
1466        qio_channel_attach_aio_context(client->ioc, ctx);
1467        if (client->recv_coroutine) {
1468            aio_co_schedule(ctx, client->recv_coroutine);
1469        }
1470        if (client->send_coroutine) {
1471            aio_co_schedule(ctx, client->send_coroutine);
1472        }
1473    }
1474}
1475
1476static void blk_aio_detach(void *opaque)
1477{
1478    NBDExport *exp = opaque;
1479    NBDClient *client;
1480
1481    trace_nbd_blk_aio_detach(exp->name, exp->ctx);
1482
1483    QTAILQ_FOREACH(client, &exp->clients, next) {
1484        qio_channel_detach_aio_context(client->ioc);
1485    }
1486
1487    exp->ctx = NULL;
1488}
1489
1490static void nbd_eject_notifier(Notifier *n, void *data)
1491{
1492    NBDExport *exp = container_of(n, NBDExport, eject_notifier);
1493    AioContext *aio_context;
1494
1495    aio_context = exp->ctx;
1496    aio_context_acquire(aio_context);
1497    nbd_export_close(exp);
1498    aio_context_release(aio_context);
1499}
1500
1501NBDExport *nbd_export_new(BlockDriverState *bs, uint64_t dev_offset,
1502                          uint64_t size, const char *name, const char *desc,
1503                          const char *bitmap, bool readonly, bool shared,
1504                          void (*close)(NBDExport *), bool writethrough,
1505                          BlockBackend *on_eject_blk, Error **errp)
1506{
1507    AioContext *ctx;
1508    BlockBackend *blk;
1509    NBDExport *exp = g_new0(NBDExport, 1);
1510    uint64_t perm;
1511    int ret;
1512
1513    /*
1514     * NBD exports are used for non-shared storage migration.  Make sure
1515     * that BDRV_O_INACTIVE is cleared and the image is ready for write
1516     * access since the export could be available before migration handover.
1517     * ctx was acquired in the caller.
1518     */
1519    assert(name && strlen(name) <= NBD_MAX_STRING_SIZE);
1520    ctx = bdrv_get_aio_context(bs);
1521    bdrv_invalidate_cache(bs, NULL);
1522
1523    /* Don't allow resize while the NBD server is running, otherwise we don't
1524     * care what happens with the node. */
1525    perm = BLK_PERM_CONSISTENT_READ;
1526    if (!readonly) {
1527        perm |= BLK_PERM_WRITE;
1528    }
1529    blk = blk_new(ctx, perm,
1530                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
1531                  BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
1532    ret = blk_insert_bs(blk, bs, errp);
1533    if (ret < 0) {
1534        goto fail;
1535    }
1536    blk_set_enable_write_cache(blk, !writethrough);
1537    blk_set_allow_aio_context_change(blk, true);
1538
1539    exp->refcount = 1;
1540    QTAILQ_INIT(&exp->clients);
1541    exp->blk = blk;
1542    assert(dev_offset <= INT64_MAX);
1543    exp->dev_offset = dev_offset;
1544    exp->name = g_strdup(name);
1545    assert(!desc || strlen(desc) <= NBD_MAX_STRING_SIZE);
1546    exp->description = g_strdup(desc);
1547    exp->nbdflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_FLUSH |
1548                     NBD_FLAG_SEND_FUA | NBD_FLAG_SEND_CACHE);
1549    if (readonly) {
1550        exp->nbdflags |= NBD_FLAG_READ_ONLY;
1551        if (shared) {
1552            exp->nbdflags |= NBD_FLAG_CAN_MULTI_CONN;
1553        }
1554    } else {
1555        exp->nbdflags |= (NBD_FLAG_SEND_TRIM | NBD_FLAG_SEND_WRITE_ZEROES |
1556                          NBD_FLAG_SEND_FAST_ZERO);
1557    }
1558    assert(size <= INT64_MAX - dev_offset);
1559    exp->size = QEMU_ALIGN_DOWN(size, BDRV_SECTOR_SIZE);
1560
1561    if (bitmap) {
1562        BdrvDirtyBitmap *bm = NULL;
1563
1564        while (true) {
1565            bm = bdrv_find_dirty_bitmap(bs, bitmap);
1566            if (bm != NULL || bs->backing == NULL) {
1567                break;
1568            }
1569
1570            bs = bs->backing->bs;
1571        }
1572
1573        if (bm == NULL) {
1574            error_setg(errp, "Bitmap '%s' is not found", bitmap);
1575            goto fail;
1576        }
1577
1578        if (bdrv_dirty_bitmap_check(bm, BDRV_BITMAP_ALLOW_RO, errp)) {
1579            goto fail;
1580        }
1581
1582        if (readonly && bdrv_is_writable(bs) &&
1583            bdrv_dirty_bitmap_enabled(bm)) {
1584            error_setg(errp,
1585                       "Enabled bitmap '%s' incompatible with readonly export",
1586                       bitmap);
1587            goto fail;
1588        }
1589
1590        bdrv_dirty_bitmap_set_busy(bm, true);
1591        exp->export_bitmap = bm;
1592        assert(strlen(bitmap) <= BDRV_BITMAP_MAX_NAME_SIZE);
1593        exp->export_bitmap_context = g_strdup_printf("qemu:dirty-bitmap:%s",
1594                                                     bitmap);
1595        assert(strlen(exp->export_bitmap_context) < NBD_MAX_STRING_SIZE);
1596    }
1597
1598    exp->close = close;
1599    exp->ctx = ctx;
1600    blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);
1601
1602    if (on_eject_blk) {
1603        blk_ref(on_eject_blk);
1604        exp->eject_notifier_blk = on_eject_blk;
1605        exp->eject_notifier.notify = nbd_eject_notifier;
1606        blk_add_remove_bs_notifier(on_eject_blk, &exp->eject_notifier);
1607    }
1608    QTAILQ_INSERT_TAIL(&exports, exp, next);
1609    nbd_export_get(exp);
1610    return exp;
1611
1612fail:
1613    blk_unref(blk);
1614    g_free(exp->name);
1615    g_free(exp->description);
1616    g_free(exp);
1617    return NULL;
1618}
1619
1620NBDExport *nbd_export_find(const char *name)
1621{
1622    NBDExport *exp;
1623    QTAILQ_FOREACH(exp, &exports, next) {
1624        if (strcmp(name, exp->name) == 0) {
1625            return exp;
1626        }
1627    }
1628
1629    return NULL;
1630}
1631
1632AioContext *
1633nbd_export_aio_context(NBDExport *exp)
1634{
1635    return exp->ctx;
1636}
1637
1638void nbd_export_close(NBDExport *exp)
1639{
1640    NBDClient *client, *next;
1641
1642    nbd_export_get(exp);
1643    /*
1644     * TODO: Should we expand QMP NbdServerRemoveNode enum to allow a
1645     * close mode that stops advertising the export to new clients but
1646     * still permits existing clients to run to completion? Because of
1647     * that possibility, nbd_export_close() can be called more than
1648     * once on an export.
1649     */
1650    QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
1651        client_close(client, true);
1652    }
1653    if (exp->name) {
1654        nbd_export_put(exp);
1655        g_free(exp->name);
1656        exp->name = NULL;
1657        QTAILQ_REMOVE(&exports, exp, next);
1658    }
1659    g_free(exp->description);
1660    exp->description = NULL;
1661    nbd_export_put(exp);
1662}
1663
1664void nbd_export_remove(NBDExport *exp, NbdServerRemoveMode mode, Error **errp)
1665{
1666    if (mode == NBD_SERVER_REMOVE_MODE_HARD || QTAILQ_EMPTY(&exp->clients)) {
1667        nbd_export_close(exp);
1668        return;
1669    }
1670
1671    assert(mode == NBD_SERVER_REMOVE_MODE_SAFE);
1672
1673    error_setg(errp, "export '%s' still in use", exp->name);
1674    error_append_hint(errp, "Use mode='hard' to force client disconnect\n");
1675}
1676
1677void nbd_export_get(NBDExport *exp)
1678{
1679    assert(exp->refcount > 0);
1680    exp->refcount++;
1681}
1682
1683void nbd_export_put(NBDExport *exp)
1684{
1685    assert(exp->refcount > 0);
1686    if (exp->refcount == 1) {
1687        nbd_export_close(exp);
1688    }
1689
1690    /* nbd_export_close() may theoretically reduce refcount to 0. It may happen
1691     * if someone calls nbd_export_put() on named export not through
1692     * nbd_export_set_name() when refcount is 1. So, let's assert that
1693     * it is > 0.
1694     */
1695    assert(exp->refcount > 0);
1696    if (--exp->refcount == 0) {
1697        assert(exp->name == NULL);
1698        assert(exp->description == NULL);
1699
1700        if (exp->close) {
1701            exp->close(exp);
1702        }
1703
1704        if (exp->blk) {
1705            if (exp->eject_notifier_blk) {
1706                notifier_remove(&exp->eject_notifier);
1707                blk_unref(exp->eject_notifier_blk);
1708            }
1709            blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
1710                                            blk_aio_detach, exp);
1711            blk_unref(exp->blk);
1712            exp->blk = NULL;
1713        }
1714
1715        if (exp->export_bitmap) {
1716            bdrv_dirty_bitmap_set_busy(exp->export_bitmap, false);
1717            g_free(exp->export_bitmap_context);
1718        }
1719
1720        g_free(exp);
1721    }
1722}
1723
1724BlockBackend *nbd_export_get_blockdev(NBDExport *exp)
1725{
1726    return exp->blk;
1727}
1728
1729void nbd_export_close_all(void)
1730{
1731    NBDExport *exp, *next;
1732    AioContext *aio_context;
1733
1734    QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
1735        aio_context = exp->ctx;
1736        aio_context_acquire(aio_context);
1737        nbd_export_close(exp);
1738        aio_context_release(aio_context);
1739    }
1740}
1741
1742static int coroutine_fn nbd_co_send_iov(NBDClient *client, struct iovec *iov,
1743                                        unsigned niov, Error **errp)
1744{
1745    int ret;
1746
1747    g_assert(qemu_in_coroutine());
1748    qemu_co_mutex_lock(&client->send_lock);
1749    client->send_coroutine = qemu_coroutine_self();
1750
1751    ret = qio_channel_writev_all(client->ioc, iov, niov, errp) < 0 ? -EIO : 0;
1752
1753    client->send_coroutine = NULL;
1754    qemu_co_mutex_unlock(&client->send_lock);
1755
1756    return ret;
1757}
1758
1759static inline void set_be_simple_reply(NBDSimpleReply *reply, uint64_t error,
1760                                       uint64_t handle)
1761{
1762    stl_be_p(&reply->magic, NBD_SIMPLE_REPLY_MAGIC);
1763    stl_be_p(&reply->error, error);
1764    stq_be_p(&reply->handle, handle);
1765}
1766
1767static int nbd_co_send_simple_reply(NBDClient *client,
1768                                    uint64_t handle,
1769                                    uint32_t error,
1770                                    void *data,
1771                                    size_t len,
1772                                    Error **errp)
1773{
1774    NBDSimpleReply reply;
1775    int nbd_err = system_errno_to_nbd_errno(error);
1776    struct iovec iov[] = {
1777        {.iov_base = &reply, .iov_len = sizeof(reply)},
1778        {.iov_base = data, .iov_len = len}
1779    };
1780
1781    trace_nbd_co_send_simple_reply(handle, nbd_err, nbd_err_lookup(nbd_err),
1782                                   len);
1783    set_be_simple_reply(&reply, nbd_err, handle);
1784
1785    return nbd_co_send_iov(client, iov, len ? 2 : 1, errp);
1786}
1787
1788static inline void set_be_chunk(NBDStructuredReplyChunk *chunk, uint16_t flags,
1789                                uint16_t type, uint64_t handle, uint32_t length)
1790{
1791    stl_be_p(&chunk->magic, NBD_STRUCTURED_REPLY_MAGIC);
1792    stw_be_p(&chunk->flags, flags);
1793    stw_be_p(&chunk->type, type);
1794    stq_be_p(&chunk->handle, handle);
1795    stl_be_p(&chunk->length, length);
1796}
1797
1798static int coroutine_fn nbd_co_send_structured_done(NBDClient *client,
1799                                                    uint64_t handle,
1800                                                    Error **errp)
1801{
1802    NBDStructuredReplyChunk chunk;
1803    struct iovec iov[] = {
1804        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
1805    };
1806
1807    trace_nbd_co_send_structured_done(handle);
1808    set_be_chunk(&chunk, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_NONE, handle, 0);
1809
1810    return nbd_co_send_iov(client, iov, 1, errp);
1811}
1812
1813static int coroutine_fn nbd_co_send_structured_read(NBDClient *client,
1814                                                    uint64_t handle,
1815                                                    uint64_t offset,
1816                                                    void *data,
1817                                                    size_t size,
1818                                                    bool final,
1819                                                    Error **errp)
1820{
1821    NBDStructuredReadData chunk;
1822    struct iovec iov[] = {
1823        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
1824        {.iov_base = data, .iov_len = size}
1825    };
1826
1827    assert(size);
1828    trace_nbd_co_send_structured_read(handle, offset, data, size);
1829    set_be_chunk(&chunk.h, final ? NBD_REPLY_FLAG_DONE : 0,
1830                 NBD_REPLY_TYPE_OFFSET_DATA, handle,
1831                 sizeof(chunk) - sizeof(chunk.h) + size);
1832    stq_be_p(&chunk.offset, offset);
1833
1834    return nbd_co_send_iov(client, iov, 2, errp);
1835}
1836
1837static int coroutine_fn nbd_co_send_structured_error(NBDClient *client,
1838                                                     uint64_t handle,
1839                                                     uint32_t error,
1840                                                     const char *msg,
1841                                                     Error **errp)
1842{
1843    NBDStructuredError chunk;
1844    int nbd_err = system_errno_to_nbd_errno(error);
1845    struct iovec iov[] = {
1846        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
1847        {.iov_base = (char *)msg, .iov_len = msg ? strlen(msg) : 0},
1848    };
1849
1850    assert(nbd_err);
1851    trace_nbd_co_send_structured_error(handle, nbd_err,
1852                                       nbd_err_lookup(nbd_err), msg ? msg : "");
1853    set_be_chunk(&chunk.h, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_ERROR, handle,
1854                 sizeof(chunk) - sizeof(chunk.h) + iov[1].iov_len);
1855    stl_be_p(&chunk.error, nbd_err);
1856    stw_be_p(&chunk.message_length, iov[1].iov_len);
1857
1858    return nbd_co_send_iov(client, iov, 1 + !!iov[1].iov_len, errp);
1859}
1860
1861/* Do a sparse read and send the structured reply to the client.
1862 * Returns -errno if sending fails. bdrv_block_status_above() failure is
1863 * reported to the client, at which point this function succeeds.
1864 */
1865static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client,
1866                                                uint64_t handle,
1867                                                uint64_t offset,
1868                                                uint8_t *data,
1869                                                size_t size,
1870                                                Error **errp)
1871{
1872    int ret = 0;
1873    NBDExport *exp = client->exp;
1874    size_t progress = 0;
1875
1876    while (progress < size) {
1877        int64_t pnum;
1878        int status = bdrv_block_status_above(blk_bs(exp->blk), NULL,
1879                                             offset + progress,
1880                                             size - progress, &pnum, NULL,
1881                                             NULL);
1882        bool final;
1883
1884        if (status < 0) {
1885            char *msg = g_strdup_printf("unable to check for holes: %s",
1886                                        strerror(-status));
1887
1888            ret = nbd_co_send_structured_error(client, handle, -status, msg,
1889                                               errp);
1890            g_free(msg);
1891            return ret;
1892        }
1893        assert(pnum && pnum <= size - progress);
1894        final = progress + pnum == size;
1895        if (status & BDRV_BLOCK_ZERO) {
1896            NBDStructuredReadHole chunk;
1897            struct iovec iov[] = {
1898                {.iov_base = &chunk, .iov_len = sizeof(chunk)},
1899            };
1900
1901            trace_nbd_co_send_structured_read_hole(handle, offset + progress,
1902                                                   pnum);
1903            set_be_chunk(&chunk.h, final ? NBD_REPLY_FLAG_DONE : 0,
1904                         NBD_REPLY_TYPE_OFFSET_HOLE,
1905                         handle, sizeof(chunk) - sizeof(chunk.h));
1906            stq_be_p(&chunk.offset, offset + progress);
1907            stl_be_p(&chunk.length, pnum);
1908            ret = nbd_co_send_iov(client, iov, 1, errp);
1909        } else {
1910            ret = blk_pread(exp->blk, offset + progress + exp->dev_offset,
1911                            data + progress, pnum);
1912            if (ret < 0) {
1913                error_setg_errno(errp, -ret, "reading from file failed");
1914                break;
1915            }
1916            ret = nbd_co_send_structured_read(client, handle, offset + progress,
1917                                              data + progress, pnum, final,
1918                                              errp);
1919        }
1920
1921        if (ret < 0) {
1922            break;
1923        }
1924        progress += pnum;
1925    }
1926    return ret;
1927}
1928
1929/*
1930 * Populate @extents from block status. Update @bytes to be the actual
1931 * length encoded (which may be smaller than the original), and update
1932 * @nb_extents to the number of extents used.
1933 *
1934 * Returns zero on success and -errno on bdrv_block_status_above failure.
1935 */
1936static int blockstatus_to_extents(BlockDriverState *bs, uint64_t offset,
1937                                  uint64_t *bytes, NBDExtent *extents,
1938                                  unsigned int *nb_extents)
1939{
1940    uint64_t remaining_bytes = *bytes;
1941    NBDExtent *extent = extents, *extents_end = extents + *nb_extents;
1942    bool first_extent = true;
1943
1944    assert(*nb_extents);
1945    while (remaining_bytes) {
1946        uint32_t flags;
1947        int64_t num;
1948        int ret = bdrv_block_status_above(bs, NULL, offset, remaining_bytes,
1949                                          &num, NULL, NULL);
1950
1951        if (ret < 0) {
1952            return ret;
1953        }
1954
1955        flags = (ret & BDRV_BLOCK_ALLOCATED ? 0 : NBD_STATE_HOLE) |
1956                (ret & BDRV_BLOCK_ZERO      ? NBD_STATE_ZERO : 0);
1957
1958        if (first_extent) {
1959            extent->flags = flags;
1960            extent->length = num;
1961            first_extent = false;
1962        } else if (flags == extent->flags) {
1963            /* extend current extent */
1964            extent->length += num;
1965        } else {
1966            if (extent + 1 == extents_end) {
1967                break;
1968            }
1969
1970            /* start new extent */
1971            extent++;
1972            extent->flags = flags;
1973            extent->length = num;
1974        }
1975        offset += num;
1976        remaining_bytes -= num;
1977    }
1978
1979    extents_end = extent + 1;
1980
1981    for (extent = extents; extent < extents_end; extent++) {
1982        extent->flags = cpu_to_be32(extent->flags);
1983        extent->length = cpu_to_be32(extent->length);
1984    }
1985
1986    *bytes -= remaining_bytes;
1987    *nb_extents = extents_end - extents;
1988
1989    return 0;
1990}
1991
1992/* nbd_co_send_extents
1993 *
1994 * @length is only for tracing purposes (and may be smaller or larger
1995 * than the client's original request). @last controls whether
1996 * NBD_REPLY_FLAG_DONE is sent. @extents should already be in
1997 * big-endian format.
1998 */
1999static int nbd_co_send_extents(NBDClient *client, uint64_t handle,
2000                               NBDExtent *extents, unsigned int nb_extents,
2001                               uint64_t length, bool last,
2002                               uint32_t context_id, Error **errp)
2003{
2004    NBDStructuredMeta chunk;
2005
2006    struct iovec iov[] = {
2007        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
2008        {.iov_base = extents, .iov_len = nb_extents * sizeof(extents[0])}
2009    };
2010
2011    trace_nbd_co_send_extents(handle, nb_extents, context_id, length, last);
2012    set_be_chunk(&chunk.h, last ? NBD_REPLY_FLAG_DONE : 0,
2013                 NBD_REPLY_TYPE_BLOCK_STATUS,
2014                 handle, sizeof(chunk) - sizeof(chunk.h) + iov[1].iov_len);
2015    stl_be_p(&chunk.context_id, context_id);
2016
2017    return nbd_co_send_iov(client, iov, 2, errp);
2018}
2019
2020/* Get block status from the exported device and send it to the client */
2021static int nbd_co_send_block_status(NBDClient *client, uint64_t handle,
2022                                    BlockDriverState *bs, uint64_t offset,
2023                                    uint32_t length, bool dont_fragment,
2024                                    bool last, uint32_t context_id,
2025                                    Error **errp)
2026{
2027    int ret;
2028    unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
2029    NBDExtent *extents = g_new(NBDExtent, nb_extents);
2030    uint64_t final_length = length;
2031
2032    ret = blockstatus_to_extents(bs, offset, &final_length, extents,
2033                                 &nb_extents);
2034    if (ret < 0) {
2035        g_free(extents);
2036        return nbd_co_send_structured_error(
2037                client, handle, -ret, "can't get block status", errp);
2038    }
2039
2040    ret = nbd_co_send_extents(client, handle, extents, nb_extents,
2041                              final_length, last, context_id, errp);
2042
2043    g_free(extents);
2044
2045    return ret;
2046}
2047
2048/*
2049 * Populate @extents from a dirty bitmap. Unless @dont_fragment, the
2050 * final extent may exceed the original @length. Store in @length the
2051 * byte length encoded (which may be smaller or larger than the
2052 * original), and return the number of extents used.
2053 */
2054static unsigned int bitmap_to_extents(BdrvDirtyBitmap *bitmap, uint64_t offset,
2055                                      uint64_t *length, NBDExtent *extents,
2056                                      unsigned int nb_extents,
2057                                      bool dont_fragment)
2058{
2059    uint64_t begin = offset, end = offset;
2060    uint64_t overall_end = offset + *length;
2061    unsigned int i = 0;
2062    BdrvDirtyBitmapIter *it;
2063    bool dirty;
2064
2065    bdrv_dirty_bitmap_lock(bitmap);
2066
2067    it = bdrv_dirty_iter_new(bitmap);
2068    dirty = bdrv_dirty_bitmap_get_locked(bitmap, offset);
2069
2070    assert(begin < overall_end && nb_extents);
2071    while (begin < overall_end && i < nb_extents) {
2072        bool next_dirty = !dirty;
2073
2074        if (dirty) {
2075            end = bdrv_dirty_bitmap_next_zero(bitmap, begin, UINT64_MAX);
2076        } else {
2077            bdrv_set_dirty_iter(it, begin);
2078            end = bdrv_dirty_iter_next(it);
2079        }
2080        if (end == -1 || end - begin > UINT32_MAX) {
2081            /* Cap to an aligned value < 4G beyond begin. */
2082            end = MIN(bdrv_dirty_bitmap_size(bitmap),
2083                      begin + UINT32_MAX + 1 -
2084                      bdrv_dirty_bitmap_granularity(bitmap));
2085            next_dirty = dirty;
2086        }
2087        if (dont_fragment && end > overall_end) {
2088            end = overall_end;
2089        }
2090
2091        extents[i].length = cpu_to_be32(end - begin);
2092        extents[i].flags = cpu_to_be32(dirty ? NBD_STATE_DIRTY : 0);
2093        i++;
2094        begin = end;
2095        dirty = next_dirty;
2096    }
2097
2098    bdrv_dirty_iter_free(it);
2099
2100    bdrv_dirty_bitmap_unlock(bitmap);
2101
2102    assert(offset < end);
2103    *length = end - offset;
2104    return i;
2105}
2106
2107static int nbd_co_send_bitmap(NBDClient *client, uint64_t handle,
2108                              BdrvDirtyBitmap *bitmap, uint64_t offset,
2109                              uint32_t length, bool dont_fragment, bool last,
2110                              uint32_t context_id, Error **errp)
2111{
2112    int ret;
2113    unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
2114    NBDExtent *extents = g_new(NBDExtent, nb_extents);
2115    uint64_t final_length = length;
2116
2117    nb_extents = bitmap_to_extents(bitmap, offset, &final_length, extents,
2118                                   nb_extents, dont_fragment);
2119
2120    ret = nbd_co_send_extents(client, handle, extents, nb_extents,
2121                              final_length, last, context_id, errp);
2122
2123    g_free(extents);
2124
2125    return ret;
2126}
2127
2128/* nbd_co_receive_request
2129 * Collect a client request. Return 0 if request looks valid, -EIO to drop
2130 * connection right away, and any other negative value to report an error to
2131 * the client (although the caller may still need to disconnect after reporting
2132 * the error).
2133 */
2134static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
2135                                  Error **errp)
2136{
2137    NBDClient *client = req->client;
2138    int valid_flags;
2139
2140    g_assert(qemu_in_coroutine());
2141    assert(client->recv_coroutine == qemu_coroutine_self());
2142    if (nbd_receive_request(client->ioc, request, errp) < 0) {
2143        return -EIO;
2144    }
2145
2146    trace_nbd_co_receive_request_decode_type(request->handle, request->type,
2147                                             nbd_cmd_lookup(request->type));
2148
2149    if (request->type != NBD_CMD_WRITE) {
2150        /* No payload, we are ready to read the next request.  */
2151        req->complete = true;
2152    }
2153
2154    if (request->type == NBD_CMD_DISC) {
2155        /* Special case: we're going to disconnect without a reply,
2156         * whether or not flags, from, or len are bogus */
2157        return -EIO;
2158    }
2159
2160    if (request->type == NBD_CMD_READ || request->type == NBD_CMD_WRITE ||
2161        request->type == NBD_CMD_CACHE)
2162    {
2163        if (request->len > NBD_MAX_BUFFER_SIZE) {
2164            error_setg(errp, "len (%" PRIu32" ) is larger than max len (%u)",
2165                       request->len, NBD_MAX_BUFFER_SIZE);
2166            return -EINVAL;
2167        }
2168
2169        if (request->type != NBD_CMD_CACHE) {
2170            req->data = blk_try_blockalign(client->exp->blk, request->len);
2171            if (req->data == NULL) {
2172                error_setg(errp, "No memory");
2173                return -ENOMEM;
2174            }
2175        }
2176    }
2177
2178    if (request->type == NBD_CMD_WRITE) {
2179        if (nbd_read(client->ioc, req->data, request->len, "CMD_WRITE data",
2180                     errp) < 0)
2181        {
2182            return -EIO;
2183        }
2184        req->complete = true;
2185
2186        trace_nbd_co_receive_request_payload_received(request->handle,
2187                                                      request->len);
2188    }
2189
2190    /* Sanity checks. */
2191    if (client->exp->nbdflags & NBD_FLAG_READ_ONLY &&
2192        (request->type == NBD_CMD_WRITE ||
2193         request->type == NBD_CMD_WRITE_ZEROES ||
2194         request->type == NBD_CMD_TRIM)) {
2195        error_setg(errp, "Export is read-only");
2196        return -EROFS;
2197    }
2198    if (request->from > client->exp->size ||
2199        request->len > client->exp->size - request->from) {
2200        error_setg(errp, "operation past EOF; From: %" PRIu64 ", Len: %" PRIu32
2201                   ", Size: %" PRIu64, request->from, request->len,
2202                   client->exp->size);
2203        return (request->type == NBD_CMD_WRITE ||
2204                request->type == NBD_CMD_WRITE_ZEROES) ? -ENOSPC : -EINVAL;
2205    }
2206    if (client->check_align && !QEMU_IS_ALIGNED(request->from | request->len,
2207                                                client->check_align)) {
2208        /*
2209         * The block layer gracefully handles unaligned requests, but
2210         * it's still worth tracing client non-compliance
2211         */
2212        trace_nbd_co_receive_align_compliance(nbd_cmd_lookup(request->type),
2213                                              request->from,
2214                                              request->len,
2215                                              client->check_align);
2216    }
2217    valid_flags = NBD_CMD_FLAG_FUA;
2218    if (request->type == NBD_CMD_READ && client->structured_reply) {
2219        valid_flags |= NBD_CMD_FLAG_DF;
2220    } else if (request->type == NBD_CMD_WRITE_ZEROES) {
2221        valid_flags |= NBD_CMD_FLAG_NO_HOLE | NBD_CMD_FLAG_FAST_ZERO;
2222    } else if (request->type == NBD_CMD_BLOCK_STATUS) {
2223        valid_flags |= NBD_CMD_FLAG_REQ_ONE;
2224    }
2225    if (request->flags & ~valid_flags) {
2226        error_setg(errp, "unsupported flags for command %s (got 0x%x)",
2227                   nbd_cmd_lookup(request->type), request->flags);
2228        return -EINVAL;
2229    }
2230
2231    return 0;
2232}
2233
2234/* Send simple reply without a payload, or a structured error
2235 * @error_msg is ignored if @ret >= 0
2236 * Returns 0 if connection is still live, -errno on failure to talk to client
2237 */
2238static coroutine_fn int nbd_send_generic_reply(NBDClient *client,
2239                                               uint64_t handle,
2240                                               int ret,
2241                                               const char *error_msg,
2242                                               Error **errp)
2243{
2244    if (client->structured_reply && ret < 0) {
2245        return nbd_co_send_structured_error(client, handle, -ret, error_msg,
2246                                            errp);
2247    } else {
2248        return nbd_co_send_simple_reply(client, handle, ret < 0 ? -ret : 0,
2249                                        NULL, 0, errp);
2250    }
2251}
2252
2253/* Handle NBD_CMD_READ request.
2254 * Return -errno if sending fails. Other errors are reported directly to the
2255 * client as an error reply. */
2256static coroutine_fn int nbd_do_cmd_read(NBDClient *client, NBDRequest *request,
2257                                        uint8_t *data, Error **errp)
2258{
2259    int ret;
2260    NBDExport *exp = client->exp;
2261
2262    assert(request->type == NBD_CMD_READ);
2263
2264    /* XXX: NBD Protocol only documents use of FUA with WRITE */
2265    if (request->flags & NBD_CMD_FLAG_FUA) {
2266        ret = blk_co_flush(exp->blk);
2267        if (ret < 0) {
2268            return nbd_send_generic_reply(client, request->handle, ret,
2269                                          "flush failed", errp);
2270        }
2271    }
2272
2273    if (client->structured_reply && !(request->flags & NBD_CMD_FLAG_DF) &&
2274        request->len)
2275    {
2276        return nbd_co_send_sparse_read(client, request->handle, request->from,
2277                                       data, request->len, errp);
2278    }
2279
2280    ret = blk_pread(exp->blk, request->from + exp->dev_offset, data,
2281                    request->len);
2282    if (ret < 0) {
2283        return nbd_send_generic_reply(client, request->handle, ret,
2284                                      "reading from file failed", errp);
2285    }
2286
2287    if (client->structured_reply) {
2288        if (request->len) {
2289            return nbd_co_send_structured_read(client, request->handle,
2290                                               request->from, data,
2291                                               request->len, true, errp);
2292        } else {
2293            return nbd_co_send_structured_done(client, request->handle, errp);
2294        }
2295    } else {
2296        return nbd_co_send_simple_reply(client, request->handle, 0,
2297                                        data, request->len, errp);
2298    }
2299}
2300
2301/*
2302 * nbd_do_cmd_cache
2303 *
2304 * Handle NBD_CMD_CACHE request.
2305 * Return -errno if sending fails. Other errors are reported directly to the
2306 * client as an error reply.
2307 */
2308static coroutine_fn int nbd_do_cmd_cache(NBDClient *client, NBDRequest *request,
2309                                         Error **errp)
2310{
2311    int ret;
2312    NBDExport *exp = client->exp;
2313
2314    assert(request->type == NBD_CMD_CACHE);
2315
2316    ret = blk_co_preadv(exp->blk, request->from + exp->dev_offset, request->len,
2317                        NULL, BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH);
2318
2319    return nbd_send_generic_reply(client, request->handle, ret,
2320                                  "caching data failed", errp);
2321}
2322
2323/* Handle NBD request.
2324 * Return -errno if sending fails. Other errors are reported directly to the
2325 * client as an error reply. */
2326static coroutine_fn int nbd_handle_request(NBDClient *client,
2327                                           NBDRequest *request,
2328                                           uint8_t *data, Error **errp)
2329{
2330    int ret;
2331    int flags;
2332    NBDExport *exp = client->exp;
2333    char *msg;
2334
2335    switch (request->type) {
2336    case NBD_CMD_CACHE:
2337        return nbd_do_cmd_cache(client, request, errp);
2338
2339    case NBD_CMD_READ:
2340        return nbd_do_cmd_read(client, request, data, errp);
2341
2342    case NBD_CMD_WRITE:
2343        flags = 0;
2344        if (request->flags & NBD_CMD_FLAG_FUA) {
2345            flags |= BDRV_REQ_FUA;
2346        }
2347        ret = blk_pwrite(exp->blk, request->from + exp->dev_offset,
2348                         data, request->len, flags);
2349        return nbd_send_generic_reply(client, request->handle, ret,
2350                                      "writing to file failed", errp);
2351
2352    case NBD_CMD_WRITE_ZEROES:
2353        flags = 0;
2354        if (request->flags & NBD_CMD_FLAG_FUA) {
2355            flags |= BDRV_REQ_FUA;
2356        }
2357        if (!(request->flags & NBD_CMD_FLAG_NO_HOLE)) {
2358            flags |= BDRV_REQ_MAY_UNMAP;
2359        }
2360        if (request->flags & NBD_CMD_FLAG_FAST_ZERO) {
2361            flags |= BDRV_REQ_NO_FALLBACK;
2362        }
2363        ret = blk_pwrite_zeroes(exp->blk, request->from + exp->dev_offset,
2364                                request->len, flags);
2365        return nbd_send_generic_reply(client, request->handle, ret,
2366                                      "writing to file failed", errp);
2367
2368    case NBD_CMD_DISC:
2369        /* unreachable, thanks to special case in nbd_co_receive_request() */
2370        abort();
2371
2372    case NBD_CMD_FLUSH:
2373        ret = blk_co_flush(exp->blk);
2374        return nbd_send_generic_reply(client, request->handle, ret,
2375                                      "flush failed", errp);
2376
2377    case NBD_CMD_TRIM:
2378        ret = blk_co_pdiscard(exp->blk, request->from + exp->dev_offset,
2379                              request->len);
2380        if (ret == 0 && request->flags & NBD_CMD_FLAG_FUA) {
2381            ret = blk_co_flush(exp->blk);
2382        }
2383        return nbd_send_generic_reply(client, request->handle, ret,
2384                                      "discard failed", errp);
2385
2386    case NBD_CMD_BLOCK_STATUS:
2387        if (!request->len) {
2388            return nbd_send_generic_reply(client, request->handle, -EINVAL,
2389                                          "need non-zero length", errp);
2390        }
2391        if (client->export_meta.valid &&
2392            (client->export_meta.base_allocation ||
2393             client->export_meta.bitmap))
2394        {
2395            bool dont_fragment = request->flags & NBD_CMD_FLAG_REQ_ONE;
2396
2397            if (client->export_meta.base_allocation) {
2398                ret = nbd_co_send_block_status(client, request->handle,
2399                                               blk_bs(exp->blk), request->from,
2400                                               request->len, dont_fragment,
2401                                               !client->export_meta.bitmap,
2402                                               NBD_META_ID_BASE_ALLOCATION,
2403                                               errp);
2404                if (ret < 0) {
2405                    return ret;
2406                }
2407            }
2408
2409            if (client->export_meta.bitmap) {
2410                ret = nbd_co_send_bitmap(client, request->handle,
2411                                         client->exp->export_bitmap,
2412                                         request->from, request->len,
2413                                         dont_fragment,
2414                                         true, NBD_META_ID_DIRTY_BITMAP, errp);
2415                if (ret < 0) {
2416                    return ret;
2417                }
2418            }
2419
2420            return ret;
2421        } else {
2422            return nbd_send_generic_reply(client, request->handle, -EINVAL,
2423                                          "CMD_BLOCK_STATUS not negotiated",
2424                                          errp);
2425        }
2426
2427    default:
2428        msg = g_strdup_printf("invalid request type (%" PRIu32 ") received",
2429                              request->type);
2430        ret = nbd_send_generic_reply(client, request->handle, -EINVAL, msg,
2431                                     errp);
2432        g_free(msg);
2433        return ret;
2434    }
2435}
2436
2437/* Owns a reference to the NBDClient passed as opaque.  */
2438static coroutine_fn void nbd_trip(void *opaque)
2439{
2440    NBDClient *client = opaque;
2441    NBDRequestData *req;
2442    NBDRequest request = { 0 };    /* GCC thinks it can be used uninitialized */
2443    int ret;
2444    Error *local_err = NULL;
2445
2446    trace_nbd_trip();
2447    if (client->closing) {
2448        nbd_client_put(client);
2449        return;
2450    }
2451
2452    req = nbd_request_get(client);
2453    ret = nbd_co_receive_request(req, &request, &local_err);
2454    client->recv_coroutine = NULL;
2455
2456    if (client->closing) {
2457        /*
2458         * The client may be closed when we are blocked in
2459         * nbd_co_receive_request()
2460         */
2461        goto done;
2462    }
2463
2464    nbd_client_receive_next_request(client);
2465    if (ret == -EIO) {
2466        goto disconnect;
2467    }
2468
2469    if (ret < 0) {
2470        /* It wans't -EIO, so, according to nbd_co_receive_request()
2471         * semantics, we should return the error to the client. */
2472        Error *export_err = local_err;
2473
2474        local_err = NULL;
2475        ret = nbd_send_generic_reply(client, request.handle, -EINVAL,
2476                                     error_get_pretty(export_err), &local_err);
2477        error_free(export_err);
2478    } else {
2479        ret = nbd_handle_request(client, &request, req->data, &local_err);
2480    }
2481    if (ret < 0) {
2482        error_prepend(&local_err, "Failed to send reply: ");
2483        goto disconnect;
2484    }
2485
2486    /* We must disconnect after NBD_CMD_WRITE if we did not
2487     * read the payload.
2488     */
2489    if (!req->complete) {
2490        error_setg(&local_err, "Request handling failed in intermediate state");
2491        goto disconnect;
2492    }
2493
2494done:
2495    nbd_request_put(req);
2496    nbd_client_put(client);
2497    return;
2498
2499disconnect:
2500    if (local_err) {
2501        error_reportf_err(local_err, "Disconnect client, due to: ");
2502    }
2503    nbd_request_put(req);
2504    client_close(client, true);
2505    nbd_client_put(client);
2506}
2507
2508static void nbd_client_receive_next_request(NBDClient *client)
2509{
2510    if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS) {
2511        nbd_client_get(client);
2512        client->recv_coroutine = qemu_coroutine_create(nbd_trip, client);
2513        aio_co_schedule(client->exp->ctx, client->recv_coroutine);
2514    }
2515}
2516
2517static coroutine_fn void nbd_co_client_start(void *opaque)
2518{
2519    NBDClient *client = opaque;
2520    Error *local_err = NULL;
2521
2522    qemu_co_mutex_init(&client->send_lock);
2523
2524    if (nbd_negotiate(client, &local_err)) {
2525        if (local_err) {
2526            error_report_err(local_err);
2527        }
2528        client_close(client, false);
2529        return;
2530    }
2531
2532    nbd_client_receive_next_request(client);
2533}
2534
2535/*
2536 * Create a new client listener using the given channel @sioc.
2537 * Begin servicing it in a coroutine.  When the connection closes, call
2538 * @close_fn with an indication of whether the client completed negotiation.
2539 */
2540void nbd_client_new(QIOChannelSocket *sioc,
2541                    QCryptoTLSCreds *tlscreds,
2542                    const char *tlsauthz,
2543                    void (*close_fn)(NBDClient *, bool))
2544{
2545    NBDClient *client;
2546    Coroutine *co;
2547
2548    client = g_new0(NBDClient, 1);
2549    client->refcount = 1;
2550    client->tlscreds = tlscreds;
2551    if (tlscreds) {
2552        object_ref(OBJECT(client->tlscreds));
2553    }
2554    client->tlsauthz = g_strdup(tlsauthz);
2555    client->sioc = sioc;
2556    object_ref(OBJECT(client->sioc));
2557    client->ioc = QIO_CHANNEL(sioc);
2558    object_ref(OBJECT(client->ioc));
2559    client->close_fn = close_fn;
2560
2561    co = qemu_coroutine_create(nbd_co_client_start, client);
2562    qemu_coroutine_enter(co);
2563}
2564