qemu/nbd/server.c
<<
>>
Prefs
   1/*
   2 *  Copyright (C) 2016-2017 Red Hat, Inc.
   3 *  Copyright (C) 2005  Anthony Liguori <anthony@codemonkey.ws>
   4 *
   5 *  Network Block Device Server Side
   6 *
   7 *  This program is free software; you can redistribute it and/or modify
   8 *  it under the terms of the GNU General Public License as published by
   9 *  the Free Software Foundation; under version 2 of the License.
  10 *
  11 *  This program is distributed in the hope that it will be useful,
  12 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 *  GNU General Public License for more details.
  15 *
  16 *  You should have received a copy of the GNU General Public License
  17 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "qapi/error.h"
  22#include "trace.h"
  23#include "nbd-internal.h"
  24
  25static int system_errno_to_nbd_errno(int err)
  26{
  27    switch (err) {
  28    case 0:
  29        return NBD_SUCCESS;
  30    case EPERM:
  31    case EROFS:
  32        return NBD_EPERM;
  33    case EIO:
  34        return NBD_EIO;
  35    case ENOMEM:
  36        return NBD_ENOMEM;
  37#ifdef EDQUOT
  38    case EDQUOT:
  39#endif
  40    case EFBIG:
  41    case ENOSPC:
  42        return NBD_ENOSPC;
  43    case EOVERFLOW:
  44        return NBD_EOVERFLOW;
  45    case ESHUTDOWN:
  46        return NBD_ESHUTDOWN;
  47    case EINVAL:
  48    default:
  49        return NBD_EINVAL;
  50    }
  51}
  52
  53/* Definitions for opaque data types */
  54
  55typedef struct NBDRequestData NBDRequestData;
  56
  57struct NBDRequestData {
  58    QSIMPLEQ_ENTRY(NBDRequestData) entry;
  59    NBDClient *client;
  60    uint8_t *data;
  61    bool complete;
  62};
  63
/* Server-side state describing one exported block device. */
struct NBDExport {
    int refcount;
    void (*close)(NBDExport *exp);

    /* Backing device; its maximum transfer size is queried when answering
     * NBD_OPT_INFO / NBD_OPT_GO */
    BlockBackend *blk;
    /* Name and description advertised in NBD_OPT_LIST replies; either may
     * be NULL, in which case an empty string is sent instead */
    char *name;
    char *description;
    off_t dev_offset;
    /* Size and NBD_FLAG_* bits reported to the client during negotiation */
    off_t size;
    uint16_t nbdflags;
    /* Clients attached to this export, linked through NBDClient.next */
    QTAILQ_HEAD(, NBDClient) clients;
    /* Linkage in the file-global 'exports' list */
    QTAILQ_ENTRY(NBDExport) next;

    AioContext *ctx;

    BlockBackend *eject_notifier_blk;
    Notifier eject_notifier;
};
  82
/* All exports known to the server; walked when answering NBD_OPT_LIST. */
static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
  84
/* Server-side state for one client connection. */
struct NBDClient {
    int refcount;
    void (*close_fn)(NBDClient *client, bool negotiated);

    /* Export the client is attached to.  If it is already set (and no TLS
     * credentials are configured) when negotiation begins, old-style
     * negotiation is used; otherwise it is chosen by NBD_OPT_EXPORT_NAME
     * or NBD_OPT_GO. */
    NBDExport *exp;
    /* When non-NULL, option negotiation only honours NBD_OPT_STARTTLS
     * while the connection is still on the plain socket channel */
    QCryptoTLSCreds *tlscreds;
    /* Handed to qio_channel_tls_new_server() when upgrading to TLS */
    char *tlsaclname;
    QIOChannelSocket *sioc; /* The underlying data channel */
    QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */

    Coroutine *recv_coroutine;

    CoMutex send_lock;
    Coroutine *send_coroutine;

    /* Linkage in NBDExport.clients */
    QTAILQ_ENTRY(NBDClient) next;
    int nb_requests;
    bool closing;

    /* Set once the client has successfully negotiated
     * NBD_OPT_STRUCTURED_REPLY */
    bool structured_reply;
};
 106
 107/* That's all folks */
 108
/* Forward declaration; the function is defined further down in this file. */
static void nbd_client_receive_next_request(NBDClient *client);
 110
 111/* Basic flow for negotiation
 112
 113   Server         Client
 114   Negotiate
 115
 116   or
 117
 118   Server         Client
 119   Negotiate #1
 120                  Option
 121   Negotiate #2
 122
 123   ----
 124
 125   followed by
 126
 127   Server         Client
 128                  Request
 129   Response
 130                  Request
 131   Response
 132                  ...
 133   ...
 134                  Request (type == 2)
 135
 136*/
 137
 138/* Send a reply header, including length, but no payload.
 139 * Return -errno on error, 0 on success. */
 140static int nbd_negotiate_send_rep_len(QIOChannel *ioc, uint32_t type,
 141                                      uint32_t opt, uint32_t len, Error **errp)
 142{
 143    uint64_t magic;
 144
 145    trace_nbd_negotiate_send_rep_len(opt, nbd_opt_lookup(opt),
 146                                     type, nbd_rep_lookup(type), len);
 147
 148    assert(len < NBD_MAX_BUFFER_SIZE);
 149    magic = cpu_to_be64(NBD_REP_MAGIC);
 150    if (nbd_write(ioc, &magic, sizeof(magic), errp) < 0) {
 151        error_prepend(errp, "write failed (rep magic): ");
 152        return -EINVAL;
 153    }
 154
 155    opt = cpu_to_be32(opt);
 156    if (nbd_write(ioc, &opt, sizeof(opt), errp) < 0) {
 157        error_prepend(errp, "write failed (rep opt): ");
 158        return -EINVAL;
 159    }
 160
 161    type = cpu_to_be32(type);
 162    if (nbd_write(ioc, &type, sizeof(type), errp) < 0) {
 163        error_prepend(errp, "write failed (rep type): ");
 164        return -EINVAL;
 165    }
 166
 167    len = cpu_to_be32(len);
 168    if (nbd_write(ioc, &len, sizeof(len), errp) < 0) {
 169        error_prepend(errp, "write failed (rep data length): ");
 170        return -EINVAL;
 171    }
 172    return 0;
 173}
 174
 175/* Send a reply header with default 0 length.
 176 * Return -errno on error, 0 on success. */
 177static int nbd_negotiate_send_rep(QIOChannel *ioc, uint32_t type, uint32_t opt,
 178                                  Error **errp)
 179{
 180    return nbd_negotiate_send_rep_len(ioc, type, opt, 0, errp);
 181}
 182
 183/* Send an error reply.
 184 * Return -errno on error, 0 on success. */
 185static int GCC_FMT_ATTR(5, 6)
 186nbd_negotiate_send_rep_err(QIOChannel *ioc, uint32_t type,
 187                           uint32_t opt, Error **errp, const char *fmt, ...)
 188{
 189    va_list va;
 190    char *msg;
 191    int ret;
 192    size_t len;
 193
 194    va_start(va, fmt);
 195    msg = g_strdup_vprintf(fmt, va);
 196    va_end(va);
 197    len = strlen(msg);
 198    assert(len < 4096);
 199    trace_nbd_negotiate_send_rep_err(msg);
 200    ret = nbd_negotiate_send_rep_len(ioc, type, opt, len, errp);
 201    if (ret < 0) {
 202        goto out;
 203    }
 204    if (nbd_write(ioc, msg, len, errp) < 0) {
 205        error_prepend(errp, "write failed (error message): ");
 206        ret = -EIO;
 207    } else {
 208        ret = 0;
 209    }
 210
 211out:
 212    g_free(msg);
 213    return ret;
 214}
 215
 216/* Send a single NBD_REP_SERVER reply to NBD_OPT_LIST, including payload.
 217 * Return -errno on error, 0 on success. */
 218static int nbd_negotiate_send_rep_list(QIOChannel *ioc, NBDExport *exp,
 219                                       Error **errp)
 220{
 221    size_t name_len, desc_len;
 222    uint32_t len;
 223    const char *name = exp->name ? exp->name : "";
 224    const char *desc = exp->description ? exp->description : "";
 225    int ret;
 226
 227    trace_nbd_negotiate_send_rep_list(name, desc);
 228    name_len = strlen(name);
 229    desc_len = strlen(desc);
 230    len = name_len + desc_len + sizeof(len);
 231    ret = nbd_negotiate_send_rep_len(ioc, NBD_REP_SERVER, NBD_OPT_LIST, len,
 232                                     errp);
 233    if (ret < 0) {
 234        return ret;
 235    }
 236
 237    len = cpu_to_be32(name_len);
 238    if (nbd_write(ioc, &len, sizeof(len), errp) < 0) {
 239        error_prepend(errp, "write failed (name length): ");
 240        return -EINVAL;
 241    }
 242
 243    if (nbd_write(ioc, name, name_len, errp) < 0) {
 244        error_prepend(errp, "write failed (name buffer): ");
 245        return -EINVAL;
 246    }
 247
 248    if (nbd_write(ioc, desc, desc_len, errp) < 0) {
 249        error_prepend(errp, "write failed (description buffer): ");
 250        return -EINVAL;
 251    }
 252
 253    return 0;
 254}
 255
 256/* Process the NBD_OPT_LIST command, with a potential series of replies.
 257 * Return -errno on error, 0 on success. */
 258static int nbd_negotiate_handle_list(NBDClient *client, Error **errp)
 259{
 260    NBDExport *exp;
 261
 262    /* For each export, send a NBD_REP_SERVER reply. */
 263    QTAILQ_FOREACH(exp, &exports, next) {
 264        if (nbd_negotiate_send_rep_list(client->ioc, exp, errp)) {
 265            return -EINVAL;
 266        }
 267    }
 268    /* Finish with a NBD_REP_ACK. */
 269    return nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK, NBD_OPT_LIST, errp);
 270}
 271
 272/* Send a reply to NBD_OPT_EXPORT_NAME.
 273 * Return -errno on error, 0 on success. */
 274static int nbd_negotiate_handle_export_name(NBDClient *client, uint32_t length,
 275                                            uint16_t myflags, bool no_zeroes,
 276                                            Error **errp)
 277{
 278    char name[NBD_MAX_NAME_SIZE + 1];
 279    char buf[NBD_REPLY_EXPORT_NAME_SIZE] = "";
 280    size_t len;
 281    int ret;
 282
 283    /* Client sends:
 284        [20 ..  xx]   export name (length bytes)
 285       Server replies:
 286        [ 0 ..   7]   size
 287        [ 8 ..   9]   export flags
 288        [10 .. 133]   reserved     (0) [unless no_zeroes]
 289     */
 290    trace_nbd_negotiate_handle_export_name();
 291    if (length >= sizeof(name)) {
 292        error_setg(errp, "Bad length received");
 293        return -EINVAL;
 294    }
 295    if (nbd_read(client->ioc, name, length, errp) < 0) {
 296        error_prepend(errp, "read failed: ");
 297        return -EINVAL;
 298    }
 299    name[length] = '\0';
 300
 301    trace_nbd_negotiate_handle_export_name_request(name);
 302
 303    client->exp = nbd_export_find(name);
 304    if (!client->exp) {
 305        error_setg(errp, "export not found");
 306        return -EINVAL;
 307    }
 308
 309    trace_nbd_negotiate_new_style_size_flags(client->exp->size,
 310                                             client->exp->nbdflags | myflags);
 311    stq_be_p(buf, client->exp->size);
 312    stw_be_p(buf + 8, client->exp->nbdflags | myflags);
 313    len = no_zeroes ? 10 : sizeof(buf);
 314    ret = nbd_write(client->ioc, buf, len, errp);
 315    if (ret < 0) {
 316        error_prepend(errp, "write failed: ");
 317        return ret;
 318    }
 319
 320    QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
 321    nbd_export_get(client->exp);
 322
 323    return 0;
 324}
 325
 326/* Send a single NBD_REP_INFO, with a buffer @buf of @length bytes.
 327 * The buffer does NOT include the info type prefix.
 328 * Return -errno on error, 0 if ready to send more. */
 329static int nbd_negotiate_send_info(NBDClient *client, uint32_t opt,
 330                                   uint16_t info, uint32_t length, void *buf,
 331                                   Error **errp)
 332{
 333    int rc;
 334
 335    trace_nbd_negotiate_send_info(info, nbd_info_lookup(info), length);
 336    rc = nbd_negotiate_send_rep_len(client->ioc, NBD_REP_INFO, opt,
 337                                    sizeof(info) + length, errp);
 338    if (rc < 0) {
 339        return rc;
 340    }
 341    cpu_to_be16s(&info);
 342    if (nbd_write(client->ioc, &info, sizeof(info), errp) < 0) {
 343        return -EIO;
 344    }
 345    if (nbd_write(client->ioc, buf, length, errp) < 0) {
 346        return -EIO;
 347    }
 348    return 0;
 349}
 350
/* Handle NBD_OPT_INFO and NBD_OPT_GO.
 *
 * Parses the client's payload (export name plus a list of requested info
 * types), then replies with a series of NBD_REP_INFO chunks followed by
 * either NBD_REP_ACK or an error reply.  @length is the number of payload
 * bytes still unread on the channel; it is decremented as fields are
 * consumed so that the 'invalid' path can discard exactly the remainder.
 * For NBD_OPT_GO, a successful reply also attaches @client to the export.
 *
 * Return -errno on error, 0 if ready for next option, and 1 to move
 * into transmission phase.  */
static int nbd_negotiate_handle_info(NBDClient *client, uint32_t length,
                                     uint32_t opt, uint16_t myflags,
                                     Error **errp)
{
    int rc;
    char name[NBD_MAX_NAME_SIZE + 1];
    NBDExport *exp;
    uint16_t requests;
    uint16_t request;
    uint32_t namelen;
    bool sendname = false;
    bool blocksize = false;
    uint32_t sizes[3];
    char buf[sizeof(uint64_t) + sizeof(uint16_t)];
    const char *msg;

    /* Client sends:
        4 bytes: L, name length (can be 0)
        L bytes: export name
        2 bytes: N, number of requests (can be 0)
        N * 2 bytes: N requests
    */
    /* Smallest legal payload: the two fixed-size fields with L = N = 0 */
    if (length < sizeof(namelen) + sizeof(requests)) {
        msg = "overall request too short";
        goto invalid;
    }
    if (nbd_read(client->ioc, &namelen, sizeof(namelen), errp) < 0) {
        return -EIO;
    }
    be32_to_cpus(&namelen);
    length -= sizeof(namelen);
    /* The name must leave room for the request count, and what follows
     * the name must be a whole number of 16-bit fields */
    if (namelen > length - sizeof(requests) || (length - namelen) % 2) {
        msg = "name length is incorrect";
        goto invalid;
    }
    /* Keep one byte of name[] free for the terminating NUL */
    if (namelen >= sizeof(name)) {
        msg = "name too long for qemu";
        goto invalid;
    }
    if (nbd_read(client->ioc, name, namelen, errp) < 0) {
        return -EIO;
    }
    name[namelen] = '\0';
    length -= namelen;
    trace_nbd_negotiate_handle_export_name_request(name);

    if (nbd_read(client->ioc, &requests, sizeof(requests), errp) < 0) {
        return -EIO;
    }
    be16_to_cpus(&requests);
    length -= sizeof(requests);
    trace_nbd_negotiate_handle_info_requests(requests);
    /* The announced count must account for every remaining byte */
    if (requests != length / sizeof(request)) {
        msg = "incorrect number of  requests for overall length";
        goto invalid;
    }
    while (requests--) {
        if (nbd_read(client->ioc, &request, sizeof(request), errp) < 0) {
            return -EIO;
        }
        be16_to_cpus(&request);
        length -= sizeof(request);
        trace_nbd_negotiate_handle_info_request(request,
                                                nbd_info_lookup(request));
        /* We care about NBD_INFO_NAME and NBD_INFO_BLOCK_SIZE;
         * everything else is either a request we don't know or
         * something we send regardless of request */
        switch (request) {
        case NBD_INFO_NAME:
            sendname = true;
            break;
        case NBD_INFO_BLOCK_SIZE:
            blocksize = true;
            break;
        }
    }
    /* Guaranteed by the count check above: the payload is fully consumed */
    assert(length == 0);

    exp = nbd_export_find(name);
    if (!exp) {
        /* Unknown export is reported to the client, not fatal: the return
         * value only reflects whether the error reply could be sent */
        return nbd_negotiate_send_rep_err(client->ioc, NBD_REP_ERR_UNKNOWN,
                                          opt, errp, "export '%s' not present",
                                          name);
    }

    /* Don't bother sending NBD_INFO_NAME unless client requested it */
    if (sendname) {
        rc = nbd_negotiate_send_info(client, opt, NBD_INFO_NAME, namelen, name,
                                     errp);
        if (rc < 0) {
            return rc;
        }
    }

    /* Send NBD_INFO_DESCRIPTION only if available, regardless of
     * client request */
    if (exp->description) {
        size_t len = strlen(exp->description);

        rc = nbd_negotiate_send_info(client, opt, NBD_INFO_DESCRIPTION,
                                     len, exp->description, errp);
        if (rc < 0) {
            return rc;
        }
    }

    /* Send NBD_INFO_BLOCK_SIZE always, but tweak the minimum size
     * according to whether the client requested it, and according to
     * whether this is OPT_INFO or OPT_GO. */
    /* minimum - 1 for back-compat, or 512 if client is new enough.
     * TODO: consult blk_bs(blk)->bl.request_alignment? */
    sizes[0] = (opt == NBD_OPT_INFO || blocksize) ? BDRV_SECTOR_SIZE : 1;
    /* preferred - Hard-code to 4096 for now.
     * TODO: is blk_bs(blk)->bl.opt_transfer appropriate? */
    sizes[1] = 4096;
    /* maximum - At most 32M, but smaller as appropriate. */
    sizes[2] = MIN(blk_get_max_transfer(exp->blk), NBD_MAX_BUFFER_SIZE);
    trace_nbd_negotiate_handle_info_block_size(sizes[0], sizes[1], sizes[2]);
    /* Convert in place to network byte order (after tracing host values) */
    cpu_to_be32s(&sizes[0]);
    cpu_to_be32s(&sizes[1]);
    cpu_to_be32s(&sizes[2]);
    rc = nbd_negotiate_send_info(client, opt, NBD_INFO_BLOCK_SIZE,
                                 sizeof(sizes), sizes, errp);
    if (rc < 0) {
        return rc;
    }

    /* Send NBD_INFO_EXPORT always */
    trace_nbd_negotiate_new_style_size_flags(exp->size,
                                             exp->nbdflags | myflags);
    /* 8 bytes of size followed by 2 bytes of flags, both big-endian */
    stq_be_p(buf, exp->size);
    stw_be_p(buf + 8, exp->nbdflags | myflags);
    rc = nbd_negotiate_send_info(client, opt, NBD_INFO_EXPORT,
                                 sizeof(buf), buf, errp);
    if (rc < 0) {
        return rc;
    }

    /* If the client is just asking for NBD_OPT_INFO, but forgot to
     * request block sizes, return an error.
     * TODO: consult blk_bs(blk)->request_align, and only error if it
     * is not 1? */
    if (opt == NBD_OPT_INFO && !blocksize) {
        return nbd_negotiate_send_rep_err(client->ioc,
                                          NBD_REP_ERR_BLOCK_SIZE_REQD, opt,
                                          errp,
                                          "request NBD_INFO_BLOCK_SIZE to "
                                          "use this export");
    }

    /* Final reply */
    rc = nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK, opt, errp);
    if (rc < 0) {
        return rc;
    }

    /* Only NBD_OPT_GO binds the client to the export; NBD_OPT_INFO is a
     * pure query and falls through with rc == 0 */
    if (opt == NBD_OPT_GO) {
        client->exp = exp;
        QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
        nbd_export_get(client->exp);
        rc = 1;
    }
    return rc;

 invalid:
    /* Discard whatever part of the payload is still unread so the stream
     * stays in sync, then report the malformed request to the client */
    if (nbd_drop(client->ioc, length, errp) < 0) {
        return -EIO;
    }
    return nbd_negotiate_send_rep_err(client->ioc, NBD_REP_ERR_INVALID, opt,
                                      errp, "%s", msg);
}
 525
 526
 527/* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the
 528 * new channel for all further (now-encrypted) communication. */
 529static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
 530                                                 Error **errp)
 531{
 532    QIOChannel *ioc;
 533    QIOChannelTLS *tioc;
 534    struct NBDTLSHandshakeData data = { 0 };
 535
 536    trace_nbd_negotiate_handle_starttls();
 537    ioc = client->ioc;
 538
 539    if (nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK,
 540                               NBD_OPT_STARTTLS, errp) < 0) {
 541        return NULL;
 542    }
 543
 544    tioc = qio_channel_tls_new_server(ioc,
 545                                      client->tlscreds,
 546                                      client->tlsaclname,
 547                                      errp);
 548    if (!tioc) {
 549        return NULL;
 550    }
 551
 552    qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-server-tls");
 553    trace_nbd_negotiate_handle_starttls_handshake();
 554    data.loop = g_main_loop_new(g_main_context_default(), FALSE);
 555    qio_channel_tls_handshake(tioc,
 556                              nbd_tls_handshake,
 557                              &data,
 558                              NULL);
 559
 560    if (!data.complete) {
 561        g_main_loop_run(data.loop);
 562    }
 563    g_main_loop_unref(data.loop);
 564    if (data.error) {
 565        object_unref(OBJECT(tioc));
 566        error_propagate(errp, data.error);
 567        return NULL;
 568    }
 569
 570    return QIO_CHANNEL(tioc);
 571}
 572
 573/* nbd_reject_length: Handle any unexpected payload.
 574 * @fatal requests that we quit talking to the client, even if we are able
 575 * to successfully send an error to the guest.
 576 * Return:
 577 * -errno  transmission error occurred or @fatal was requested, errp is set
 578 * 0       error message successfully sent to client, errp is not set
 579 */
 580static int nbd_reject_length(NBDClient *client, uint32_t length,
 581                             uint32_t option, bool fatal, Error **errp)
 582{
 583    int ret;
 584
 585    assert(length);
 586    if (nbd_drop(client->ioc, length, errp) < 0) {
 587        return -EIO;
 588    }
 589    ret = nbd_negotiate_send_rep_err(client->ioc, NBD_REP_ERR_INVALID,
 590                                     option, errp,
 591                                     "option '%s' should have zero length",
 592                                     nbd_opt_lookup(option));
 593    if (fatal && !ret) {
 594        error_setg(errp, "option '%s' should have zero length",
 595                   nbd_opt_lookup(option));
 596        return -EINVAL;
 597    }
 598    return ret;
 599}
 600
 601/* nbd_negotiate_options
 602 * Process all NBD_OPT_* client option commands, during fixed newstyle
 603 * negotiation.
 604 * Return:
 605 * -errno  on error, errp is set
 606 * 0       on successful negotiation, errp is not set
 607 * 1       if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
 608 *         errp is not set
 609 */
 610static int nbd_negotiate_options(NBDClient *client, uint16_t myflags,
 611                                 Error **errp)
 612{
 613    uint32_t flags;
 614    bool fixedNewstyle = false;
 615    bool no_zeroes = false;
 616
 617    /* Client sends:
 618        [ 0 ..   3]   client flags
 619
 620       Then we loop until NBD_OPT_EXPORT_NAME or NBD_OPT_GO:
 621        [ 0 ..   7]   NBD_OPTS_MAGIC
 622        [ 8 ..  11]   NBD option
 623        [12 ..  15]   Data length
 624        ...           Rest of request
 625
 626        [ 0 ..   7]   NBD_OPTS_MAGIC
 627        [ 8 ..  11]   Second NBD option
 628        [12 ..  15]   Data length
 629        ...           Rest of request
 630    */
 631
 632    if (nbd_read(client->ioc, &flags, sizeof(flags), errp) < 0) {
 633        error_prepend(errp, "read failed: ");
 634        return -EIO;
 635    }
 636    be32_to_cpus(&flags);
 637    trace_nbd_negotiate_options_flags(flags);
 638    if (flags & NBD_FLAG_C_FIXED_NEWSTYLE) {
 639        fixedNewstyle = true;
 640        flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE;
 641    }
 642    if (flags & NBD_FLAG_C_NO_ZEROES) {
 643        no_zeroes = true;
 644        flags &= ~NBD_FLAG_C_NO_ZEROES;
 645    }
 646    if (flags != 0) {
 647        error_setg(errp, "Unknown client flags 0x%" PRIx32 " received", flags);
 648        return -EINVAL;
 649    }
 650
 651    while (1) {
 652        int ret;
 653        uint32_t option, length;
 654        uint64_t magic;
 655
 656        if (nbd_read(client->ioc, &magic, sizeof(magic), errp) < 0) {
 657            error_prepend(errp, "read failed: ");
 658            return -EINVAL;
 659        }
 660        magic = be64_to_cpu(magic);
 661        trace_nbd_negotiate_options_check_magic(magic);
 662        if (magic != NBD_OPTS_MAGIC) {
 663            error_setg(errp, "Bad magic received");
 664            return -EINVAL;
 665        }
 666
 667        if (nbd_read(client->ioc, &option,
 668                     sizeof(option), errp) < 0) {
 669            error_prepend(errp, "read failed: ");
 670            return -EINVAL;
 671        }
 672        option = be32_to_cpu(option);
 673
 674        if (nbd_read(client->ioc, &length, sizeof(length), errp) < 0) {
 675            error_prepend(errp, "read failed: ");
 676            return -EINVAL;
 677        }
 678        length = be32_to_cpu(length);
 679
 680        if (length > NBD_MAX_BUFFER_SIZE) {
 681            error_setg(errp, "len (%" PRIu32" ) is larger than max len (%u)",
 682                       length, NBD_MAX_BUFFER_SIZE);
 683            return -EINVAL;
 684        }
 685
 686        trace_nbd_negotiate_options_check_option(option,
 687                                                 nbd_opt_lookup(option));
 688        if (client->tlscreds &&
 689            client->ioc == (QIOChannel *)client->sioc) {
 690            QIOChannel *tioc;
 691            if (!fixedNewstyle) {
 692                error_setg(errp, "Unsupported option 0x%" PRIx32, option);
 693                return -EINVAL;
 694            }
 695            switch (option) {
 696            case NBD_OPT_STARTTLS:
 697                if (length) {
 698                    /* Unconditionally drop the connection if the client
 699                     * can't start a TLS negotiation correctly */
 700                    return nbd_reject_length(client, length, option, true,
 701                                             errp);
 702                }
 703                tioc = nbd_negotiate_handle_starttls(client, errp);
 704                if (!tioc) {
 705                    return -EIO;
 706                }
 707                ret = 0;
 708                object_unref(OBJECT(client->ioc));
 709                client->ioc = QIO_CHANNEL(tioc);
 710                break;
 711
 712            case NBD_OPT_EXPORT_NAME:
 713                /* No way to return an error to client, so drop connection */
 714                error_setg(errp, "Option 0x%x not permitted before TLS",
 715                           option);
 716                return -EINVAL;
 717
 718            default:
 719                if (nbd_drop(client->ioc, length, errp) < 0) {
 720                    return -EIO;
 721                }
 722                ret = nbd_negotiate_send_rep_err(client->ioc,
 723                                                 NBD_REP_ERR_TLS_REQD,
 724                                                 option, errp,
 725                                                 "Option 0x%" PRIx32
 726                                                 "not permitted before TLS",
 727                                                 option);
 728                /* Let the client keep trying, unless they asked to
 729                 * quit. In this mode, we've already sent an error, so
 730                 * we can't ack the abort.  */
 731                if (option == NBD_OPT_ABORT) {
 732                    return 1;
 733                }
 734                break;
 735            }
 736        } else if (fixedNewstyle) {
 737            switch (option) {
 738            case NBD_OPT_LIST:
 739                if (length) {
 740                    ret = nbd_reject_length(client, length, option, false,
 741                                            errp);
 742                } else {
 743                    ret = nbd_negotiate_handle_list(client, errp);
 744                }
 745                break;
 746
 747            case NBD_OPT_ABORT:
 748                /* NBD spec says we must try to reply before
 749                 * disconnecting, but that we must also tolerate
 750                 * guests that don't wait for our reply. */
 751                nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK, option, NULL);
 752                return 1;
 753
 754            case NBD_OPT_EXPORT_NAME:
 755                return nbd_negotiate_handle_export_name(client, length,
 756                                                        myflags, no_zeroes,
 757                                                        errp);
 758
 759            case NBD_OPT_INFO:
 760            case NBD_OPT_GO:
 761                ret = nbd_negotiate_handle_info(client, length, option,
 762                                                myflags, errp);
 763                if (ret == 1) {
 764                    assert(option == NBD_OPT_GO);
 765                    return 0;
 766                }
 767                break;
 768
 769            case NBD_OPT_STARTTLS:
 770                if (length) {
 771                    ret = nbd_reject_length(client, length, option, false,
 772                                            errp);
 773                } else if (client->tlscreds) {
 774                    ret = nbd_negotiate_send_rep_err(client->ioc,
 775                                                     NBD_REP_ERR_INVALID,
 776                                                     option, errp,
 777                                                     "TLS already enabled");
 778                } else {
 779                    ret = nbd_negotiate_send_rep_err(client->ioc,
 780                                                     NBD_REP_ERR_POLICY,
 781                                                     option, errp,
 782                                                     "TLS not configured");
 783                }
 784                break;
 785
 786            case NBD_OPT_STRUCTURED_REPLY:
 787                if (length) {
 788                    ret = nbd_reject_length(client, length, option, false,
 789                                            errp);
 790                } else if (client->structured_reply) {
 791                    ret = nbd_negotiate_send_rep_err(
 792                        client->ioc, NBD_REP_ERR_INVALID, option, errp,
 793                        "structured reply already negotiated");
 794                } else {
 795                    ret = nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK,
 796                                                 option, errp);
 797                    client->structured_reply = true;
 798                    myflags |= NBD_FLAG_SEND_DF;
 799                }
 800                break;
 801
 802            default:
 803                if (nbd_drop(client->ioc, length, errp) < 0) {
 804                    return -EIO;
 805                }
 806                ret = nbd_negotiate_send_rep_err(client->ioc,
 807                                                 NBD_REP_ERR_UNSUP,
 808                                                 option, errp,
 809                                                 "Unsupported option 0x%"
 810                                                 PRIx32 " (%s)", option,
 811                                                 nbd_opt_lookup(option));
 812                break;
 813            }
 814        } else {
 815            /*
 816             * If broken new-style we should drop the connection
 817             * for anything except NBD_OPT_EXPORT_NAME
 818             */
 819            switch (option) {
 820            case NBD_OPT_EXPORT_NAME:
 821                return nbd_negotiate_handle_export_name(client, length,
 822                                                        myflags, no_zeroes,
 823                                                        errp);
 824
 825            default:
 826                error_setg(errp, "Unsupported option 0x%" PRIx32 " (%s)",
 827                           option, nbd_opt_lookup(option));
 828                return -EINVAL;
 829            }
 830        }
 831        if (ret < 0) {
 832            return ret;
 833        }
 834    }
 835}
 836
 837/* nbd_negotiate
 838 * Return:
 839 * -errno  on error, errp is set
 840 * 0       on successful negotiation, errp is not set
 841 * 1       if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
 842 *         errp is not set
 843 */
 844static coroutine_fn int nbd_negotiate(NBDClient *client, Error **errp)
 845{
 846    char buf[NBD_OLDSTYLE_NEGOTIATE_SIZE] = "";
 847    int ret;
 848    const uint16_t myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
 849                              NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA |
 850                              NBD_FLAG_SEND_WRITE_ZEROES);
 851    bool oldStyle;
 852
 853    /* Old style negotiation header, no room for options
 854        [ 0 ..   7]   passwd       ("NBDMAGIC")
 855        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
 856        [16 ..  23]   size
 857        [24 ..  27]   export flags (zero-extended)
 858        [28 .. 151]   reserved     (0)
 859
 860       New style negotiation header, client can send options
 861        [ 0 ..   7]   passwd       ("NBDMAGIC")
 862        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
 863        [16 ..  17]   server flags (0)
 864        ....options sent, ending in NBD_OPT_EXPORT_NAME or NBD_OPT_GO....
 865     */
 866
 867    qio_channel_set_blocking(client->ioc, false, NULL);
 868
 869    trace_nbd_negotiate_begin();
 870    memcpy(buf, "NBDMAGIC", 8);
 871
 872    oldStyle = client->exp != NULL && !client->tlscreds;
 873    if (oldStyle) {
 874        trace_nbd_negotiate_old_style(client->exp->size,
 875                                      client->exp->nbdflags | myflags);
 876        stq_be_p(buf + 8, NBD_CLIENT_MAGIC);
 877        stq_be_p(buf + 16, client->exp->size);
 878        stl_be_p(buf + 24, client->exp->nbdflags | myflags);
 879
 880        if (nbd_write(client->ioc, buf, sizeof(buf), errp) < 0) {
 881            error_prepend(errp, "write failed: ");
 882            return -EINVAL;
 883        }
 884    } else {
 885        stq_be_p(buf + 8, NBD_OPTS_MAGIC);
 886        stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES);
 887
 888        if (nbd_write(client->ioc, buf, 18, errp) < 0) {
 889            error_prepend(errp, "write failed: ");
 890            return -EINVAL;
 891        }
 892        ret = nbd_negotiate_options(client, myflags, errp);
 893        if (ret != 0) {
 894            if (ret < 0) {
 895                error_prepend(errp, "option negotiation failed: ");
 896            }
 897            return ret;
 898        }
 899    }
 900
 901    trace_nbd_negotiate_success();
 902
 903    return 0;
 904}
 905
 906static int nbd_receive_request(QIOChannel *ioc, NBDRequest *request,
 907                               Error **errp)
 908{
 909    uint8_t buf[NBD_REQUEST_SIZE];
 910    uint32_t magic;
 911    int ret;
 912
 913    ret = nbd_read(ioc, buf, sizeof(buf), errp);
 914    if (ret < 0) {
 915        return ret;
 916    }
 917
 918    /* Request
 919       [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
 920       [ 4 ..  5]   flags   (NBD_CMD_FLAG_FUA, ...)
 921       [ 6 ..  7]   type    (NBD_CMD_READ, ...)
 922       [ 8 .. 15]   handle
 923       [16 .. 23]   from
 924       [24 .. 27]   len
 925     */
 926
 927    magic = ldl_be_p(buf);
 928    request->flags  = lduw_be_p(buf + 4);
 929    request->type   = lduw_be_p(buf + 6);
 930    request->handle = ldq_be_p(buf + 8);
 931    request->from   = ldq_be_p(buf + 16);
 932    request->len    = ldl_be_p(buf + 24);
 933
 934    trace_nbd_receive_request(magic, request->flags, request->type,
 935                              request->from, request->len);
 936
 937    if (magic != NBD_REQUEST_MAGIC) {
 938        error_setg(errp, "invalid magic (got 0x%" PRIx32 ")", magic);
 939        return -EINVAL;
 940    }
 941    return 0;
 942}
 943
 /* Upper bound on a client's nb_requests counter: asserted in
  * nbd_request_get() and checked by nbd_client_receive_next_request()
  * before it schedules another receive coroutine. */
 944#define MAX_NBD_REQUESTS 16
 945
 946void nbd_client_get(NBDClient *client)
 947{
 948    client->refcount++;
 949}
 950
 951void nbd_client_put(NBDClient *client)
 952{
 953    if (--client->refcount == 0) {
 954        /* The last reference should be dropped by client->close,
 955         * which is called by client_close.
 956         */
 957        assert(client->closing);
 958
 959        qio_channel_detach_aio_context(client->ioc);
 960        object_unref(OBJECT(client->sioc));
 961        object_unref(OBJECT(client->ioc));
 962        if (client->tlscreds) {
 963            object_unref(OBJECT(client->tlscreds));
 964        }
 965        g_free(client->tlsaclname);
 966        if (client->exp) {
 967            QTAILQ_REMOVE(&client->exp->clients, client, next);
 968            nbd_export_put(client->exp);
 969        }
 970        g_free(client);
 971    }
 972}
 973
 974static void client_close(NBDClient *client, bool negotiated)
 975{
 976    if (client->closing) {
 977        return;
 978    }
 979
 980    client->closing = true;
 981
 982    /* Force requests to finish.  They will drop their own references,
 983     * then we'll close the socket and free the NBDClient.
 984     */
 985    qio_channel_shutdown(client->ioc, QIO_CHANNEL_SHUTDOWN_BOTH,
 986                         NULL);
 987
 988    /* Also tell the client, so that they release their reference.  */
 989    if (client->close_fn) {
 990        client->close_fn(client, negotiated);
 991    }
 992}
 993
 994static NBDRequestData *nbd_request_get(NBDClient *client)
 995{
 996    NBDRequestData *req;
 997
 998    assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
 999    client->nb_requests++;
1000
1001    req = g_new0(NBDRequestData, 1);
1002    nbd_client_get(client);
1003    req->client = client;
1004    return req;
1005}
1006
1007static void nbd_request_put(NBDRequestData *req)
1008{
1009    NBDClient *client = req->client;
1010
1011    if (req->data) {
1012        qemu_vfree(req->data);
1013    }
1014    g_free(req);
1015
1016    client->nb_requests--;
1017    nbd_client_receive_next_request(client);
1018
1019    nbd_client_put(client);
1020}
1021
1022static void blk_aio_attached(AioContext *ctx, void *opaque)
1023{
1024    NBDExport *exp = opaque;
1025    NBDClient *client;
1026
1027    trace_nbd_blk_aio_attached(exp->name, ctx);
1028
1029    exp->ctx = ctx;
1030
1031    QTAILQ_FOREACH(client, &exp->clients, next) {
1032        qio_channel_attach_aio_context(client->ioc, ctx);
1033        if (client->recv_coroutine) {
1034            aio_co_schedule(ctx, client->recv_coroutine);
1035        }
1036        if (client->send_coroutine) {
1037            aio_co_schedule(ctx, client->send_coroutine);
1038        }
1039    }
1040}
1041
1042static void blk_aio_detach(void *opaque)
1043{
1044    NBDExport *exp = opaque;
1045    NBDClient *client;
1046
1047    trace_nbd_blk_aio_detach(exp->name, exp->ctx);
1048
1049    QTAILQ_FOREACH(client, &exp->clients, next) {
1050        qio_channel_detach_aio_context(client->ioc);
1051    }
1052
1053    exp->ctx = NULL;
1054}
1055
1056static void nbd_eject_notifier(Notifier *n, void *data)
1057{
1058    NBDExport *exp = container_of(n, NBDExport, eject_notifier);
1059    nbd_export_close(exp);
1060}
1061
1062NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset, off_t size,
1063                          uint16_t nbdflags, void (*close)(NBDExport *),
1064                          bool writethrough, BlockBackend *on_eject_blk,
1065                          Error **errp)
1066{
1067    AioContext *ctx;
1068    BlockBackend *blk;
1069    NBDExport *exp = g_new0(NBDExport, 1);
1070    uint64_t perm;
1071    int ret;
1072
1073    /*
1074     * NBD exports are used for non-shared storage migration.  Make sure
1075     * that BDRV_O_INACTIVE is cleared and the image is ready for write
1076     * access since the export could be available before migration handover.
1077     */
1078    ctx = bdrv_get_aio_context(bs);
1079    aio_context_acquire(ctx);
1080    bdrv_invalidate_cache(bs, NULL);
1081    aio_context_release(ctx);
1082
1083    /* Don't allow resize while the NBD server is running, otherwise we don't
1084     * care what happens with the node. */
1085    perm = BLK_PERM_CONSISTENT_READ;
1086    if ((nbdflags & NBD_FLAG_READ_ONLY) == 0) {
1087        perm |= BLK_PERM_WRITE;
1088    }
1089    blk = blk_new(perm, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
1090                        BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
1091    ret = blk_insert_bs(blk, bs, errp);
1092    if (ret < 0) {
1093        goto fail;
1094    }
1095    blk_set_enable_write_cache(blk, !writethrough);
1096
1097    exp->refcount = 1;
1098    QTAILQ_INIT(&exp->clients);
1099    exp->blk = blk;
1100    exp->dev_offset = dev_offset;
1101    exp->nbdflags = nbdflags;
1102    exp->size = size < 0 ? blk_getlength(blk) : size;
1103    if (exp->size < 0) {
1104        error_setg_errno(errp, -exp->size,
1105                         "Failed to determine the NBD export's length");
1106        goto fail;
1107    }
1108    exp->size -= exp->size % BDRV_SECTOR_SIZE;
1109
1110    exp->close = close;
1111    exp->ctx = blk_get_aio_context(blk);
1112    blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);
1113
1114    if (on_eject_blk) {
1115        blk_ref(on_eject_blk);
1116        exp->eject_notifier_blk = on_eject_blk;
1117        exp->eject_notifier.notify = nbd_eject_notifier;
1118        blk_add_remove_bs_notifier(on_eject_blk, &exp->eject_notifier);
1119    }
1120    return exp;
1121
1122fail:
1123    blk_unref(blk);
1124    g_free(exp);
1125    return NULL;
1126}
1127
1128NBDExport *nbd_export_find(const char *name)
1129{
1130    NBDExport *exp;
1131    QTAILQ_FOREACH(exp, &exports, next) {
1132        if (strcmp(name, exp->name) == 0) {
1133            return exp;
1134        }
1135    }
1136
1137    return NULL;
1138}
1139
1140void nbd_export_set_name(NBDExport *exp, const char *name)
1141{
1142    if (exp->name == name) {
1143        return;
1144    }
1145
1146    nbd_export_get(exp);
1147    if (exp->name != NULL) {
1148        g_free(exp->name);
1149        exp->name = NULL;
1150        QTAILQ_REMOVE(&exports, exp, next);
1151        nbd_export_put(exp);
1152    }
1153    if (name != NULL) {
1154        nbd_export_get(exp);
1155        exp->name = g_strdup(name);
1156        QTAILQ_INSERT_TAIL(&exports, exp, next);
1157    }
1158    nbd_export_put(exp);
1159}
1160
1161void nbd_export_set_description(NBDExport *exp, const char *description)
1162{
1163    g_free(exp->description);
1164    exp->description = g_strdup(description);
1165}
1166
1167void nbd_export_close(NBDExport *exp)
1168{
1169    NBDClient *client, *next;
1170
1171    nbd_export_get(exp);
1172    QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
1173        client_close(client, true);
1174    }
1175    nbd_export_set_name(exp, NULL);
1176    nbd_export_set_description(exp, NULL);
1177    nbd_export_put(exp);
1178}
1179
1180void nbd_export_get(NBDExport *exp)
1181{
1182    assert(exp->refcount > 0);
1183    exp->refcount++;
1184}
1185
1186void nbd_export_put(NBDExport *exp)
1187{
1188    assert(exp->refcount > 0);
1189    if (exp->refcount == 1) {
1190        nbd_export_close(exp);
1191    }
1192
1193    if (--exp->refcount == 0) {
1194        assert(exp->name == NULL);
1195        assert(exp->description == NULL);
1196
1197        if (exp->close) {
1198            exp->close(exp);
1199        }
1200
1201        if (exp->blk) {
1202            if (exp->eject_notifier_blk) {
1203                notifier_remove(&exp->eject_notifier);
1204                blk_unref(exp->eject_notifier_blk);
1205            }
1206            blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
1207                                            blk_aio_detach, exp);
1208            blk_unref(exp->blk);
1209            exp->blk = NULL;
1210        }
1211
1212        g_free(exp);
1213    }
1214}
1215
1216BlockBackend *nbd_export_get_blockdev(NBDExport *exp)
1217{
1218    return exp->blk;
1219}
1220
1221void nbd_export_close_all(void)
1222{
1223    NBDExport *exp, *next;
1224
1225    QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
1226        nbd_export_close(exp);
1227    }
1228}
1229
1230static int coroutine_fn nbd_co_send_iov(NBDClient *client, struct iovec *iov,
1231                                        unsigned niov, Error **errp)
1232{
1233    int ret;
1234
1235    g_assert(qemu_in_coroutine());
1236    qemu_co_mutex_lock(&client->send_lock);
1237    client->send_coroutine = qemu_coroutine_self();
1238
1239    ret = qio_channel_writev_all(client->ioc, iov, niov, errp) < 0 ? -EIO : 0;
1240
1241    client->send_coroutine = NULL;
1242    qemu_co_mutex_unlock(&client->send_lock);
1243
1244    return ret;
1245}
1246
1247static inline void set_be_simple_reply(NBDSimpleReply *reply, uint64_t error,
1248                                       uint64_t handle)
1249{
1250    stl_be_p(&reply->magic, NBD_SIMPLE_REPLY_MAGIC);
1251    stl_be_p(&reply->error, error);
1252    stq_be_p(&reply->handle, handle);
1253}
1254
1255static int nbd_co_send_simple_reply(NBDClient *client,
1256                                    uint64_t handle,
1257                                    uint32_t error,
1258                                    void *data,
1259                                    size_t len,
1260                                    Error **errp)
1261{
1262    NBDSimpleReply reply;
1263    int nbd_err = system_errno_to_nbd_errno(error);
1264    struct iovec iov[] = {
1265        {.iov_base = &reply, .iov_len = sizeof(reply)},
1266        {.iov_base = data, .iov_len = len}
1267    };
1268
1269    trace_nbd_co_send_simple_reply(handle, nbd_err, nbd_err_lookup(nbd_err),
1270                                   len);
1271    set_be_simple_reply(&reply, nbd_err, handle);
1272
1273    return nbd_co_send_iov(client, iov, len ? 2 : 1, errp);
1274}
1275
1276static inline void set_be_chunk(NBDStructuredReplyChunk *chunk, uint16_t flags,
1277                                uint16_t type, uint64_t handle, uint32_t length)
1278{
1279    stl_be_p(&chunk->magic, NBD_STRUCTURED_REPLY_MAGIC);
1280    stw_be_p(&chunk->flags, flags);
1281    stw_be_p(&chunk->type, type);
1282    stq_be_p(&chunk->handle, handle);
1283    stl_be_p(&chunk->length, length);
1284}
1285
1286static int coroutine_fn nbd_co_send_structured_done(NBDClient *client,
1287                                                    uint64_t handle,
1288                                                    Error **errp)
1289{
1290    NBDStructuredReplyChunk chunk;
1291    struct iovec iov[] = {
1292        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
1293    };
1294
1295    trace_nbd_co_send_structured_done(handle);
1296    set_be_chunk(&chunk, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_NONE, handle, 0);
1297
1298    return nbd_co_send_iov(client, iov, 1, errp);
1299}
1300
1301static int coroutine_fn nbd_co_send_structured_read(NBDClient *client,
1302                                                    uint64_t handle,
1303                                                    uint64_t offset,
1304                                                    void *data,
1305                                                    size_t size,
1306                                                    Error **errp)
1307{
1308    NBDStructuredReadData chunk;
1309    struct iovec iov[] = {
1310        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
1311        {.iov_base = data, .iov_len = size}
1312    };
1313
1314    assert(size);
1315    trace_nbd_co_send_structured_read(handle, offset, data, size);
1316    set_be_chunk(&chunk.h, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_OFFSET_DATA,
1317                 handle, sizeof(chunk) - sizeof(chunk.h) + size);
1318    stq_be_p(&chunk.offset, offset);
1319
1320    return nbd_co_send_iov(client, iov, 2, errp);
1321}
1322
1323static int coroutine_fn nbd_co_send_structured_error(NBDClient *client,
1324                                                     uint64_t handle,
1325                                                     uint32_t error,
1326                                                     const char *msg,
1327                                                     Error **errp)
1328{
1329    NBDStructuredError chunk;
1330    int nbd_err = system_errno_to_nbd_errno(error);
1331    struct iovec iov[] = {
1332        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
1333        {.iov_base = (char *)msg, .iov_len = msg ? strlen(msg) : 0},
1334    };
1335
1336    assert(nbd_err);
1337    trace_nbd_co_send_structured_error(handle, nbd_err,
1338                                       nbd_err_lookup(nbd_err), msg ? msg : "");
1339    set_be_chunk(&chunk.h, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_ERROR, handle,
1340                 sizeof(chunk) - sizeof(chunk.h) + iov[1].iov_len);
1341    stl_be_p(&chunk.error, nbd_err);
1342    stw_be_p(&chunk.message_length, iov[1].iov_len);
1343
1344    return nbd_co_send_iov(client, iov, 1 + !!iov[1].iov_len, errp);
1345}
1346
1347/* nbd_co_receive_request
1348 * Collect a client request. Return 0 if request looks valid, -EIO to drop
1349 * connection right away, and any other negative value to report an error to
1350 * the client (although the caller may still need to disconnect after reporting
1351 * the error).
1352 */
1353static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
1354                                  Error **errp)
1355{
1356    NBDClient *client = req->client;
1357    int valid_flags;
1358
1359    g_assert(qemu_in_coroutine());
1360    assert(client->recv_coroutine == qemu_coroutine_self());
1361    if (nbd_receive_request(client->ioc, request, errp) < 0) {
1362        return -EIO;
1363    }
1364
1365    trace_nbd_co_receive_request_decode_type(request->handle, request->type,
1366                                             nbd_cmd_lookup(request->type));
1367
1368    if (request->type != NBD_CMD_WRITE) {
1369        /* No payload, we are ready to read the next request.  */
1370        req->complete = true;
1371    }
1372
1373    if (request->type == NBD_CMD_DISC) {
1374        /* Special case: we're going to disconnect without a reply,
1375         * whether or not flags, from, or len are bogus */
1376        return -EIO;
1377    }
1378
1379    if (request->type == NBD_CMD_READ || request->type == NBD_CMD_WRITE) {
1380        if (request->len > NBD_MAX_BUFFER_SIZE) {
1381            error_setg(errp, "len (%" PRIu32" ) is larger than max len (%u)",
1382                       request->len, NBD_MAX_BUFFER_SIZE);
1383            return -EINVAL;
1384        }
1385
1386        req->data = blk_try_blockalign(client->exp->blk, request->len);
1387        if (req->data == NULL) {
1388            error_setg(errp, "No memory");
1389            return -ENOMEM;
1390        }
1391    }
1392    if (request->type == NBD_CMD_WRITE) {
1393        if (nbd_read(client->ioc, req->data, request->len, errp) < 0) {
1394            error_prepend(errp, "reading from socket failed: ");
1395            return -EIO;
1396        }
1397        req->complete = true;
1398
1399        trace_nbd_co_receive_request_payload_received(request->handle,
1400                                                      request->len);
1401    }
1402
1403    /* Sanity checks. */
1404    if (client->exp->nbdflags & NBD_FLAG_READ_ONLY &&
1405        (request->type == NBD_CMD_WRITE ||
1406         request->type == NBD_CMD_WRITE_ZEROES ||
1407         request->type == NBD_CMD_TRIM)) {
1408        error_setg(errp, "Export is read-only");
1409        return -EROFS;
1410    }
1411    if (request->from > client->exp->size ||
1412        request->from + request->len > client->exp->size) {
1413        error_setg(errp, "operation past EOF; From: %" PRIu64 ", Len: %" PRIu32
1414                   ", Size: %" PRIu64, request->from, request->len,
1415                   (uint64_t)client->exp->size);
1416        return (request->type == NBD_CMD_WRITE ||
1417                request->type == NBD_CMD_WRITE_ZEROES) ? -ENOSPC : -EINVAL;
1418    }
1419    valid_flags = NBD_CMD_FLAG_FUA;
1420    if (request->type == NBD_CMD_READ && client->structured_reply) {
1421        valid_flags |= NBD_CMD_FLAG_DF;
1422    } else if (request->type == NBD_CMD_WRITE_ZEROES) {
1423        valid_flags |= NBD_CMD_FLAG_NO_HOLE;
1424    }
1425    if (request->flags & ~valid_flags) {
1426        error_setg(errp, "unsupported flags for command %s (got 0x%x)",
1427                   nbd_cmd_lookup(request->type), request->flags);
1428        return -EINVAL;
1429    }
1430
1431    return 0;
1432}
1433
1434/* Owns a reference to the NBDClient passed as opaque.  */
1435static coroutine_fn void nbd_trip(void *opaque)
1436{
1437    NBDClient *client = opaque;
1438    NBDExport *exp = client->exp;
1439    NBDRequestData *req;
1440    NBDRequest request = { 0 };    /* GCC thinks it can be used uninitialized */
1441    int ret;
1442    int flags;
1443    int reply_data_len = 0;
1444    Error *local_err = NULL;
1445    char *msg = NULL;
1446
1447    trace_nbd_trip();
1448    if (client->closing) {
1449        nbd_client_put(client);
1450        return;
1451    }
1452
1453    req = nbd_request_get(client);
1454    ret = nbd_co_receive_request(req, &request, &local_err);
1455    client->recv_coroutine = NULL;
1456    nbd_client_receive_next_request(client);
1457    if (ret == -EIO) {
1458        goto disconnect;
1459    }
1460
1461    if (ret < 0) {
1462        goto reply;
1463    }
1464
1465    if (client->closing) {
1466        /*
1467         * The client may be closed when we are blocked in
1468         * nbd_co_receive_request()
1469         */
1470        goto done;
1471    }
1472
1473    switch (request.type) {
1474    case NBD_CMD_READ:
1475        /* XXX: NBD Protocol only documents use of FUA with WRITE */
1476        if (request.flags & NBD_CMD_FLAG_FUA) {
1477            ret = blk_co_flush(exp->blk);
1478            if (ret < 0) {
1479                error_setg_errno(&local_err, -ret, "flush failed");
1480                break;
1481            }
1482        }
1483
1484        ret = blk_pread(exp->blk, request.from + exp->dev_offset,
1485                        req->data, request.len);
1486        if (ret < 0) {
1487            error_setg_errno(&local_err, -ret, "reading from file failed");
1488            break;
1489        }
1490
1491        reply_data_len = request.len;
1492
1493        break;
1494    case NBD_CMD_WRITE:
1495        flags = 0;
1496        if (request.flags & NBD_CMD_FLAG_FUA) {
1497            flags |= BDRV_REQ_FUA;
1498        }
1499        ret = blk_pwrite(exp->blk, request.from + exp->dev_offset,
1500                         req->data, request.len, flags);
1501        if (ret < 0) {
1502            error_setg_errno(&local_err, -ret, "writing to file failed");
1503        }
1504
1505        break;
1506    case NBD_CMD_WRITE_ZEROES:
1507        flags = 0;
1508        if (request.flags & NBD_CMD_FLAG_FUA) {
1509            flags |= BDRV_REQ_FUA;
1510        }
1511        if (!(request.flags & NBD_CMD_FLAG_NO_HOLE)) {
1512            flags |= BDRV_REQ_MAY_UNMAP;
1513        }
1514        ret = blk_pwrite_zeroes(exp->blk, request.from + exp->dev_offset,
1515                                request.len, flags);
1516        if (ret < 0) {
1517            error_setg_errno(&local_err, -ret, "writing to file failed");
1518        }
1519
1520        break;
1521    case NBD_CMD_DISC:
1522        /* unreachable, thanks to special case in nbd_co_receive_request() */
1523        abort();
1524
1525    case NBD_CMD_FLUSH:
1526        ret = blk_co_flush(exp->blk);
1527        if (ret < 0) {
1528            error_setg_errno(&local_err, -ret, "flush failed");
1529        }
1530
1531        break;
1532    case NBD_CMD_TRIM:
1533        ret = blk_co_pdiscard(exp->blk, request.from + exp->dev_offset,
1534                              request.len);
1535        if (ret < 0) {
1536            error_setg_errno(&local_err, -ret, "discard failed");
1537        }
1538
1539        break;
1540    default:
1541        error_setg(&local_err, "invalid request type (%" PRIu32 ") received",
1542                   request.type);
1543        ret = -EINVAL;
1544    }
1545
1546reply:
1547    if (local_err) {
1548        /* If we get here, local_err was not a fatal error, and should be sent
1549         * to the client. */
1550        assert(ret < 0);
1551        msg = g_strdup(error_get_pretty(local_err));
1552        error_report_err(local_err);
1553        local_err = NULL;
1554    }
1555
1556    if (client->structured_reply &&
1557        (ret < 0 || request.type == NBD_CMD_READ)) {
1558        if (ret < 0) {
1559            ret = nbd_co_send_structured_error(req->client, request.handle,
1560                                               -ret, msg, &local_err);
1561        } else if (reply_data_len) {
1562            ret = nbd_co_send_structured_read(req->client, request.handle,
1563                                              request.from, req->data,
1564                                              reply_data_len, &local_err);
1565        } else {
1566            ret = nbd_co_send_structured_done(req->client, request.handle,
1567                                              &local_err);
1568        }
1569    } else {
1570        ret = nbd_co_send_simple_reply(req->client, request.handle,
1571                                       ret < 0 ? -ret : 0,
1572                                       req->data, reply_data_len, &local_err);
1573    }
1574    g_free(msg);
1575    if (ret < 0) {
1576        error_prepend(&local_err, "Failed to send reply: ");
1577        goto disconnect;
1578    }
1579
1580    /* We must disconnect after NBD_CMD_WRITE if we did not
1581     * read the payload.
1582     */
1583    if (!req->complete) {
1584        error_setg(&local_err, "Request handling failed in intermediate state");
1585        goto disconnect;
1586    }
1587
1588done:
1589    nbd_request_put(req);
1590    nbd_client_put(client);
1591    return;
1592
1593disconnect:
1594    if (local_err) {
1595        error_reportf_err(local_err, "Disconnect client, due to: ");
1596    }
1597    nbd_request_put(req);
1598    client_close(client, true);
1599    nbd_client_put(client);
1600}
1601
1602static void nbd_client_receive_next_request(NBDClient *client)
1603{
1604    if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS) {
1605        nbd_client_get(client);
1606        client->recv_coroutine = qemu_coroutine_create(nbd_trip, client);
1607        aio_co_schedule(client->exp->ctx, client->recv_coroutine);
1608    }
1609}
1610
1611static coroutine_fn void nbd_co_client_start(void *opaque)
1612{
1613    NBDClient *client = opaque;
1614    NBDExport *exp = client->exp;
1615    Error *local_err = NULL;
1616
1617    if (exp) {
1618        nbd_export_get(exp);
1619        QTAILQ_INSERT_TAIL(&exp->clients, client, next);
1620    }
1621    qemu_co_mutex_init(&client->send_lock);
1622
1623    if (nbd_negotiate(client, &local_err)) {
1624        if (local_err) {
1625            error_report_err(local_err);
1626        }
1627        client_close(client, false);
1628        return;
1629    }
1630
1631    nbd_client_receive_next_request(client);
1632}
1633
1634/*
1635 * Create a new client listener on the given export @exp, using the
1636 * given channel @sioc.  Begin servicing it in a coroutine.  When the
1637 * connection closes, call @close_fn with an indication of whether the
1638 * client completed negotiation.
1639 */
1640void nbd_client_new(NBDExport *exp,
1641                    QIOChannelSocket *sioc,
1642                    QCryptoTLSCreds *tlscreds,
1643                    const char *tlsaclname,
1644                    void (*close_fn)(NBDClient *, bool))
1645{
1646    NBDClient *client;
1647    Coroutine *co;
1648
1649    client = g_new0(NBDClient, 1);
1650    client->refcount = 1;
1651    client->exp = exp;
1652    client->tlscreds = tlscreds;
1653    if (tlscreds) {
1654        object_ref(OBJECT(client->tlscreds));
1655    }
1656    client->tlsaclname = g_strdup(tlsaclname);
1657    client->sioc = sioc;
1658    object_ref(OBJECT(client->sioc));
1659    client->ioc = QIO_CHANNEL(sioc);
1660    object_ref(OBJECT(client->ioc));
1661    client->close_fn = close_fn;
1662
1663    co = qemu_coroutine_create(nbd_co_client_start, client);
1664    qemu_coroutine_enter(co);
1665}
1666