qemu/nbd/server.c
<<
>>
Prefs
   1/*
   2 *  Copyright (C) 2005  Anthony Liguori <anthony@codemonkey.ws>
   3 *
   4 *  Network Block Device Server Side
   5 *
   6 *  This program is free software; you can redistribute it and/or modify
   7 *  it under the terms of the GNU General Public License as published by
   8 *  the Free Software Foundation; under version 2 of the License.
   9 *
  10 *  This program is distributed in the hope that it will be useful,
  11 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 *  GNU General Public License for more details.
  14 *
  15 *  You should have received a copy of the GNU General Public License
  16 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
  17 */
  18
  19#include "qemu/osdep.h"
  20#include "qapi/error.h"
  21#include "nbd-internal.h"
  22
  23static int system_errno_to_nbd_errno(int err)
  24{
  25    switch (err) {
  26    case 0:
  27        return NBD_SUCCESS;
  28    case EPERM:
  29    case EROFS:
  30        return NBD_EPERM;
  31    case EIO:
  32        return NBD_EIO;
  33    case ENOMEM:
  34        return NBD_ENOMEM;
  35#ifdef EDQUOT
  36    case EDQUOT:
  37#endif
  38    case EFBIG:
  39    case ENOSPC:
  40        return NBD_ENOSPC;
  41    case EINVAL:
  42    default:
  43        return NBD_EINVAL;
  44    }
  45}
  46
  47/* Definitions for opaque data types */
  48
  49typedef struct NBDRequest NBDRequest;
  50
  51struct NBDRequest {
  52    QSIMPLEQ_ENTRY(NBDRequest) entry;
  53    NBDClient *client;
  54    uint8_t *data;
  55};
  56
  57struct NBDExport {
  58    int refcount;
  59    void (*close)(NBDExport *exp);
  60
  61    BlockBackend *blk;
  62    char *name;
  63    off_t dev_offset;
  64    off_t size;
  65    uint32_t nbdflags;
  66    QTAILQ_HEAD(, NBDClient) clients;
  67    QTAILQ_ENTRY(NBDExport) next;
  68
  69    AioContext *ctx;
  70
  71    Notifier eject_notifier;
  72};
  73
  74static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
  75
  76struct NBDClient {
  77    int refcount;
  78    void (*close)(NBDClient *client);
  79
  80    NBDExport *exp;
  81    QCryptoTLSCreds *tlscreds;
  82    char *tlsaclname;
  83    QIOChannelSocket *sioc; /* The underlying data channel */
  84    QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */
  85
  86    Coroutine *recv_coroutine;
  87
  88    CoMutex send_lock;
  89    Coroutine *send_coroutine;
  90
  91    bool can_read;
  92
  93    QTAILQ_ENTRY(NBDClient) next;
  94    int nb_requests;
  95    bool closing;
  96};
  97
  98/* That's all folks */
  99
 100static void nbd_set_handlers(NBDClient *client);
 101static void nbd_unset_handlers(NBDClient *client);
 102static void nbd_update_can_read(NBDClient *client);
 103
 104static gboolean nbd_negotiate_continue(QIOChannel *ioc,
 105                                       GIOCondition condition,
 106                                       void *opaque)
 107{
 108    qemu_coroutine_enter(opaque, NULL);
 109    return TRUE;
 110}
 111
 112static ssize_t nbd_negotiate_read(QIOChannel *ioc, void *buffer, size_t size)
 113{
 114    ssize_t ret;
 115    guint watch;
 116
 117    assert(qemu_in_coroutine());
 118    /* Negotiation are always in main loop. */
 119    watch = qio_channel_add_watch(ioc,
 120                                  G_IO_IN,
 121                                  nbd_negotiate_continue,
 122                                  qemu_coroutine_self(),
 123                                  NULL);
 124    ret = read_sync(ioc, buffer, size);
 125    g_source_remove(watch);
 126    return ret;
 127
 128}
 129
 130static ssize_t nbd_negotiate_write(QIOChannel *ioc, void *buffer, size_t size)
 131{
 132    ssize_t ret;
 133    guint watch;
 134
 135    assert(qemu_in_coroutine());
 136    /* Negotiation are always in main loop. */
 137    watch = qio_channel_add_watch(ioc,
 138                                  G_IO_OUT,
 139                                  nbd_negotiate_continue,
 140                                  qemu_coroutine_self(),
 141                                  NULL);
 142    ret = write_sync(ioc, buffer, size);
 143    g_source_remove(watch);
 144    return ret;
 145}
 146
 147static ssize_t nbd_negotiate_drop_sync(QIOChannel *ioc, size_t size)
 148{
 149    ssize_t ret, dropped = size;
 150    uint8_t *buffer = g_malloc(MIN(65536, size));
 151
 152    while (size > 0) {
 153        ret = nbd_negotiate_read(ioc, buffer, MIN(65536, size));
 154        if (ret < 0) {
 155            g_free(buffer);
 156            return ret;
 157        }
 158
 159        assert(ret <= size);
 160        size -= ret;
 161    }
 162
 163    g_free(buffer);
 164    return dropped;
 165}
 166
 167/* Basic flow for negotiation
 168
 169   Server         Client
 170   Negotiate
 171
 172   or
 173
 174   Server         Client
 175   Negotiate #1
 176                  Option
 177   Negotiate #2
 178
 179   ----
 180
 181   followed by
 182
 183   Server         Client
 184                  Request
 185   Response
 186                  Request
 187   Response
 188                  ...
 189   ...
 190                  Request (type == 2)
 191
 192*/
 193
 194static int nbd_negotiate_send_rep(QIOChannel *ioc, uint32_t type, uint32_t opt)
 195{
 196    uint64_t magic;
 197    uint32_t len;
 198
 199    TRACE("Reply opt=%x type=%x", type, opt);
 200
 201    magic = cpu_to_be64(NBD_REP_MAGIC);
 202    if (nbd_negotiate_write(ioc, &magic, sizeof(magic)) != sizeof(magic)) {
 203        LOG("write failed (rep magic)");
 204        return -EINVAL;
 205    }
 206    opt = cpu_to_be32(opt);
 207    if (nbd_negotiate_write(ioc, &opt, sizeof(opt)) != sizeof(opt)) {
 208        LOG("write failed (rep opt)");
 209        return -EINVAL;
 210    }
 211    type = cpu_to_be32(type);
 212    if (nbd_negotiate_write(ioc, &type, sizeof(type)) != sizeof(type)) {
 213        LOG("write failed (rep type)");
 214        return -EINVAL;
 215    }
 216    len = cpu_to_be32(0);
 217    if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) {
 218        LOG("write failed (rep data length)");
 219        return -EINVAL;
 220    }
 221    return 0;
 222}
 223
 224static int nbd_negotiate_send_rep_list(QIOChannel *ioc, NBDExport *exp)
 225{
 226    uint64_t magic, name_len;
 227    uint32_t opt, type, len;
 228
 229    TRACE("Advertizing export name '%s'", exp->name ? exp->name : "");
 230    name_len = strlen(exp->name);
 231    magic = cpu_to_be64(NBD_REP_MAGIC);
 232    if (nbd_negotiate_write(ioc, &magic, sizeof(magic)) != sizeof(magic)) {
 233        LOG("write failed (magic)");
 234        return -EINVAL;
 235     }
 236    opt = cpu_to_be32(NBD_OPT_LIST);
 237    if (nbd_negotiate_write(ioc, &opt, sizeof(opt)) != sizeof(opt)) {
 238        LOG("write failed (opt)");
 239        return -EINVAL;
 240    }
 241    type = cpu_to_be32(NBD_REP_SERVER);
 242    if (nbd_negotiate_write(ioc, &type, sizeof(type)) != sizeof(type)) {
 243        LOG("write failed (reply type)");
 244        return -EINVAL;
 245    }
 246    len = cpu_to_be32(name_len + sizeof(len));
 247    if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) {
 248        LOG("write failed (length)");
 249        return -EINVAL;
 250    }
 251    len = cpu_to_be32(name_len);
 252    if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) {
 253        LOG("write failed (length)");
 254        return -EINVAL;
 255    }
 256    if (nbd_negotiate_write(ioc, exp->name, name_len) != name_len) {
 257        LOG("write failed (buffer)");
 258        return -EINVAL;
 259    }
 260    return 0;
 261}
 262
 263static int nbd_negotiate_handle_list(NBDClient *client, uint32_t length)
 264{
 265    NBDExport *exp;
 266
 267    if (length) {
 268        if (nbd_negotiate_drop_sync(client->ioc, length) != length) {
 269            return -EIO;
 270        }
 271        return nbd_negotiate_send_rep(client->ioc,
 272                                      NBD_REP_ERR_INVALID, NBD_OPT_LIST);
 273    }
 274
 275    /* For each export, send a NBD_REP_SERVER reply. */
 276    QTAILQ_FOREACH(exp, &exports, next) {
 277        if (nbd_negotiate_send_rep_list(client->ioc, exp)) {
 278            return -EINVAL;
 279        }
 280    }
 281    /* Finish with a NBD_REP_ACK. */
 282    return nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK, NBD_OPT_LIST);
 283}
 284
 285static int nbd_negotiate_handle_export_name(NBDClient *client, uint32_t length)
 286{
 287    int rc = -EINVAL;
 288    char name[256];
 289
 290    /* Client sends:
 291        [20 ..  xx]   export name (length bytes)
 292     */
 293    TRACE("Checking length");
 294    if (length > 255) {
 295        LOG("Bad length received");
 296        goto fail;
 297    }
 298    if (nbd_negotiate_read(client->ioc, name, length) != length) {
 299        LOG("read failed");
 300        goto fail;
 301    }
 302    name[length] = '\0';
 303
 304    TRACE("Client requested export '%s'", name);
 305
 306    client->exp = nbd_export_find(name);
 307    if (!client->exp) {
 308        LOG("export not found");
 309        goto fail;
 310    }
 311
 312    QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
 313    nbd_export_get(client->exp);
 314    rc = 0;
 315fail:
 316    return rc;
 317}
 318
 319
 320static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
 321                                                 uint32_t length)
 322{
 323    QIOChannel *ioc;
 324    QIOChannelTLS *tioc;
 325    struct NBDTLSHandshakeData data = { 0 };
 326
 327    TRACE("Setting up TLS");
 328    ioc = client->ioc;
 329    if (length) {
 330        if (nbd_negotiate_drop_sync(ioc, length) != length) {
 331            return NULL;
 332        }
 333        nbd_negotiate_send_rep(ioc, NBD_REP_ERR_INVALID, NBD_OPT_STARTTLS);
 334        return NULL;
 335    }
 336
 337    nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK, NBD_OPT_STARTTLS);
 338
 339    tioc = qio_channel_tls_new_server(ioc,
 340                                      client->tlscreds,
 341                                      client->tlsaclname,
 342                                      NULL);
 343    if (!tioc) {
 344        return NULL;
 345    }
 346
 347    TRACE("Starting TLS handshake");
 348    data.loop = g_main_loop_new(g_main_context_default(), FALSE);
 349    qio_channel_tls_handshake(tioc,
 350                              nbd_tls_handshake,
 351                              &data,
 352                              NULL);
 353
 354    if (!data.complete) {
 355        g_main_loop_run(data.loop);
 356    }
 357    g_main_loop_unref(data.loop);
 358    if (data.error) {
 359        object_unref(OBJECT(tioc));
 360        error_free(data.error);
 361        return NULL;
 362    }
 363
 364    return QIO_CHANNEL(tioc);
 365}
 366
 367
 368static int nbd_negotiate_options(NBDClient *client)
 369{
 370    uint32_t flags;
 371    bool fixedNewstyle = false;
 372
 373    /* Client sends:
 374        [ 0 ..   3]   client flags
 375
 376        [ 0 ..   7]   NBD_OPTS_MAGIC
 377        [ 8 ..  11]   NBD option
 378        [12 ..  15]   Data length
 379        ...           Rest of request
 380
 381        [ 0 ..   7]   NBD_OPTS_MAGIC
 382        [ 8 ..  11]   Second NBD option
 383        [12 ..  15]   Data length
 384        ...           Rest of request
 385    */
 386
 387    if (nbd_negotiate_read(client->ioc, &flags, sizeof(flags)) !=
 388        sizeof(flags)) {
 389        LOG("read failed");
 390        return -EIO;
 391    }
 392    TRACE("Checking client flags");
 393    be32_to_cpus(&flags);
 394    if (flags & NBD_FLAG_C_FIXED_NEWSTYLE) {
 395        TRACE("Support supports fixed newstyle handshake");
 396        fixedNewstyle = true;
 397        flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE;
 398    }
 399    if (flags != 0) {
 400        TRACE("Unknown client flags 0x%x received", flags);
 401        return -EIO;
 402    }
 403
 404    while (1) {
 405        int ret;
 406        uint32_t clientflags, length;
 407        uint64_t magic;
 408
 409        if (nbd_negotiate_read(client->ioc, &magic, sizeof(magic)) !=
 410            sizeof(magic)) {
 411            LOG("read failed");
 412            return -EINVAL;
 413        }
 414        TRACE("Checking opts magic");
 415        if (magic != be64_to_cpu(NBD_OPTS_MAGIC)) {
 416            LOG("Bad magic received");
 417            return -EINVAL;
 418        }
 419
 420        if (nbd_negotiate_read(client->ioc, &clientflags,
 421                               sizeof(clientflags)) != sizeof(clientflags)) {
 422            LOG("read failed");
 423            return -EINVAL;
 424        }
 425        clientflags = be32_to_cpu(clientflags);
 426
 427        if (nbd_negotiate_read(client->ioc, &length, sizeof(length)) !=
 428            sizeof(length)) {
 429            LOG("read failed");
 430            return -EINVAL;
 431        }
 432        length = be32_to_cpu(length);
 433
 434        TRACE("Checking option 0x%x", clientflags);
 435        if (client->tlscreds &&
 436            client->ioc == (QIOChannel *)client->sioc) {
 437            QIOChannel *tioc;
 438            if (!fixedNewstyle) {
 439                TRACE("Unsupported option 0x%x", clientflags);
 440                return -EINVAL;
 441            }
 442            switch (clientflags) {
 443            case NBD_OPT_STARTTLS:
 444                tioc = nbd_negotiate_handle_starttls(client, length);
 445                if (!tioc) {
 446                    return -EIO;
 447                }
 448                object_unref(OBJECT(client->ioc));
 449                client->ioc = QIO_CHANNEL(tioc);
 450                break;
 451
 452            case NBD_OPT_EXPORT_NAME:
 453                /* No way to return an error to client, so drop connection */
 454                TRACE("Option 0x%x not permitted before TLS", clientflags);
 455                return -EINVAL;
 456
 457            default:
 458                TRACE("Option 0x%x not permitted before TLS", clientflags);
 459                if (nbd_negotiate_drop_sync(client->ioc, length) != length) {
 460                    return -EIO;
 461                }
 462                nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_TLS_REQD,
 463                                       clientflags);
 464                break;
 465            }
 466        } else if (fixedNewstyle) {
 467            switch (clientflags) {
 468            case NBD_OPT_LIST:
 469                ret = nbd_negotiate_handle_list(client, length);
 470                if (ret < 0) {
 471                    return ret;
 472                }
 473                break;
 474
 475            case NBD_OPT_ABORT:
 476                return -EINVAL;
 477
 478            case NBD_OPT_EXPORT_NAME:
 479                return nbd_negotiate_handle_export_name(client, length);
 480
 481            case NBD_OPT_STARTTLS:
 482                if (nbd_negotiate_drop_sync(client->ioc, length) != length) {
 483                    return -EIO;
 484                }
 485                if (client->tlscreds) {
 486                    TRACE("TLS already enabled");
 487                    nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_INVALID,
 488                                           clientflags);
 489                } else {
 490                    TRACE("TLS not configured");
 491                    nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_POLICY,
 492                                           clientflags);
 493                }
 494                break;
 495            default:
 496                TRACE("Unsupported option 0x%x", clientflags);
 497                if (nbd_negotiate_drop_sync(client->ioc, length) != length) {
 498                    return -EIO;
 499                }
 500                nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_UNSUP,
 501                                       clientflags);
 502                break;
 503            }
 504        } else {
 505            /*
 506             * If broken new-style we should drop the connection
 507             * for anything except NBD_OPT_EXPORT_NAME
 508             */
 509            switch (clientflags) {
 510            case NBD_OPT_EXPORT_NAME:
 511                return nbd_negotiate_handle_export_name(client, length);
 512
 513            default:
 514                TRACE("Unsupported option 0x%x", clientflags);
 515                return -EINVAL;
 516            }
 517        }
 518    }
 519}
 520
 521typedef struct {
 522    NBDClient *client;
 523    Coroutine *co;
 524} NBDClientNewData;
 525
 526static coroutine_fn int nbd_negotiate(NBDClientNewData *data)
 527{
 528    NBDClient *client = data->client;
 529    char buf[8 + 8 + 8 + 128];
 530    int rc;
 531    const int myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
 532                         NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);
 533    bool oldStyle;
 534
 535    /* Old style negotiation header without options
 536        [ 0 ..   7]   passwd       ("NBDMAGIC")
 537        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
 538        [16 ..  23]   size
 539        [24 ..  25]   server flags (0)
 540        [26 ..  27]   export flags
 541        [28 .. 151]   reserved     (0)
 542
 543       New style negotiation header with options
 544        [ 0 ..   7]   passwd       ("NBDMAGIC")
 545        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
 546        [16 ..  17]   server flags (0)
 547        ....options sent....
 548        [18 ..  25]   size
 549        [26 ..  27]   export flags
 550        [28 .. 151]   reserved     (0)
 551     */
 552
 553    qio_channel_set_blocking(client->ioc, false, NULL);
 554    rc = -EINVAL;
 555
 556    TRACE("Beginning negotiation.");
 557    memset(buf, 0, sizeof(buf));
 558    memcpy(buf, "NBDMAGIC", 8);
 559
 560    oldStyle = client->exp != NULL && !client->tlscreds;
 561    if (oldStyle) {
 562        assert ((client->exp->nbdflags & ~65535) == 0);
 563        stq_be_p(buf + 8, NBD_CLIENT_MAGIC);
 564        stq_be_p(buf + 16, client->exp->size);
 565        stw_be_p(buf + 26, client->exp->nbdflags | myflags);
 566    } else {
 567        stq_be_p(buf + 8, NBD_OPTS_MAGIC);
 568        stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE);
 569    }
 570
 571    if (oldStyle) {
 572        if (client->tlscreds) {
 573            TRACE("TLS cannot be enabled with oldstyle protocol");
 574            goto fail;
 575        }
 576        if (nbd_negotiate_write(client->ioc, buf, sizeof(buf)) != sizeof(buf)) {
 577            LOG("write failed");
 578            goto fail;
 579        }
 580    } else {
 581        if (nbd_negotiate_write(client->ioc, buf, 18) != 18) {
 582            LOG("write failed");
 583            goto fail;
 584        }
 585        rc = nbd_negotiate_options(client);
 586        if (rc != 0) {
 587            LOG("option negotiation failed");
 588            goto fail;
 589        }
 590
 591        assert ((client->exp->nbdflags & ~65535) == 0);
 592        stq_be_p(buf + 18, client->exp->size);
 593        stw_be_p(buf + 26, client->exp->nbdflags | myflags);
 594        if (nbd_negotiate_write(client->ioc, buf + 18, sizeof(buf) - 18) !=
 595            sizeof(buf) - 18) {
 596            LOG("write failed");
 597            goto fail;
 598        }
 599    }
 600
 601    TRACE("Negotiation succeeded.");
 602    rc = 0;
 603fail:
 604    return rc;
 605}
 606
 607#ifdef __linux__
 608
 609int nbd_disconnect(int fd)
 610{
 611    ioctl(fd, NBD_CLEAR_QUE);
 612    ioctl(fd, NBD_DISCONNECT);
 613    ioctl(fd, NBD_CLEAR_SOCK);
 614    return 0;
 615}
 616
 617#else
 618
 619int nbd_disconnect(int fd)
 620{
 621    return -ENOTSUP;
 622}
 623#endif
 624
 625static ssize_t nbd_receive_request(QIOChannel *ioc, struct nbd_request *request)
 626{
 627    uint8_t buf[NBD_REQUEST_SIZE];
 628    uint32_t magic;
 629    ssize_t ret;
 630
 631    ret = read_sync(ioc, buf, sizeof(buf));
 632    if (ret < 0) {
 633        return ret;
 634    }
 635
 636    if (ret != sizeof(buf)) {
 637        LOG("read failed");
 638        return -EINVAL;
 639    }
 640
 641    /* Request
 642       [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
 643       [ 4 ..  7]   type    (0 == READ, 1 == WRITE)
 644       [ 8 .. 15]   handle
 645       [16 .. 23]   from
 646       [24 .. 27]   len
 647     */
 648
 649    magic = be32_to_cpup((uint32_t*)buf);
 650    request->type  = be32_to_cpup((uint32_t*)(buf + 4));
 651    request->handle = be64_to_cpup((uint64_t*)(buf + 8));
 652    request->from  = be64_to_cpup((uint64_t*)(buf + 16));
 653    request->len   = be32_to_cpup((uint32_t*)(buf + 24));
 654
 655    TRACE("Got request: "
 656          "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
 657          magic, request->type, request->from, request->len);
 658
 659    if (magic != NBD_REQUEST_MAGIC) {
 660        LOG("invalid magic (got 0x%x)", magic);
 661        return -EINVAL;
 662    }
 663    return 0;
 664}
 665
 666static ssize_t nbd_send_reply(QIOChannel *ioc, struct nbd_reply *reply)
 667{
 668    uint8_t buf[NBD_REPLY_SIZE];
 669    ssize_t ret;
 670
 671    reply->error = system_errno_to_nbd_errno(reply->error);
 672
 673    TRACE("Sending response to client: { .error = %d, handle = %" PRIu64 " }",
 674          reply->error, reply->handle);
 675
 676    /* Reply
 677       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
 678       [ 4 ..  7]    error   (0 == no error)
 679       [ 7 .. 15]    handle
 680     */
 681    stl_be_p(buf, NBD_REPLY_MAGIC);
 682    stl_be_p(buf + 4, reply->error);
 683    stq_be_p(buf + 8, reply->handle);
 684
 685    ret = write_sync(ioc, buf, sizeof(buf));
 686    if (ret < 0) {
 687        return ret;
 688    }
 689
 690    if (ret != sizeof(buf)) {
 691        LOG("writing to socket failed");
 692        return -EINVAL;
 693    }
 694    return 0;
 695}
 696
 697#define MAX_NBD_REQUESTS 16
 698
 699void nbd_client_get(NBDClient *client)
 700{
 701    client->refcount++;
 702}
 703
 704void nbd_client_put(NBDClient *client)
 705{
 706    if (--client->refcount == 0) {
 707        /* The last reference should be dropped by client->close,
 708         * which is called by client_close.
 709         */
 710        assert(client->closing);
 711
 712        nbd_unset_handlers(client);
 713        object_unref(OBJECT(client->sioc));
 714        object_unref(OBJECT(client->ioc));
 715        if (client->tlscreds) {
 716            object_unref(OBJECT(client->tlscreds));
 717        }
 718        g_free(client->tlsaclname);
 719        if (client->exp) {
 720            QTAILQ_REMOVE(&client->exp->clients, client, next);
 721            nbd_export_put(client->exp);
 722        }
 723        g_free(client);
 724    }
 725}
 726
 727static void client_close(NBDClient *client)
 728{
 729    if (client->closing) {
 730        return;
 731    }
 732
 733    client->closing = true;
 734
 735    /* Force requests to finish.  They will drop their own references,
 736     * then we'll close the socket and free the NBDClient.
 737     */
 738    qio_channel_shutdown(client->ioc, QIO_CHANNEL_SHUTDOWN_BOTH,
 739                         NULL);
 740
 741    /* Also tell the client, so that they release their reference.  */
 742    if (client->close) {
 743        client->close(client);
 744    }
 745}
 746
 747static NBDRequest *nbd_request_get(NBDClient *client)
 748{
 749    NBDRequest *req;
 750
 751    assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
 752    client->nb_requests++;
 753    nbd_update_can_read(client);
 754
 755    req = g_new0(NBDRequest, 1);
 756    nbd_client_get(client);
 757    req->client = client;
 758    return req;
 759}
 760
 761static void nbd_request_put(NBDRequest *req)
 762{
 763    NBDClient *client = req->client;
 764
 765    if (req->data) {
 766        qemu_vfree(req->data);
 767    }
 768    g_free(req);
 769
 770    client->nb_requests--;
 771    nbd_update_can_read(client);
 772    nbd_client_put(client);
 773}
 774
 775static void blk_aio_attached(AioContext *ctx, void *opaque)
 776{
 777    NBDExport *exp = opaque;
 778    NBDClient *client;
 779
 780    TRACE("Export %s: Attaching clients to AIO context %p\n", exp->name, ctx);
 781
 782    exp->ctx = ctx;
 783
 784    QTAILQ_FOREACH(client, &exp->clients, next) {
 785        nbd_set_handlers(client);
 786    }
 787}
 788
 789static void blk_aio_detach(void *opaque)
 790{
 791    NBDExport *exp = opaque;
 792    NBDClient *client;
 793
 794    TRACE("Export %s: Detaching clients from AIO context %p\n", exp->name, exp->ctx);
 795
 796    QTAILQ_FOREACH(client, &exp->clients, next) {
 797        nbd_unset_handlers(client);
 798    }
 799
 800    exp->ctx = NULL;
 801}
 802
 803static void nbd_eject_notifier(Notifier *n, void *data)
 804{
 805    NBDExport *exp = container_of(n, NBDExport, eject_notifier);
 806    nbd_export_close(exp);
 807}
 808
 809NBDExport *nbd_export_new(BlockBackend *blk, off_t dev_offset, off_t size,
 810                          uint32_t nbdflags, void (*close)(NBDExport *),
 811                          Error **errp)
 812{
 813    NBDExport *exp = g_malloc0(sizeof(NBDExport));
 814    exp->refcount = 1;
 815    QTAILQ_INIT(&exp->clients);
 816    exp->blk = blk;
 817    exp->dev_offset = dev_offset;
 818    exp->nbdflags = nbdflags;
 819    exp->size = size < 0 ? blk_getlength(blk) : size;
 820    if (exp->size < 0) {
 821        error_setg_errno(errp, -exp->size,
 822                         "Failed to determine the NBD export's length");
 823        goto fail;
 824    }
 825    exp->size -= exp->size % BDRV_SECTOR_SIZE;
 826
 827    exp->close = close;
 828    exp->ctx = blk_get_aio_context(blk);
 829    blk_ref(blk);
 830    blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);
 831
 832    exp->eject_notifier.notify = nbd_eject_notifier;
 833    blk_add_remove_bs_notifier(blk, &exp->eject_notifier);
 834
 835    /*
 836     * NBD exports are used for non-shared storage migration.  Make sure
 837     * that BDRV_O_INACTIVE is cleared and the image is ready for write
 838     * access since the export could be available before migration handover.
 839     */
 840    aio_context_acquire(exp->ctx);
 841    blk_invalidate_cache(blk, NULL);
 842    aio_context_release(exp->ctx);
 843    return exp;
 844
 845fail:
 846    g_free(exp);
 847    return NULL;
 848}
 849
 850NBDExport *nbd_export_find(const char *name)
 851{
 852    NBDExport *exp;
 853    QTAILQ_FOREACH(exp, &exports, next) {
 854        if (strcmp(name, exp->name) == 0) {
 855            return exp;
 856        }
 857    }
 858
 859    return NULL;
 860}
 861
 862void nbd_export_set_name(NBDExport *exp, const char *name)
 863{
 864    if (exp->name == name) {
 865        return;
 866    }
 867
 868    nbd_export_get(exp);
 869    if (exp->name != NULL) {
 870        g_free(exp->name);
 871        exp->name = NULL;
 872        QTAILQ_REMOVE(&exports, exp, next);
 873        nbd_export_put(exp);
 874    }
 875    if (name != NULL) {
 876        nbd_export_get(exp);
 877        exp->name = g_strdup(name);
 878        QTAILQ_INSERT_TAIL(&exports, exp, next);
 879    }
 880    nbd_export_put(exp);
 881}
 882
 883void nbd_export_close(NBDExport *exp)
 884{
 885    NBDClient *client, *next;
 886
 887    nbd_export_get(exp);
 888    QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
 889        client_close(client);
 890    }
 891    nbd_export_set_name(exp, NULL);
 892    nbd_export_put(exp);
 893}
 894
 895void nbd_export_get(NBDExport *exp)
 896{
 897    assert(exp->refcount > 0);
 898    exp->refcount++;
 899}
 900
 901void nbd_export_put(NBDExport *exp)
 902{
 903    assert(exp->refcount > 0);
 904    if (exp->refcount == 1) {
 905        nbd_export_close(exp);
 906    }
 907
 908    if (--exp->refcount == 0) {
 909        assert(exp->name == NULL);
 910
 911        if (exp->close) {
 912            exp->close(exp);
 913        }
 914
 915        if (exp->blk) {
 916            notifier_remove(&exp->eject_notifier);
 917            blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
 918                                            blk_aio_detach, exp);
 919            blk_unref(exp->blk);
 920            exp->blk = NULL;
 921        }
 922
 923        g_free(exp);
 924    }
 925}
 926
 927BlockBackend *nbd_export_get_blockdev(NBDExport *exp)
 928{
 929    return exp->blk;
 930}
 931
 932void nbd_export_close_all(void)
 933{
 934    NBDExport *exp, *next;
 935
 936    QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
 937        nbd_export_close(exp);
 938    }
 939}
 940
 941static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
 942                                 int len)
 943{
 944    NBDClient *client = req->client;
 945    ssize_t rc, ret;
 946
 947    g_assert(qemu_in_coroutine());
 948    qemu_co_mutex_lock(&client->send_lock);
 949    client->send_coroutine = qemu_coroutine_self();
 950    nbd_set_handlers(client);
 951
 952    if (!len) {
 953        rc = nbd_send_reply(client->ioc, reply);
 954    } else {
 955        qio_channel_set_cork(client->ioc, true);
 956        rc = nbd_send_reply(client->ioc, reply);
 957        if (rc >= 0) {
 958            ret = write_sync(client->ioc, req->data, len);
 959            if (ret != len) {
 960                rc = -EIO;
 961            }
 962        }
 963        qio_channel_set_cork(client->ioc, false);
 964    }
 965
 966    client->send_coroutine = NULL;
 967    nbd_set_handlers(client);
 968    qemu_co_mutex_unlock(&client->send_lock);
 969    return rc;
 970}
 971
 972static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
 973{
 974    NBDClient *client = req->client;
 975    uint32_t command;
 976    ssize_t rc;
 977
 978    g_assert(qemu_in_coroutine());
 979    client->recv_coroutine = qemu_coroutine_self();
 980    nbd_update_can_read(client);
 981
 982    rc = nbd_receive_request(client->ioc, request);
 983    if (rc < 0) {
 984        if (rc != -EAGAIN) {
 985            rc = -EIO;
 986        }
 987        goto out;
 988    }
 989
 990    if ((request->from + request->len) < request->from) {
 991        LOG("integer overflow detected! "
 992            "you're probably being attacked");
 993        rc = -EINVAL;
 994        goto out;
 995    }
 996
 997    TRACE("Decoding type");
 998
 999    command = request->type & NBD_CMD_MASK_COMMAND;
1000    if (command == NBD_CMD_READ || command == NBD_CMD_WRITE) {
1001        if (request->len > NBD_MAX_BUFFER_SIZE) {
1002            LOG("len (%u) is larger than max len (%u)",
1003                request->len, NBD_MAX_BUFFER_SIZE);
1004            rc = -EINVAL;
1005            goto out;
1006        }
1007
1008        req->data = blk_try_blockalign(client->exp->blk, request->len);
1009        if (req->data == NULL) {
1010            rc = -ENOMEM;
1011            goto out;
1012        }
1013    }
1014    if (command == NBD_CMD_WRITE) {
1015        TRACE("Reading %u byte(s)", request->len);
1016
1017        if (read_sync(client->ioc, req->data, request->len) != request->len) {
1018            LOG("reading from socket failed");
1019            rc = -EIO;
1020            goto out;
1021        }
1022    }
1023    rc = 0;
1024
1025out:
1026    client->recv_coroutine = NULL;
1027    nbd_update_can_read(client);
1028
1029    return rc;
1030}
1031
1032static void nbd_trip(void *opaque)
1033{
1034    NBDClient *client = opaque;
1035    NBDExport *exp = client->exp;
1036    NBDRequest *req;
1037    struct nbd_request request;
1038    struct nbd_reply reply;
1039    ssize_t ret;
1040    uint32_t command;
1041
1042    TRACE("Reading request.");
1043    if (client->closing) {
1044        return;
1045    }
1046
1047    req = nbd_request_get(client);
1048    ret = nbd_co_receive_request(req, &request);
1049    if (ret == -EAGAIN) {
1050        goto done;
1051    }
1052    if (ret == -EIO) {
1053        goto out;
1054    }
1055
1056    reply.handle = request.handle;
1057    reply.error = 0;
1058
1059    if (ret < 0) {
1060        reply.error = -ret;
1061        goto error_reply;
1062    }
1063    command = request.type & NBD_CMD_MASK_COMMAND;
1064    if (command != NBD_CMD_DISC && (request.from + request.len) > exp->size) {
1065            LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
1066            ", Offset: %" PRIu64 "\n",
1067                    request.from, request.len,
1068                    (uint64_t)exp->size, (uint64_t)exp->dev_offset);
1069        LOG("requested operation past EOF--bad client?");
1070        goto invalid_request;
1071    }
1072
1073    if (client->closing) {
1074        /*
1075         * The client may be closed when we are blocked in
1076         * nbd_co_receive_request()
1077         */
1078        goto done;
1079    }
1080
1081    switch (command) {
1082    case NBD_CMD_READ:
1083        TRACE("Request type is READ");
1084
1085        if (request.type & NBD_CMD_FLAG_FUA) {
1086            ret = blk_co_flush(exp->blk);
1087            if (ret < 0) {
1088                LOG("flush failed");
1089                reply.error = -ret;
1090                goto error_reply;
1091            }
1092        }
1093
1094        ret = blk_pread(exp->blk, request.from + exp->dev_offset,
1095                        req->data, request.len);
1096        if (ret < 0) {
1097            LOG("reading from file failed");
1098            reply.error = -ret;
1099            goto error_reply;
1100        }
1101
1102        TRACE("Read %u byte(s)", request.len);
1103        if (nbd_co_send_reply(req, &reply, request.len) < 0)
1104            goto out;
1105        break;
1106    case NBD_CMD_WRITE:
1107        TRACE("Request type is WRITE");
1108
1109        if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
1110            TRACE("Server is read-only, return error");
1111            reply.error = EROFS;
1112            goto error_reply;
1113        }
1114
1115        TRACE("Writing to device");
1116
1117        ret = blk_pwrite(exp->blk, request.from + exp->dev_offset,
1118                        req->data, request.len);
1119        if (ret < 0) {
1120            LOG("writing to file failed");
1121            reply.error = -ret;
1122            goto error_reply;
1123        }
1124
1125        if (request.type & NBD_CMD_FLAG_FUA) {
1126            ret = blk_co_flush(exp->blk);
1127            if (ret < 0) {
1128                LOG("flush failed");
1129                reply.error = -ret;
1130                goto error_reply;
1131            }
1132        }
1133
1134        if (nbd_co_send_reply(req, &reply, 0) < 0) {
1135            goto out;
1136        }
1137        break;
1138    case NBD_CMD_DISC:
1139        TRACE("Request type is DISCONNECT");
1140        errno = 0;
1141        goto out;
1142    case NBD_CMD_FLUSH:
1143        TRACE("Request type is FLUSH");
1144
1145        ret = blk_co_flush(exp->blk);
1146        if (ret < 0) {
1147            LOG("flush failed");
1148            reply.error = -ret;
1149        }
1150        if (nbd_co_send_reply(req, &reply, 0) < 0) {
1151            goto out;
1152        }
1153        break;
1154    case NBD_CMD_TRIM:
1155        TRACE("Request type is TRIM");
1156        ret = blk_co_discard(exp->blk, (request.from + exp->dev_offset)
1157                                       / BDRV_SECTOR_SIZE,
1158                             request.len / BDRV_SECTOR_SIZE);
1159        if (ret < 0) {
1160            LOG("discard failed");
1161            reply.error = -ret;
1162        }
1163        if (nbd_co_send_reply(req, &reply, 0) < 0) {
1164            goto out;
1165        }
1166        break;
1167    default:
1168        LOG("invalid request type (%u) received", request.type);
1169    invalid_request:
1170        reply.error = EINVAL;
1171    error_reply:
1172        if (nbd_co_send_reply(req, &reply, 0) < 0) {
1173            goto out;
1174        }
1175        break;
1176    }
1177
1178    TRACE("Request/Reply complete");
1179
1180done:
1181    nbd_request_put(req);
1182    return;
1183
1184out:
1185    nbd_request_put(req);
1186    client_close(client);
1187}
1188
1189static void nbd_read(void *opaque)
1190{
1191    NBDClient *client = opaque;
1192
1193    if (client->recv_coroutine) {
1194        qemu_coroutine_enter(client->recv_coroutine, NULL);
1195    } else {
1196        qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
1197    }
1198}
1199
1200static void nbd_restart_write(void *opaque)
1201{
1202    NBDClient *client = opaque;
1203
1204    qemu_coroutine_enter(client->send_coroutine, NULL);
1205}
1206
1207static void nbd_set_handlers(NBDClient *client)
1208{
1209    if (client->exp && client->exp->ctx) {
1210        aio_set_fd_handler(client->exp->ctx, client->sioc->fd,
1211                           true,
1212                           client->can_read ? nbd_read : NULL,
1213                           client->send_coroutine ? nbd_restart_write : NULL,
1214                           client);
1215    }
1216}
1217
1218static void nbd_unset_handlers(NBDClient *client)
1219{
1220    if (client->exp && client->exp->ctx) {
1221        aio_set_fd_handler(client->exp->ctx, client->sioc->fd,
1222                           true, NULL, NULL, NULL);
1223    }
1224}
1225
1226static void nbd_update_can_read(NBDClient *client)
1227{
1228    bool can_read = client->recv_coroutine ||
1229                    client->nb_requests < MAX_NBD_REQUESTS;
1230
1231    if (can_read != client->can_read) {
1232        client->can_read = can_read;
1233        nbd_set_handlers(client);
1234
1235        /* There is no need to invoke aio_notify(), since aio_set_fd_handler()
1236         * in nbd_set_handlers() will have taken care of that */
1237    }
1238}
1239
1240static coroutine_fn void nbd_co_client_start(void *opaque)
1241{
1242    NBDClientNewData *data = opaque;
1243    NBDClient *client = data->client;
1244    NBDExport *exp = client->exp;
1245
1246    if (exp) {
1247        nbd_export_get(exp);
1248    }
1249    if (nbd_negotiate(data)) {
1250        client_close(client);
1251        goto out;
1252    }
1253    qemu_co_mutex_init(&client->send_lock);
1254    nbd_set_handlers(client);
1255
1256    if (exp) {
1257        QTAILQ_INSERT_TAIL(&exp->clients, client, next);
1258    }
1259out:
1260    g_free(data);
1261}
1262
1263void nbd_client_new(NBDExport *exp,
1264                    QIOChannelSocket *sioc,
1265                    QCryptoTLSCreds *tlscreds,
1266                    const char *tlsaclname,
1267                    void (*close_fn)(NBDClient *))
1268{
1269    NBDClient *client;
1270    NBDClientNewData *data = g_new(NBDClientNewData, 1);
1271
1272    client = g_malloc0(sizeof(NBDClient));
1273    client->refcount = 1;
1274    client->exp = exp;
1275    client->tlscreds = tlscreds;
1276    if (tlscreds) {
1277        object_ref(OBJECT(client->tlscreds));
1278    }
1279    client->tlsaclname = g_strdup(tlsaclname);
1280    client->sioc = sioc;
1281    object_ref(OBJECT(client->sioc));
1282    client->ioc = QIO_CHANNEL(sioc);
1283    object_ref(OBJECT(client->ioc));
1284    client->can_read = true;
1285    client->close = close_fn;
1286
1287    data->client = client;
1288    data->co = qemu_coroutine_create(nbd_co_client_start);
1289    qemu_coroutine_enter(data->co, data);
1290}
1291