qemu/nbd.c
<<
>>
Prefs
   1/*
   2 *  Copyright (C) 2005  Anthony Liguori <anthony@codemonkey.ws>
   3 *
   4 *  Network Block Device
   5 *
   6 *  This program is free software; you can redistribute it and/or modify
   7 *  it under the terms of the GNU General Public License as published by
   8 *  the Free Software Foundation; under version 2 of the License.
   9 *
  10 *  This program is distributed in the hope that it will be useful,
  11 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 *  GNU General Public License for more details.
  14 *
  15 *  You should have received a copy of the GNU General Public License
  16 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
  17 */
  18
  19#include "block/nbd.h"
  20#include "sysemu/block-backend.h"
  21
  22#include "block/coroutine.h"
  23
  24#include <errno.h>
  25#include <string.h>
  26#ifndef _WIN32
  27#include <sys/ioctl.h>
  28#endif
  29#if defined(__sun__) || defined(__HAIKU__)
  30#include <sys/ioccom.h>
  31#endif
  32#include <ctype.h>
  33#include <inttypes.h>
  34
  35#ifdef __linux__
  36#include <linux/fs.h>
  37#endif
  38
  39#include "qemu/sockets.h"
  40#include "qemu/queue.h"
  41#include "qemu/main-loop.h"
  42
  43//#define DEBUG_NBD
  44
  /*
   * TRACE(): debug-only logging.  When DEBUG_NBD is defined it forwards its
   * arguments to LOG(); otherwise it expands to an empty statement, so its
   * arguments are not evaluated at all.  LOG() is defined after TRACE(),
   * which works because the macro is only expanded where TRACE() is used.
   */
  45#ifdef DEBUG_NBD
  46#define TRACE(msg, ...) do { \
  47    LOG(msg, ## __VA_ARGS__); \
  48} while(0)
  49#else
  50#define TRACE(msg, ...) \
  51    do { } while (0)
  52#endif
  53
  /*
   * LOG(): unconditional diagnostic written to stderr, prefixed with the
   * source file, function name and line number of the call site and
   * terminated with a newline.  @msg must be a string literal because it is
   * pasted between two other literals.
   */
  54#define LOG(msg, ...) do { \
  55    fprintf(stderr, "%s:%s():L%d: " msg "\n", \
  56            __FILE__, __FUNCTION__, __LINE__, ## __VA_ARGS__); \
  57} while(0)
  58
  59/* This is all part of the "official" NBD API.
  60 *
  61 * The most up-to-date documentation is available at:
  62 * https://github.com/yoe/nbd/blob/master/doc/proto.txt
  63 */
  64
  /* Wire sizes in bytes, written as the sum of the individual field widths. */
  65#define NBD_REQUEST_SIZE        (4 + 4 + 8 + 8 + 4)   /* magic, type, handle, from, len */
  66#define NBD_REPLY_SIZE          (4 + 4 + 8)           /* magic, error, handle */
  /* Magic numbers that identify the different message kinds. */
  67#define NBD_REQUEST_MAGIC       0x25609513
  68#define NBD_REPLY_MAGIC         0x67446698
  69#define NBD_OPTS_MAGIC          0x49484156454F5054LL  /* ASCII "IHAVEOPT" */
  70#define NBD_CLIENT_MAGIC        0x0000420281861253LL
  71#define NBD_REP_MAGIC           0x3e889045565a9LL
  72
  /* ioctl request codes issued on the NBD device file descriptor. */
  73#define NBD_SET_SOCK            _IO(0xab, 0)
  74#define NBD_SET_BLKSIZE         _IO(0xab, 1)
  75#define NBD_SET_SIZE            _IO(0xab, 2)
  76#define NBD_DO_IT               _IO(0xab, 3)
  77#define NBD_CLEAR_SOCK          _IO(0xab, 4)
  78#define NBD_CLEAR_QUE           _IO(0xab, 5)
  79#define NBD_PRINT_DEBUG         _IO(0xab, 6)
  80#define NBD_SET_SIZE_BLOCKS     _IO(0xab, 7)
  81#define NBD_DISCONNECT          _IO(0xab, 8)
  82#define NBD_SET_TIMEOUT         _IO(0xab, 9)
  83#define NBD_SET_FLAGS           _IO(0xab, 10)
  84
  /* Option codes a client may send during option negotiation. */
  85#define NBD_OPT_EXPORT_NAME     (1)
  86#define NBD_OPT_ABORT           (2)
  87#define NBD_OPT_LIST            (3)
  88
  89/* Definitions for opaque data types */
  90
  91typedef struct NBDRequest NBDRequest;
  92
  /*
   * One in-flight request of a client.  Allocated by nbd_request_get() and
   * released by nbd_request_put().
   */
  93struct NBDRequest {
  94    QSIMPLEQ_ENTRY(NBDRequest) entry;   /* queue linkage */
  95    NBDClient *client;                  /* owning client; a reference is held while the request lives */
  96    uint8_t *data;                      /* optional payload buffer, released with qemu_vfree() */
  97};
  98
  /*
   * A block backend exported over NBD.  Reference counted through
   * nbd_export_get()/nbd_export_put().
   */
  99struct NBDExport {
 100    int refcount;                       /* starts at 1 in nbd_export_new() */
 101    void (*close)(NBDExport *exp);      /* optional; called by nbd_export_put() when the last reference goes */
 102
 103    BlockBackend *blk;                  /* backend being served; set to NULL by nbd_export_close() */
 104    char *name;                         /* heap copy of the export name, NULL while unnamed */
 105    off_t dev_offset;                   /* offset supplied to nbd_export_new() */
 106    off_t size;                         /* export length, rounded down to a multiple of BDRV_SECTOR_SIZE */
 107    uint32_t nbdflags;                  /* export flags; negotiation asserts they fit in 16 bits */
 108    QTAILQ_HEAD(, NBDClient) clients;   /* clients attached to this export */
 109    QTAILQ_ENTRY(NBDExport) next;       /* linkage in the global "exports" list (named exports only) */
 110
 111    AioContext *ctx;                    /* current AIO context; NULL between detach and attach */
 112};
 113
 114static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
 115
 /*
  * State of one connected client.  Reference counted through
  * nbd_client_get()/nbd_client_put().
  */
 116struct NBDClient {
 117    int refcount;
 118    void (*close)(NBDClient *client);   /* optional; invoked once from client_close() */
 119
 120    NBDExport *exp;                     /* export in use; may be NULL until an export name is negotiated */
 121    int sock;                           /* client socket, closed by nbd_client_put() */
 122
 123    Coroutine *recv_coroutine;          /* serializes readers, see the comment in read_sync() */
 124
 125    CoMutex send_lock;                  /* held while a reply is being written */
 126    Coroutine *send_coroutine;          /* coroutine currently sending a reply, else NULL */
 127
 128    bool can_read;
 129
 130    QTAILQ_ENTRY(NBDClient) next;       /* linkage in exp->clients */
 131    int nb_requests;                    /* requests in flight, asserted below MAX_NBD_REQUESTS on allocation */
 132    bool closing;                       /* set by client_close(); makes later calls no-ops */
 133};
 134
 135/* That's all folks */
 136
 137static void nbd_set_handlers(NBDClient *client);
 138static void nbd_unset_handlers(NBDClient *client);
 139static void nbd_update_can_read(NBDClient *client);
 140
 141ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
 142{
 143    size_t offset = 0;
 144    int err;
 145
 146    if (qemu_in_coroutine()) {
 147        if (do_read) {
 148            return qemu_co_recv(fd, buffer, size);
 149        } else {
 150            return qemu_co_send(fd, buffer, size);
 151        }
 152    }
 153
 154    while (offset < size) {
 155        ssize_t len;
 156
 157        if (do_read) {
 158            len = qemu_recv(fd, buffer + offset, size - offset, 0);
 159        } else {
 160            len = send(fd, buffer + offset, size - offset, 0);
 161        }
 162
 163        if (len < 0) {
 164            err = socket_error();
 165
 166            /* recoverable error */
 167            if (err == EINTR || (offset > 0 && (err == EAGAIN || err == EWOULDBLOCK))) {
 168                continue;
 169            }
 170
 171            /* unrecoverable error */
 172            return -err;
 173        }
 174
 175        /* eof */
 176        if (len == 0) {
 177            break;
 178        }
 179
 180        offset += len;
 181    }
 182
 183    return offset;
 184}
 185
 186static ssize_t read_sync(int fd, void *buffer, size_t size)
 187{
 188    /* Sockets are kept in blocking mode in the negotiation phase.  After
 189     * that, a non-readable socket simply means that another thread stole
 190     * our request/reply.  Synchronization is done with recv_coroutine, so
 191     * that this is coroutine-safe.
 192     */
 193    return nbd_wr_sync(fd, buffer, size, true);
 194}
 195
 196static ssize_t drop_sync(int fd, size_t size)
 197{
 198    ssize_t ret, dropped = size;
 199    uint8_t *buffer = g_malloc(MIN(65536, size));
 200
 201    while (size > 0) {
 202        ret = read_sync(fd, buffer, MIN(65536, size));
 203        if (ret < 0) {
 204            g_free(buffer);
 205            return ret;
 206        }
 207
 208        assert(ret <= size);
 209        size -= ret;
 210    }
 211
 212    g_free(buffer);
 213    return dropped;
 214}
 215
 216static ssize_t write_sync(int fd, void *buffer, size_t size)
 217{
 218    int ret;
 219    do {
 220        /* For writes, we do expect the socket to be writable.  */
 221        ret = nbd_wr_sync(fd, buffer, size, false);
 222    } while (ret == -EAGAIN);
 223    return ret;
 224}
 225
 226/* Basic flow for negotiation
 227
 228   Server         Client
 229   Negotiate
 230
 231   or
 232
 233   Server         Client
 234   Negotiate #1
 235                  Option
 236   Negotiate #2
 237
 238   ----
 239
 240   followed by
 241
 242   Server         Client
 243                  Request
 244   Response
 245                  Request
 246   Response
 247                  ...
 248   ...
 249                  Request (type == 2)
 250
 251*/
 252
 253static int nbd_send_rep(int csock, uint32_t type, uint32_t opt)
 254{
 255    uint64_t magic;
 256    uint32_t len;
 257
 258    magic = cpu_to_be64(NBD_REP_MAGIC);
 259    if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
 260        LOG("write failed (rep magic)");
 261        return -EINVAL;
 262    }
 263    opt = cpu_to_be32(opt);
 264    if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
 265        LOG("write failed (rep opt)");
 266        return -EINVAL;
 267    }
 268    type = cpu_to_be32(type);
 269    if (write_sync(csock, &type, sizeof(type)) != sizeof(type)) {
 270        LOG("write failed (rep type)");
 271        return -EINVAL;
 272    }
 273    len = cpu_to_be32(0);
 274    if (write_sync(csock, &len, sizeof(len)) != sizeof(len)) {
 275        LOG("write failed (rep data length)");
 276        return -EINVAL;
 277    }
 278    return 0;
 279}
 280
 281static int nbd_send_rep_list(int csock, NBDExport *exp)
 282{
 283    uint64_t magic, name_len;
 284    uint32_t opt, type, len;
 285
 286    name_len = strlen(exp->name);
 287    magic = cpu_to_be64(NBD_REP_MAGIC);
 288    if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
 289        LOG("write failed (magic)");
 290        return -EINVAL;
 291     }
 292    opt = cpu_to_be32(NBD_OPT_LIST);
 293    if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
 294        LOG("write failed (opt)");
 295        return -EINVAL;
 296    }
 297    type = cpu_to_be32(NBD_REP_SERVER);
 298    if (write_sync(csock, &type, sizeof(type)) != sizeof(type)) {
 299        LOG("write failed (reply type)");
 300        return -EINVAL;
 301    }
 302    len = cpu_to_be32(name_len + sizeof(len));
 303    if (write_sync(csock, &len, sizeof(len)) != sizeof(len)) {
 304        LOG("write failed (length)");
 305        return -EINVAL;
 306    }
 307    len = cpu_to_be32(name_len);
 308    if (write_sync(csock, &len, sizeof(len)) != sizeof(len)) {
 309        LOG("write failed (length)");
 310        return -EINVAL;
 311    }
 312    if (write_sync(csock, exp->name, name_len) != name_len) {
 313        LOG("write failed (buffer)");
 314        return -EINVAL;
 315    }
 316    return 0;
 317}
 318
 319static int nbd_handle_list(NBDClient *client, uint32_t length)
 320{
 321    int csock;
 322    NBDExport *exp;
 323
 324    csock = client->sock;
 325    if (length) {
 326        if (drop_sync(csock, length) != length) {
 327            return -EIO;
 328        }
 329        return nbd_send_rep(csock, NBD_REP_ERR_INVALID, NBD_OPT_LIST);
 330    }
 331
 332    /* For each export, send a NBD_REP_SERVER reply. */
 333    QTAILQ_FOREACH(exp, &exports, next) {
 334        if (nbd_send_rep_list(csock, exp)) {
 335            return -EINVAL;
 336        }
 337    }
 338    /* Finish with a NBD_REP_ACK. */
 339    return nbd_send_rep(csock, NBD_REP_ACK, NBD_OPT_LIST);
 340}
 341
 342static int nbd_handle_export_name(NBDClient *client, uint32_t length)
 343{
 344    int rc = -EINVAL, csock = client->sock;
 345    char name[256];
 346
 347    /* Client sends:
 348        [20 ..  xx]   export name (length bytes)
 349     */
 350    TRACE("Checking length");
 351    if (length > 255) {
 352        LOG("Bad length received");
 353        goto fail;
 354    }
 355    if (read_sync(csock, name, length) != length) {
 356        LOG("read failed");
 357        goto fail;
 358    }
 359    name[length] = '\0';
 360
 361    client->exp = nbd_export_find(name);
 362    if (!client->exp) {
 363        LOG("export not found");
 364        goto fail;
 365    }
 366
 367    QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
 368    nbd_export_get(client->exp);
 369    rc = 0;
 370fail:
 371    return rc;
 372}
 373
 374static int nbd_receive_options(NBDClient *client)
 375{
 376    int csock = client->sock;
 377    uint32_t flags;
 378
 379    /* Client sends:
 380        [ 0 ..   3]   client flags
 381
 382        [ 0 ..   7]   NBD_OPTS_MAGIC
 383        [ 8 ..  11]   NBD option
 384        [12 ..  15]   Data length
 385        ...           Rest of request
 386
 387        [ 0 ..   7]   NBD_OPTS_MAGIC
 388        [ 8 ..  11]   Second NBD option
 389        [12 ..  15]   Data length
 390        ...           Rest of request
 391    */
 392
 393    if (read_sync(csock, &flags, sizeof(flags)) != sizeof(flags)) {
 394        LOG("read failed");
 395        return -EIO;
 396    }
 397    TRACE("Checking client flags");
 398    be32_to_cpus(&flags);
 399    if (flags != 0 && flags != NBD_FLAG_C_FIXED_NEWSTYLE) {
 400        LOG("Bad client flags received");
 401        return -EIO;
 402    }
 403
 404    while (1) {
 405        int ret;
 406        uint32_t tmp, length;
 407        uint64_t magic;
 408
 409        if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
 410            LOG("read failed");
 411            return -EINVAL;
 412        }
 413        TRACE("Checking opts magic");
 414        if (magic != be64_to_cpu(NBD_OPTS_MAGIC)) {
 415            LOG("Bad magic received");
 416            return -EINVAL;
 417        }
 418
 419        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
 420            LOG("read failed");
 421            return -EINVAL;
 422        }
 423
 424        if (read_sync(csock, &length, sizeof(length)) != sizeof(length)) {
 425            LOG("read failed");
 426            return -EINVAL;
 427        }
 428        length = be32_to_cpu(length);
 429
 430        TRACE("Checking option");
 431        switch (be32_to_cpu(tmp)) {
 432        case NBD_OPT_LIST:
 433            ret = nbd_handle_list(client, length);
 434            if (ret < 0) {
 435                return ret;
 436            }
 437            break;
 438
 439        case NBD_OPT_ABORT:
 440            return -EINVAL;
 441
 442        case NBD_OPT_EXPORT_NAME:
 443            return nbd_handle_export_name(client, length);
 444
 445        default:
 446            tmp = be32_to_cpu(tmp);
 447            LOG("Unsupported option 0x%x", tmp);
 448            nbd_send_rep(client->sock, NBD_REP_ERR_UNSUP, tmp);
 449            return -EINVAL;
 450        }
 451    }
 452}
 453
 454static int nbd_send_negotiate(NBDClient *client)
 455{
 456    int csock = client->sock;
 457    char buf[8 + 8 + 8 + 128];
 458    int rc;
 459    const int myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
 460                         NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);
 461
 462    /* Negotiation header without options:
 463        [ 0 ..   7]   passwd       ("NBDMAGIC")
 464        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
 465        [16 ..  23]   size
 466        [24 ..  25]   server flags (0)
 467        [26 ..  27]   export flags
 468        [28 .. 151]   reserved     (0)
 469
 470       Negotiation header with options, part 1:
 471        [ 0 ..   7]   passwd       ("NBDMAGIC")
 472        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
 473        [16 ..  17]   server flags (0)
 474
 475       part 2 (after options are sent):
 476        [18 ..  25]   size
 477        [26 ..  27]   export flags
 478        [28 .. 151]   reserved     (0)
 479     */
 480
 481    qemu_set_block(csock);
 482    rc = -EINVAL;
 483
 484    TRACE("Beginning negotiation.");
 485    memset(buf, 0, sizeof(buf));
 486    memcpy(buf, "NBDMAGIC", 8);
 487    if (client->exp) {
 488        assert ((client->exp->nbdflags & ~65535) == 0);
 489        cpu_to_be64w((uint64_t*)(buf + 8), NBD_CLIENT_MAGIC);
 490        cpu_to_be64w((uint64_t*)(buf + 16), client->exp->size);
 491        cpu_to_be16w((uint16_t*)(buf + 26), client->exp->nbdflags | myflags);
 492    } else {
 493        cpu_to_be64w((uint64_t*)(buf + 8), NBD_OPTS_MAGIC);
 494        cpu_to_be16w((uint16_t *)(buf + 16), NBD_FLAG_FIXED_NEWSTYLE);
 495    }
 496
 497    if (client->exp) {
 498        if (write_sync(csock, buf, sizeof(buf)) != sizeof(buf)) {
 499            LOG("write failed");
 500            goto fail;
 501        }
 502    } else {
 503        if (write_sync(csock, buf, 18) != 18) {
 504            LOG("write failed");
 505            goto fail;
 506        }
 507        rc = nbd_receive_options(client);
 508        if (rc != 0) {
 509            LOG("option negotiation failed");
 510            goto fail;
 511        }
 512
 513        assert ((client->exp->nbdflags & ~65535) == 0);
 514        cpu_to_be64w((uint64_t*)(buf + 18), client->exp->size);
 515        cpu_to_be16w((uint16_t*)(buf + 26), client->exp->nbdflags | myflags);
 516        if (write_sync(csock, buf + 18, sizeof(buf) - 18) != sizeof(buf) - 18) {
 517            LOG("write failed");
 518            goto fail;
 519        }
 520    }
 521
 522    TRACE("Negotiation succeeded.");
 523    rc = 0;
 524fail:
 525    qemu_set_nonblock(csock);
 526    return rc;
 527}
 528
 529int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
 530                          off_t *size, Error **errp)
 531{
 532    char buf[256];
 533    uint64_t magic, s;
 534    uint16_t tmp;
 535    int rc;
 536
 537    TRACE("Receiving negotiation.");
 538
 539    rc = -EINVAL;
 540
 541    if (read_sync(csock, buf, 8) != 8) {
 542        error_setg(errp, "Failed to read data");
 543        goto fail;
 544    }
 545
 546    buf[8] = '\0';
 547    if (strlen(buf) == 0) {
 548        error_setg(errp, "Server connection closed unexpectedly");
 549        goto fail;
 550    }
 551
 552    TRACE("Magic is %c%c%c%c%c%c%c%c",
 553          qemu_isprint(buf[0]) ? buf[0] : '.',
 554          qemu_isprint(buf[1]) ? buf[1] : '.',
 555          qemu_isprint(buf[2]) ? buf[2] : '.',
 556          qemu_isprint(buf[3]) ? buf[3] : '.',
 557          qemu_isprint(buf[4]) ? buf[4] : '.',
 558          qemu_isprint(buf[5]) ? buf[5] : '.',
 559          qemu_isprint(buf[6]) ? buf[6] : '.',
 560          qemu_isprint(buf[7]) ? buf[7] : '.');
 561
 562    if (memcmp(buf, "NBDMAGIC", 8) != 0) {
 563        error_setg(errp, "Invalid magic received");
 564        goto fail;
 565    }
 566
 567    if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
 568        error_setg(errp, "Failed to read magic");
 569        goto fail;
 570    }
 571    magic = be64_to_cpu(magic);
 572    TRACE("Magic is 0x%" PRIx64, magic);
 573
 574    if (name) {
 575        uint32_t reserved = 0;
 576        uint32_t opt;
 577        uint32_t namesize;
 578
 579        TRACE("Checking magic (opts_magic)");
 580        if (magic != NBD_OPTS_MAGIC) {
 581            if (magic == NBD_CLIENT_MAGIC) {
 582                error_setg(errp, "Server does not support export names");
 583            } else {
 584                error_setg(errp, "Bad magic received");
 585            }
 586            goto fail;
 587        }
 588        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
 589            error_setg(errp, "Failed to read server flags");
 590            goto fail;
 591        }
 592        *flags = be16_to_cpu(tmp) << 16;
 593        /* reserved for future use */
 594        if (write_sync(csock, &reserved, sizeof(reserved)) !=
 595            sizeof(reserved)) {
 596            error_setg(errp, "Failed to read reserved field");
 597            goto fail;
 598        }
 599        /* write the export name */
 600        magic = cpu_to_be64(magic);
 601        if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
 602            error_setg(errp, "Failed to send export name magic");
 603            goto fail;
 604        }
 605        opt = cpu_to_be32(NBD_OPT_EXPORT_NAME);
 606        if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
 607            error_setg(errp, "Failed to send export name option number");
 608            goto fail;
 609        }
 610        namesize = cpu_to_be32(strlen(name));
 611        if (write_sync(csock, &namesize, sizeof(namesize)) !=
 612            sizeof(namesize)) {
 613            error_setg(errp, "Failed to send export name length");
 614            goto fail;
 615        }
 616        if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) {
 617            error_setg(errp, "Failed to send export name");
 618            goto fail;
 619        }
 620    } else {
 621        TRACE("Checking magic (cli_magic)");
 622
 623        if (magic != NBD_CLIENT_MAGIC) {
 624            if (magic == NBD_OPTS_MAGIC) {
 625                error_setg(errp, "Server requires an export name");
 626            } else {
 627                error_setg(errp, "Bad magic received");
 628            }
 629            goto fail;
 630        }
 631    }
 632
 633    if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) {
 634        error_setg(errp, "Failed to read export length");
 635        goto fail;
 636    }
 637    *size = be64_to_cpu(s);
 638    TRACE("Size is %" PRIu64, *size);
 639
 640    if (!name) {
 641        if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) {
 642            error_setg(errp, "Failed to read export flags");
 643            goto fail;
 644        }
 645        *flags = be32_to_cpup(flags);
 646    } else {
 647        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
 648            error_setg(errp, "Failed to read export flags");
 649            goto fail;
 650        }
 651        *flags |= be16_to_cpu(tmp);
 652    }
 653    if (read_sync(csock, &buf, 124) != 124) {
 654        error_setg(errp, "Failed to read reserved block");
 655        goto fail;
 656    }
 657    rc = 0;
 658
 659fail:
 660    return rc;
 661}
 662
 663#ifdef __linux__
 664int nbd_init(int fd, int csock, uint32_t flags, off_t size)
 665{
 666    TRACE("Setting NBD socket");
 667
 668    if (ioctl(fd, NBD_SET_SOCK, csock) < 0) {
 669        int serrno = errno;
 670        LOG("Failed to set NBD socket");
 671        return -serrno;
 672    }
 673
 674    TRACE("Setting block size to %lu", (unsigned long)BDRV_SECTOR_SIZE);
 675
 676    if (ioctl(fd, NBD_SET_BLKSIZE, (size_t)BDRV_SECTOR_SIZE) < 0) {
 677        int serrno = errno;
 678        LOG("Failed setting NBD block size");
 679        return -serrno;
 680    }
 681
 682    TRACE("Setting size to %zd block(s)", (size_t)(size / BDRV_SECTOR_SIZE));
 683
 684    if (ioctl(fd, NBD_SET_SIZE_BLOCKS, size / (size_t)BDRV_SECTOR_SIZE) < 0) {
 685        int serrno = errno;
 686        LOG("Failed setting size (in blocks)");
 687        return -serrno;
 688    }
 689
 690    if (ioctl(fd, NBD_SET_FLAGS, flags) < 0) {
 691        if (errno == ENOTTY) {
 692            int read_only = (flags & NBD_FLAG_READ_ONLY) != 0;
 693            TRACE("Setting readonly attribute");
 694
 695            if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) {
 696                int serrno = errno;
 697                LOG("Failed setting read-only attribute");
 698                return -serrno;
 699            }
 700        } else {
 701            int serrno = errno;
 702            LOG("Failed setting flags");
 703            return -serrno;
 704        }
 705    }
 706
 707    TRACE("Negotiation ended");
 708
 709    return 0;
 710}
 711
 712int nbd_disconnect(int fd)
 713{
 714    ioctl(fd, NBD_CLEAR_QUE);
 715    ioctl(fd, NBD_DISCONNECT);
 716    ioctl(fd, NBD_CLEAR_SOCK);
 717    return 0;
 718}
 719
 720int nbd_client(int fd)
 721{
 722    int ret;
 723    int serrno;
 724
 725    TRACE("Doing NBD loop");
 726
 727    ret = ioctl(fd, NBD_DO_IT);
 728    if (ret < 0 && errno == EPIPE) {
 729        /* NBD_DO_IT normally returns EPIPE when someone has disconnected
 730         * the socket via NBD_DISCONNECT.  We do not want to return 1 in
 731         * that case.
 732         */
 733        ret = 0;
 734    }
 735    serrno = errno;
 736
 737    TRACE("NBD loop returned %d: %s", ret, strerror(serrno));
 738
 739    TRACE("Clearing NBD queue");
 740    ioctl(fd, NBD_CLEAR_QUE);
 741
 742    TRACE("Clearing NBD socket");
 743    ioctl(fd, NBD_CLEAR_SOCK);
 744
 745    errno = serrno;
 746    return ret;
 747}
 748#else
 749int nbd_init(int fd, int csock, uint32_t flags, off_t size)
 750{
 751    return -ENOTSUP;
 752}
 753
 754int nbd_disconnect(int fd)
 755{
 756    return -ENOTSUP;
 757}
 758
 759int nbd_client(int fd)
 760{
 761    return -ENOTSUP;
 762}
 763#endif
 764
 765ssize_t nbd_send_request(int csock, struct nbd_request *request)
 766{
 767    uint8_t buf[NBD_REQUEST_SIZE];
 768    ssize_t ret;
 769
 770    cpu_to_be32w((uint32_t*)buf, NBD_REQUEST_MAGIC);
 771    cpu_to_be32w((uint32_t*)(buf + 4), request->type);
 772    cpu_to_be64w((uint64_t*)(buf + 8), request->handle);
 773    cpu_to_be64w((uint64_t*)(buf + 16), request->from);
 774    cpu_to_be32w((uint32_t*)(buf + 24), request->len);
 775
 776    TRACE("Sending request to client: "
 777          "{ .from = %" PRIu64", .len = %u, .handle = %" PRIu64", .type=%i}",
 778          request->from, request->len, request->handle, request->type);
 779
 780    ret = write_sync(csock, buf, sizeof(buf));
 781    if (ret < 0) {
 782        return ret;
 783    }
 784
 785    if (ret != sizeof(buf)) {
 786        LOG("writing to socket failed");
 787        return -EINVAL;
 788    }
 789    return 0;
 790}
 791
 792static ssize_t nbd_receive_request(int csock, struct nbd_request *request)
 793{
 794    uint8_t buf[NBD_REQUEST_SIZE];
 795    uint32_t magic;
 796    ssize_t ret;
 797
 798    ret = read_sync(csock, buf, sizeof(buf));
 799    if (ret < 0) {
 800        return ret;
 801    }
 802
 803    if (ret != sizeof(buf)) {
 804        LOG("read failed");
 805        return -EINVAL;
 806    }
 807
 808    /* Request
 809       [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
 810       [ 4 ..  7]   type    (0 == READ, 1 == WRITE)
 811       [ 8 .. 15]   handle
 812       [16 .. 23]   from
 813       [24 .. 27]   len
 814     */
 815
 816    magic = be32_to_cpup((uint32_t*)buf);
 817    request->type  = be32_to_cpup((uint32_t*)(buf + 4));
 818    request->handle = be64_to_cpup((uint64_t*)(buf + 8));
 819    request->from  = be64_to_cpup((uint64_t*)(buf + 16));
 820    request->len   = be32_to_cpup((uint32_t*)(buf + 24));
 821
 822    TRACE("Got request: "
 823          "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
 824          magic, request->type, request->from, request->len);
 825
 826    if (magic != NBD_REQUEST_MAGIC) {
 827        LOG("invalid magic (got 0x%x)", magic);
 828        return -EINVAL;
 829    }
 830    return 0;
 831}
 832
 833ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply)
 834{
 835    uint8_t buf[NBD_REPLY_SIZE];
 836    uint32_t magic;
 837    ssize_t ret;
 838
 839    ret = read_sync(csock, buf, sizeof(buf));
 840    if (ret < 0) {
 841        return ret;
 842    }
 843
 844    if (ret != sizeof(buf)) {
 845        LOG("read failed");
 846        return -EINVAL;
 847    }
 848
 849    /* Reply
 850       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
 851       [ 4 ..  7]    error   (0 == no error)
 852       [ 7 .. 15]    handle
 853     */
 854
 855    magic = be32_to_cpup((uint32_t*)buf);
 856    reply->error  = be32_to_cpup((uint32_t*)(buf + 4));
 857    reply->handle = be64_to_cpup((uint64_t*)(buf + 8));
 858
 859    TRACE("Got reply: "
 860          "{ magic = 0x%x, .error = %d, handle = %" PRIu64" }",
 861          magic, reply->error, reply->handle);
 862
 863    if (magic != NBD_REPLY_MAGIC) {
 864        LOG("invalid magic (got 0x%x)", magic);
 865        return -EINVAL;
 866    }
 867    return 0;
 868}
 869
 870static ssize_t nbd_send_reply(int csock, struct nbd_reply *reply)
 871{
 872    uint8_t buf[NBD_REPLY_SIZE];
 873    ssize_t ret;
 874
 875    /* Reply
 876       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
 877       [ 4 ..  7]    error   (0 == no error)
 878       [ 7 .. 15]    handle
 879     */
 880    cpu_to_be32w((uint32_t*)buf, NBD_REPLY_MAGIC);
 881    cpu_to_be32w((uint32_t*)(buf + 4), reply->error);
 882    cpu_to_be64w((uint64_t*)(buf + 8), reply->handle);
 883
 884    TRACE("Sending response to client");
 885
 886    ret = write_sync(csock, buf, sizeof(buf));
 887    if (ret < 0) {
 888        return ret;
 889    }
 890
 891    if (ret != sizeof(buf)) {
 892        LOG("writing to socket failed");
 893        return -EINVAL;
 894    }
 895    return 0;
 896}
 897
 898#define MAX_NBD_REQUESTS 16
 899
 900void nbd_client_get(NBDClient *client)
 901{
 902    client->refcount++;
 903}
 904
 905void nbd_client_put(NBDClient *client)
 906{
 907    if (--client->refcount == 0) {
 908        /* The last reference should be dropped by client->close,
 909         * which is called by client_close.
 910         */
 911        assert(client->closing);
 912
 913        nbd_unset_handlers(client);
 914        close(client->sock);
 915        client->sock = -1;
 916        if (client->exp) {
 917            QTAILQ_REMOVE(&client->exp->clients, client, next);
 918            nbd_export_put(client->exp);
 919        }
 920        g_free(client);
 921    }
 922}
 923
 924static void client_close(NBDClient *client)
 925{
 926    if (client->closing) {
 927        return;
 928    }
 929
 930    client->closing = true;
 931
 932    /* Force requests to finish.  They will drop their own references,
 933     * then we'll close the socket and free the NBDClient.
 934     */
 935    shutdown(client->sock, 2);
 936
 937    /* Also tell the client, so that they release their reference.  */
 938    if (client->close) {
 939        client->close(client);
 940    }
 941}
 942
 943static NBDRequest *nbd_request_get(NBDClient *client)
 944{
 945    NBDRequest *req;
 946
 947    assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
 948    client->nb_requests++;
 949    nbd_update_can_read(client);
 950
 951    req = g_slice_new0(NBDRequest);
 952    nbd_client_get(client);
 953    req->client = client;
 954    return req;
 955}
 956
 957static void nbd_request_put(NBDRequest *req)
 958{
 959    NBDClient *client = req->client;
 960
 961    if (req->data) {
 962        qemu_vfree(req->data);
 963    }
 964    g_slice_free(NBDRequest, req);
 965
 966    client->nb_requests--;
 967    nbd_update_can_read(client);
 968    nbd_client_put(client);
 969}
 970
 971static void blk_aio_attached(AioContext *ctx, void *opaque)
 972{
 973    NBDExport *exp = opaque;
 974    NBDClient *client;
 975
 976    TRACE("Export %s: Attaching clients to AIO context %p\n", exp->name, ctx);
 977
 978    exp->ctx = ctx;
 979
 980    QTAILQ_FOREACH(client, &exp->clients, next) {
 981        nbd_set_handlers(client);
 982    }
 983}
 984
 985static void blk_aio_detach(void *opaque)
 986{
 987    NBDExport *exp = opaque;
 988    NBDClient *client;
 989
 990    TRACE("Export %s: Detaching clients from AIO context %p\n", exp->name, exp->ctx);
 991
 992    QTAILQ_FOREACH(client, &exp->clients, next) {
 993        nbd_unset_handlers(client);
 994    }
 995
 996    exp->ctx = NULL;
 997}
 998
 999NBDExport *nbd_export_new(BlockBackend *blk, off_t dev_offset, off_t size,
1000                          uint32_t nbdflags, void (*close)(NBDExport *),
1001                          Error **errp)
1002{
1003    NBDExport *exp = g_malloc0(sizeof(NBDExport));
1004    exp->refcount = 1;
1005    QTAILQ_INIT(&exp->clients);
1006    exp->blk = blk;
1007    exp->dev_offset = dev_offset;
1008    exp->nbdflags = nbdflags;
1009    exp->size = size < 0 ? blk_getlength(blk) : size;
1010    if (exp->size < 0) {
1011        error_setg_errno(errp, -exp->size,
1012                         "Failed to determine the NBD export's length");
1013        goto fail;
1014    }
1015    exp->size -= exp->size % BDRV_SECTOR_SIZE;
1016
1017    exp->close = close;
1018    exp->ctx = blk_get_aio_context(blk);
1019    blk_ref(blk);
1020    blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);
1021    /*
1022     * NBD exports are used for non-shared storage migration.  Make sure
1023     * that BDRV_O_INCOMING is cleared and the image is ready for write
1024     * access since the export could be available before migration handover.
1025     */
1026    blk_invalidate_cache(blk, NULL);
1027    return exp;
1028
1029fail:
1030    g_free(exp);
1031    return NULL;
1032}
1033
1034NBDExport *nbd_export_find(const char *name)
1035{
1036    NBDExport *exp;
1037    QTAILQ_FOREACH(exp, &exports, next) {
1038        if (strcmp(name, exp->name) == 0) {
1039            return exp;
1040        }
1041    }
1042
1043    return NULL;
1044}
1045
1046void nbd_export_set_name(NBDExport *exp, const char *name)
1047{
1048    if (exp->name == name) {
1049        return;
1050    }
1051
1052    nbd_export_get(exp);
1053    if (exp->name != NULL) {
1054        g_free(exp->name);
1055        exp->name = NULL;
1056        QTAILQ_REMOVE(&exports, exp, next);
1057        nbd_export_put(exp);
1058    }
1059    if (name != NULL) {
1060        nbd_export_get(exp);
1061        exp->name = g_strdup(name);
1062        QTAILQ_INSERT_TAIL(&exports, exp, next);
1063    }
1064    nbd_export_put(exp);
1065}
1066
1067void nbd_export_close(NBDExport *exp)
1068{
1069    NBDClient *client, *next;
1070
1071    nbd_export_get(exp);
1072    QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
1073        client_close(client);
1074    }
1075    nbd_export_set_name(exp, NULL);
1076    nbd_export_put(exp);
1077    if (exp->blk) {
1078        blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
1079                                        blk_aio_detach, exp);
1080        blk_unref(exp->blk);
1081        exp->blk = NULL;
1082    }
1083}
1084
1085void nbd_export_get(NBDExport *exp)
1086{
1087    assert(exp->refcount > 0);
1088    exp->refcount++;
1089}
1090
1091void nbd_export_put(NBDExport *exp)
1092{
1093    assert(exp->refcount > 0);
1094    if (exp->refcount == 1) {
1095        nbd_export_close(exp);
1096    }
1097
1098    if (--exp->refcount == 0) {
1099        assert(exp->name == NULL);
1100
1101        if (exp->close) {
1102            exp->close(exp);
1103        }
1104
1105        g_free(exp);
1106    }
1107}
1108
1109BlockBackend *nbd_export_get_blockdev(NBDExport *exp)
1110{
1111    return exp->blk;
1112}
1113
1114void nbd_export_close_all(void)
1115{
1116    NBDExport *exp, *next;
1117
1118    QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
1119        nbd_export_close(exp);
1120    }
1121}
1122
1123static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
1124                                 int len)
1125{
1126    NBDClient *client = req->client;
1127    int csock = client->sock;
1128    ssize_t rc, ret;
1129
1130    qemu_co_mutex_lock(&client->send_lock);
1131    client->send_coroutine = qemu_coroutine_self();
1132    nbd_set_handlers(client);
1133
1134    if (!len) {
1135        rc = nbd_send_reply(csock, reply);
1136    } else {
1137        socket_set_cork(csock, 1);
1138        rc = nbd_send_reply(csock, reply);
1139        if (rc >= 0) {
1140            ret = qemu_co_send(csock, req->data, len);
1141            if (ret != len) {
1142                rc = -EIO;
1143            }
1144        }
1145        socket_set_cork(csock, 0);
1146    }
1147
1148    client->send_coroutine = NULL;
1149    nbd_set_handlers(client);
1150    qemu_co_mutex_unlock(&client->send_lock);
1151    return rc;
1152}
1153
1154static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
1155{
1156    NBDClient *client = req->client;
1157    int csock = client->sock;
1158    uint32_t command;
1159    ssize_t rc;
1160
1161    client->recv_coroutine = qemu_coroutine_self();
1162    nbd_update_can_read(client);
1163
1164    rc = nbd_receive_request(csock, request);
1165    if (rc < 0) {
1166        if (rc != -EAGAIN) {
1167            rc = -EIO;
1168        }
1169        goto out;
1170    }
1171
1172    if (request->len > NBD_MAX_BUFFER_SIZE) {
1173        LOG("len (%u) is larger than max len (%u)",
1174            request->len, NBD_MAX_BUFFER_SIZE);
1175        rc = -EINVAL;
1176        goto out;
1177    }
1178
1179    if ((request->from + request->len) < request->from) {
1180        LOG("integer overflow detected! "
1181            "you're probably being attacked");
1182        rc = -EINVAL;
1183        goto out;
1184    }
1185
1186    TRACE("Decoding type");
1187
1188    command = request->type & NBD_CMD_MASK_COMMAND;
1189    if (command == NBD_CMD_READ || command == NBD_CMD_WRITE) {
1190        req->data = blk_blockalign(client->exp->blk, request->len);
1191    }
1192    if (command == NBD_CMD_WRITE) {
1193        TRACE("Reading %u byte(s)", request->len);
1194
1195        if (qemu_co_recv(csock, req->data, request->len) != request->len) {
1196            LOG("reading from socket failed");
1197            rc = -EIO;
1198            goto out;
1199        }
1200    }
1201    rc = 0;
1202
1203out:
1204    client->recv_coroutine = NULL;
1205    nbd_update_can_read(client);
1206
1207    return rc;
1208}
1209
/*
 * Coroutine body that services a single NBD request for the client passed
 * in @opaque (an NBDClient *); nbd_read() starts it via
 * qemu_coroutine_create().
 *
 * Flow: receive the request, validate its range against the export size,
 * execute it against the export's block backend, then send a reply.
 *
 * Exit paths:
 *   - `done`: the request object is released and the client stays open.
 *   - `out`:  the request object is released and the client is closed.
 *             Taken on -EIO from the receive step, on any failure to send
 *             a reply, and on NBD_CMD_DISC.
 * Errors from the backend, and -EINVAL from the receive step, are reported
 * to the peer in reply.error and do not close the connection by themselves.
 */
static void nbd_trip(void *opaque)
{
    NBDClient *client = opaque;
    NBDExport *exp = client->exp;
    NBDRequest *req;
    struct nbd_request request;
    struct nbd_reply reply;
    ssize_t ret;
    uint32_t command;

    TRACE("Reading request.");
    /* Nothing to do for a client that is already being closed. */
    if (client->closing) {
        return;
    }

    req = nbd_request_get(client);
    ret = nbd_co_receive_request(req, &request);
    /* -EAGAIN: drop the request object but keep the connection. */
    if (ret == -EAGAIN) {
        goto done;
    }
    /* -EIO: no reply is attempted; the client is closed. */
    if (ret == -EIO) {
        goto out;
    }

    reply.handle = request.handle;
    reply.error = 0;

    /* Any other receive error is reported back to the peer. */
    if (ret < 0) {
        reply.error = -ret;
        goto error_reply;
    }
    command = request.type & NBD_CMD_MASK_COMMAND;
    /* Every command except DISC must stay within the export's size. */
    if (command != NBD_CMD_DISC && (request.from + request.len) > exp->size) {
            LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
            ", Offset: %" PRIu64 "\n",
                    request.from, request.len,
                    (uint64_t)exp->size, (uint64_t)exp->dev_offset);
        LOG("requested operation past EOF--bad client?");
        goto invalid_request;
    }

    switch (command) {
    case NBD_CMD_READ:
        TRACE("Request type is READ");

        /* With the FUA flag set, the backend is flushed before reading. */
        if (request.type & NBD_CMD_FLAG_FUA) {
            ret = blk_co_flush(exp->blk);
            if (ret < 0) {
                LOG("flush failed");
                reply.error = -ret;
                goto error_reply;
            }
        }

        /*
         * Byte offset (shifted by the export's dev_offset) and byte length
         * are converted to sector units by integer division.
         */
        ret = blk_read(exp->blk,
                       (request.from + exp->dev_offset) / BDRV_SECTOR_SIZE,
                       req->data, request.len / BDRV_SECTOR_SIZE);
        if (ret < 0) {
            LOG("reading from file failed");
            reply.error = -ret;
            goto error_reply;
        }

        TRACE("Read %u byte(s)", request.len);
        /* Reply header followed by request.len bytes from req->data. */
        if (nbd_co_send_reply(req, &reply, request.len) < 0)
            goto out;
        break;
    case NBD_CMD_WRITE:
        TRACE("Request type is WRITE");

        /* Writes to an export flagged read-only are answered with EROFS. */
        if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
            TRACE("Server is read-only, return error");
            reply.error = EROFS;
            goto error_reply;
        }

        TRACE("Writing to device");

        /* Same byte-to-sector conversion as in the READ case. */
        ret = blk_write(exp->blk,
                        (request.from + exp->dev_offset) / BDRV_SECTOR_SIZE,
                        req->data, request.len / BDRV_SECTOR_SIZE);
        if (ret < 0) {
            LOG("writing to file failed");
            reply.error = -ret;
            goto error_reply;
        }

        /* With the FUA flag set, the backend is flushed after writing. */
        if (request.type & NBD_CMD_FLAG_FUA) {
            ret = blk_co_flush(exp->blk);
            if (ret < 0) {
                LOG("flush failed");
                reply.error = -ret;
                goto error_reply;
            }
        }

        if (nbd_co_send_reply(req, &reply, 0) < 0) {
            goto out;
        }
        break;
    case NBD_CMD_DISC:
        TRACE("Request type is DISCONNECT");
        /* No reply is sent; the `out` path closes the client. */
        errno = 0;
        goto out;
    case NBD_CMD_FLUSH:
        TRACE("Request type is FLUSH");

        ret = blk_co_flush(exp->blk);
        /* A failure is recorded in the reply, which is sent either way. */
        if (ret < 0) {
            LOG("flush failed");
            reply.error = -ret;
        }
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
            goto out;
        }
        break;
    case NBD_CMD_TRIM:
        TRACE("Request type is TRIM");
        ret = blk_co_discard(exp->blk, (request.from + exp->dev_offset)
                                       / BDRV_SECTOR_SIZE,
                             request.len / BDRV_SECTOR_SIZE);
        /* A failure is recorded in the reply, which is sent either way. */
        if (ret < 0) {
            LOG("discard failed");
            reply.error = -ret;
        }
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
            goto out;
        }
        break;
    default:
        LOG("invalid request type (%u) received", request.type);
    /* Jump target for the out-of-range check above. */
    invalid_request:
        reply.error = EINVAL;
    /* Jump target for every path that already filled in reply.error. */
    error_reply:
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
            goto out;
        }
        break;
    }

    TRACE("Request/Reply complete");

    /* Normal completion: release the request, keep the client. */
done:
    nbd_request_put(req);
    return;

    /* Fatal path: release the request and close the client. */
out:
    nbd_request_put(req);
    client_close(client);
}
1360
1361static void nbd_read(void *opaque)
1362{
1363    NBDClient *client = opaque;
1364
1365    if (client->recv_coroutine) {
1366        qemu_coroutine_enter(client->recv_coroutine, NULL);
1367    } else {
1368        qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
1369    }
1370}
1371
1372static void nbd_restart_write(void *opaque)
1373{
1374    NBDClient *client = opaque;
1375
1376    qemu_coroutine_enter(client->send_coroutine, NULL);
1377}
1378
1379static void nbd_set_handlers(NBDClient *client)
1380{
1381    if (client->exp && client->exp->ctx) {
1382        aio_set_fd_handler(client->exp->ctx, client->sock,
1383                           client->can_read ? nbd_read : NULL,
1384                           client->send_coroutine ? nbd_restart_write : NULL,
1385                           client);
1386    }
1387}
1388
1389static void nbd_unset_handlers(NBDClient *client)
1390{
1391    if (client->exp && client->exp->ctx) {
1392        aio_set_fd_handler(client->exp->ctx, client->sock, NULL, NULL, NULL);
1393    }
1394}
1395
1396static void nbd_update_can_read(NBDClient *client)
1397{
1398    bool can_read = client->recv_coroutine ||
1399                    client->nb_requests < MAX_NBD_REQUESTS;
1400
1401    if (can_read != client->can_read) {
1402        client->can_read = can_read;
1403        nbd_set_handlers(client);
1404
1405        /* There is no need to invoke aio_notify(), since aio_set_fd_handler()
1406         * in nbd_set_handlers() will have taken care of that */
1407    }
1408}
1409
1410NBDClient *nbd_client_new(NBDExport *exp, int csock,
1411                          void (*close)(NBDClient *))
1412{
1413    NBDClient *client;
1414    client = g_malloc0(sizeof(NBDClient));
1415    client->refcount = 1;
1416    client->exp = exp;
1417    client->sock = csock;
1418    client->can_read = true;
1419    if (nbd_send_negotiate(client)) {
1420        g_free(client);
1421        return NULL;
1422    }
1423    client->close = close;
1424    qemu_co_mutex_init(&client->send_lock);
1425    nbd_set_handlers(client);
1426
1427    if (exp) {
1428        QTAILQ_INSERT_TAIL(&exp->clients, client, next);
1429        nbd_export_get(exp);
1430    }
1431    return client;
1432}
1433