qemu/hw/block/virtio-blk.c
/*
 * Virtio Block Device
 *
 * Copyright IBM, Corp. 2007
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "qemu/iov.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "hw/block/block.h"
#include "sysemu/block-backend.h"
#include "sysemu/blockdev.h"
#include "hw/virtio/virtio-blk.h"
#include "dataplane/virtio-blk.h"
#include "block/scsi.h"
#ifdef __linux__
# include <scsi/sg.h>
#endif
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h"

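/*
 * A virtio-blk request is laid out in the out/in iovecs of its
 * VirtQueueElement as follows (parsed in virtio_blk_handle_request()
 * below):
 *
 *   out iovecs:  struct virtio_blk_outhdr (type, ioprio, sector),
 *                followed by the write payload, if any
 *   in iovecs:   the read payload, if any, followed by a
 *                struct virtio_blk_inhdr holding the status byte
 */
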
void virtio_blk_init_request(VirtIOBlock *s, VirtIOBlockReq *req)
{
    req->dev = s;
    req->qiov.size = 0;
    req->in_len = 0;
    req->next = NULL;
    req->mr_next = NULL;
}

void virtio_blk_free_request(VirtIOBlockReq *req)
{
    if (req) {
        g_free(req);
    }
}

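/* Fill in the status byte of a completed request, push it back onto the
 * virtqueue and notify the guest, either through the dataplane notifier
 * or through the regular virtio interrupt path. */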
static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status)
{
    VirtIOBlock *s = req->dev;
    VirtIODevice *vdev = VIRTIO_DEVICE(s);

    trace_virtio_blk_req_complete(req, status);

    stb_p(&req->in->status, status);
    virtqueue_push(s->vq, &req->elem, req->in_len);
    if (s->dataplane_started && !s->dataplane_disabled) {
        virtio_blk_data_plane_notify(s->dataplane);
    } else {
        virtio_notify(vdev, s->vq);
    }
}

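/* Apply the configured rerror/werror policy to a failed request.  Returns
 * nonzero if the request was consumed here (parked on s->rq for a later
 * retry, or completed with VIRTIO_BLK_S_IOERR), zero if the error is to
 * be ignored and the caller should complete the request normally. */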
static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
    bool is_read)
{
    BlockErrorAction action = blk_get_error_action(req->dev->blk,
                                                   is_read, error);
    VirtIOBlock *s = req->dev;

    if (action == BLOCK_ERROR_ACTION_STOP) {
        /* Break the link as the next request is going to be parsed from the
         * ring again. Otherwise we may end up doing a double completion! */
        req->mr_next = NULL;
        req->next = s->rq;
        s->rq = req;
    } else if (action == BLOCK_ERROR_ACTION_REPORT) {
        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
        block_acct_failed(blk_get_stats(s->blk), &req->acct);
        virtio_blk_free_request(req);
    }

    blk_error_action(s->blk, action, is_read, error);
    return action != BLOCK_ERROR_ACTION_IGNORE;
}

static void virtio_blk_rw_complete(void *opaque, int ret)
{
    VirtIOBlockReq *next = opaque;

    while (next) {
        VirtIOBlockReq *req = next;
        next = req->mr_next;
        trace_virtio_blk_rw_complete(req, ret);

        if (req->qiov.nalloc != -1) {
            /* If nalloc is != -1 req->qiov is a local copy of the original
             * external iovec. It was allocated in submit_requests to be
             * able to merge requests. */
            qemu_iovec_destroy(&req->qiov);
        }

        if (ret) {
            int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
            bool is_read = !(p & VIRTIO_BLK_T_OUT);
            /* Note that memory may be dirtied on read failure.  If the
             * virtio request is not completed here, as is the case for
             * BLOCK_ERROR_ACTION_STOP, the memory may not be copied
             * correctly during live migration.  While this is ugly,
             * it is acceptable because the device is free to write to
             * the memory until the request is completed (which will
             * happen on the other side of the migration).
             */
            if (virtio_blk_handle_rw_error(req, -ret, is_read)) {
                continue;
            }
        }

        virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
        block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
        virtio_blk_free_request(req);
    }
}

static void virtio_blk_flush_complete(void *opaque, int ret)
{
    VirtIOBlockReq *req = opaque;

    if (ret) {
        if (virtio_blk_handle_rw_error(req, -ret, 0)) {
            return;
        }
    }

    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
    block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
    virtio_blk_free_request(req);
}

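/* SCSI command passthrough (the scsi=on property) is implemented via the
 * Linux SG_IO ioctl and is therefore only available on Linux hosts. */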
#ifdef __linux__

typedef struct {
    VirtIOBlockReq *req;
    struct sg_io_hdr hdr;
} VirtIOBlockIoctlReq;

static void virtio_blk_ioctl_complete(void *opaque, int status)
{
    VirtIOBlockIoctlReq *ioctl_req = opaque;
    VirtIOBlockReq *req = ioctl_req->req;
    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
    struct virtio_scsi_inhdr *scsi;
    struct sg_io_hdr *hdr;

    scsi = (void *)req->elem.in_sg[req->elem.in_num - 2].iov_base;

    if (status) {
        status = VIRTIO_BLK_S_UNSUPP;
        virtio_stl_p(vdev, &scsi->errors, 255);
        goto out;
    }

    hdr = &ioctl_req->hdr;
    /*
     * From SCSI-Generic-HOWTO: "Some lower level drivers (e.g. ide-scsi)
     * clear the masked_status field [hence status gets cleared too, see
     * block/scsi_ioctl.c] even when a CHECK_CONDITION or COMMAND_TERMINATED
     * status has occurred.  However they do set DRIVER_SENSE in driver_status
     * field. Also a (sb_len_wr > 0) indicates there is a sense buffer.
     */
    if (hdr->status == 0 && hdr->sb_len_wr > 0) {
        hdr->status = CHECK_CONDITION;
    }

    virtio_stl_p(vdev, &scsi->errors,
                 hdr->status | (hdr->msg_status << 8) |
                 (hdr->host_status << 16) | (hdr->driver_status << 24));
    virtio_stl_p(vdev, &scsi->residual, hdr->resid);
    virtio_stl_p(vdev, &scsi->sense_len, hdr->sb_len_wr);
    virtio_stl_p(vdev, &scsi->data_len, hdr->dxfer_len);

out:
    virtio_blk_req_complete(req, status);
    virtio_blk_free_request(req);
    g_free(ioctl_req);
}

#endif

static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s)
{
    VirtIOBlockReq *req = virtqueue_pop(s->vq, sizeof(VirtIOBlockReq));

    if (req) {
        virtio_blk_init_request(s, req);
    }
    return req;
}

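/*
 * Expected layout of a SCSI passthrough request (see the checks below):
 *
 *   out_sg[0]                virtio_blk_outhdr
 *   out_sg[1]                SCSI command block (CDB)
 *   out_sg[2..out_num-1]     write payload, if any
 *   in_sg[0..in_num-4]       read payload, if any
 *   in_sg[in_num-3]          sense buffer
 *   in_sg[in_num-2]          virtio_scsi_inhdr
 *   in_sg[in_num-1]          virtio_blk_inhdr
 */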
static int virtio_blk_handle_scsi_req(VirtIOBlockReq *req)
{
    int status = VIRTIO_BLK_S_OK;
    struct virtio_scsi_inhdr *scsi = NULL;
    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
    VirtQueueElement *elem = &req->elem;
    VirtIOBlock *blk = req->dev;

#ifdef __linux__
    int i;
    VirtIOBlockIoctlReq *ioctl_req;
    BlockAIOCB *acb;
#endif

    /*
     * We require at least one output segment each for the virtio_blk_outhdr
     * and the SCSI command block.
     *
     * We also at least require the virtio_blk_inhdr, the virtio_scsi_inhdr
     * and the sense buffer pointer in the input segments.
     */
    if (elem->out_num < 2 || elem->in_num < 3) {
        status = VIRTIO_BLK_S_IOERR;
        goto fail;
    }

    /*
     * The scsi inhdr is placed in the second-to-last input segment, just
     * before the regular inhdr.
     */
    scsi = (void *)elem->in_sg[elem->in_num - 2].iov_base;

    if (!blk->conf.scsi) {
        status = VIRTIO_BLK_S_UNSUPP;
        goto fail;
    }

    /*
     * No support for bidirectional commands yet.
     */
    if (elem->out_num > 2 && elem->in_num > 3) {
        status = VIRTIO_BLK_S_UNSUPP;
        goto fail;
    }

#ifdef __linux__
    ioctl_req = g_new0(VirtIOBlockIoctlReq, 1);
    ioctl_req->req = req;
    ioctl_req->hdr.interface_id = 'S';
    ioctl_req->hdr.cmd_len = elem->out_sg[1].iov_len;
    ioctl_req->hdr.cmdp = elem->out_sg[1].iov_base;
    ioctl_req->hdr.dxfer_len = 0;

    if (elem->out_num > 2) {
        /*
         * If there are more than the minimally required 2 output segments
         * there is write payload starting from the third iovec.
         */
        ioctl_req->hdr.dxfer_direction = SG_DXFER_TO_DEV;
        ioctl_req->hdr.iovec_count = elem->out_num - 2;

        for (i = 0; i < ioctl_req->hdr.iovec_count; i++) {
            ioctl_req->hdr.dxfer_len += elem->out_sg[i + 2].iov_len;
        }

        ioctl_req->hdr.dxferp = elem->out_sg + 2;

    } else if (elem->in_num > 3) {
        /*
         * If we have more than 3 input segments the guest wants to actually
         * read data.
         */
        ioctl_req->hdr.dxfer_direction = SG_DXFER_FROM_DEV;
        ioctl_req->hdr.iovec_count = elem->in_num - 3;
        for (i = 0; i < ioctl_req->hdr.iovec_count; i++) {
            ioctl_req->hdr.dxfer_len += elem->in_sg[i].iov_len;
        }

        ioctl_req->hdr.dxferp = elem->in_sg;
    } else {
        /*
         * Some SCSI commands don't actually transfer any data.
         */
        ioctl_req->hdr.dxfer_direction = SG_DXFER_NONE;
    }

    ioctl_req->hdr.sbp = elem->in_sg[elem->in_num - 3].iov_base;
    ioctl_req->hdr.mx_sb_len = elem->in_sg[elem->in_num - 3].iov_len;

    acb = blk_aio_ioctl(blk->blk, SG_IO, &ioctl_req->hdr,
                        virtio_blk_ioctl_complete, ioctl_req);
    if (!acb) {
        g_free(ioctl_req);
        status = VIRTIO_BLK_S_UNSUPP;
        goto fail;
    }
    return -EINPROGRESS;
#else
    abort();
#endif

fail:
    /* Just put anything nonzero so that the ioctl fails in the guest.  */
    if (scsi) {
        virtio_stl_p(vdev, &scsi->errors, 255);
    }
    return status;
}

static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
{
    int status;

    status = virtio_blk_handle_scsi_req(req);
    if (status != -EINPROGRESS) {
        virtio_blk_req_complete(req, status);
        virtio_blk_free_request(req);
    }
}

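/* Issue a single merged read or write covering mrb->reqs[start] through
 * mrb->reqs[start + num_reqs - 1].  For num_reqs > 1 a new QEMUIOVector is
 * built from the per-request iovecs and the requests are chained through
 * mr_next so that virtio_blk_rw_complete() can complete each one in turn. */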
static inline void submit_requests(BlockBackend *blk, MultiReqBuffer *mrb,
                                   int start, int num_reqs, int niov)
{
    QEMUIOVector *qiov = &mrb->reqs[start]->qiov;
    int64_t sector_num = mrb->reqs[start]->sector_num;
    int nb_sectors = mrb->reqs[start]->qiov.size / BDRV_SECTOR_SIZE;
    bool is_write = mrb->is_write;

    if (num_reqs > 1) {
        int i;
        struct iovec *tmp_iov = qiov->iov;
        int tmp_niov = qiov->niov;

        /* mrb->reqs[start]->qiov was initialized from an external iovec, so
         * we can't modify it here. We need to initialize it locally and
         * then add the external iovecs. */
        qemu_iovec_init(qiov, niov);

        for (i = 0; i < tmp_niov; i++) {
            qemu_iovec_add(qiov, tmp_iov[i].iov_base, tmp_iov[i].iov_len);
        }

        for (i = start + 1; i < start + num_reqs; i++) {
            qemu_iovec_concat(qiov, &mrb->reqs[i]->qiov, 0,
                              mrb->reqs[i]->qiov.size);
            mrb->reqs[i - 1]->mr_next = mrb->reqs[i];
            nb_sectors += mrb->reqs[i]->qiov.size / BDRV_SECTOR_SIZE;
        }
        assert(nb_sectors == qiov->size / BDRV_SECTOR_SIZE);

        trace_virtio_blk_submit_multireq(mrb, start, num_reqs, sector_num,
                                         nb_sectors, is_write);
        block_acct_merge_done(blk_get_stats(blk),
                              is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
                              num_reqs - 1);
    }

    if (is_write) {
        blk_aio_writev(blk, sector_num, qiov, nb_sectors,
                       virtio_blk_rw_complete, mrb->reqs[start]);
    } else {
        blk_aio_readv(blk, sector_num, qiov, nb_sectors,
                      virtio_blk_rw_complete, mrb->reqs[start]);
    }
}

static int multireq_compare(const void *a, const void *b)
{
    const VirtIOBlockReq *req1 = *(VirtIOBlockReq **)a,
                         *req2 = *(VirtIOBlockReq **)b;

    /*
     * Note that we can't simply subtract sector_num1 from sector_num2
     * here as that could overflow the return value.
     */
    if (req1->sector_num > req2->sector_num) {
        return 1;
    } else if (req1->sector_num < req2->sector_num) {
        return -1;
    } else {
        return 0;
    }
}

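/* Sort the pending requests by sector number and submit them in maximal
 * merged batches.  A batch is flushed whenever the next request is not
 * sequential or merging it would exceed the backend's iovec count or
 * maximum transfer length. */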
void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb)
{
    int i = 0, start = 0, num_reqs = 0, niov = 0, nb_sectors = 0;
    int max_xfer_len = 0;
    int64_t sector_num = 0;

    if (mrb->num_reqs == 1) {
        submit_requests(blk, mrb, 0, 1, -1);
        mrb->num_reqs = 0;
        return;
    }

    max_xfer_len = blk_get_max_transfer_length(mrb->reqs[0]->dev->blk);
    max_xfer_len = MIN_NON_ZERO(max_xfer_len, BDRV_REQUEST_MAX_SECTORS);

    qsort(mrb->reqs, mrb->num_reqs, sizeof(*mrb->reqs),
          &multireq_compare);

    for (i = 0; i < mrb->num_reqs; i++) {
        VirtIOBlockReq *req = mrb->reqs[i];
        if (num_reqs > 0) {
            /*
             * NOTE: We cannot merge the requests in the following situations:
             * 1. requests are not sequential
             * 2. merge would exceed maximum number of IOVs
             * 3. merge would exceed maximum transfer length of backend device
             */
            if (sector_num + nb_sectors != req->sector_num ||
                niov > blk_get_max_iov(blk) - req->qiov.niov ||
                req->qiov.size / BDRV_SECTOR_SIZE > max_xfer_len ||
                nb_sectors > max_xfer_len - req->qiov.size / BDRV_SECTOR_SIZE) {
                submit_requests(blk, mrb, start, num_reqs, niov);
                num_reqs = 0;
            }
        }

        if (num_reqs == 0) {
            sector_num = req->sector_num;
            nb_sectors = niov = 0;
            start = i;
        }

        nb_sectors += req->qiov.size / BDRV_SECTOR_SIZE;
        niov += req->qiov.niov;
        num_reqs++;
    }

    submit_requests(blk, mrb, start, num_reqs, niov);
    mrb->num_reqs = 0;
}

static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
{
    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, 0,
                     BLOCK_ACCT_FLUSH);

    /*
     * Make sure all outstanding writes are posted to the backing device.
     */
    if (mrb->is_write && mrb->num_reqs > 0) {
        virtio_blk_submit_multireq(req->dev->blk, mrb);
    }
    blk_aio_flush(req->dev->blk, virtio_blk_flush_complete, req);
}

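/* Sanity-check a guest-supplied sector range: the request must be aligned,
 * a multiple of the logical block size, no larger than the block layer can
 * handle, and must not run past the end of the device. */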
static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
                                     uint64_t sector, size_t size)
{
    uint64_t nb_sectors = size >> BDRV_SECTOR_BITS;
    uint64_t total_sectors;

    if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return false;
    }
    if (sector & dev->sector_mask) {
        return false;
    }
    if (size % dev->conf.conf.logical_block_size) {
        return false;
    }
    blk_get_geometry(dev->blk, &total_sectors);
    if (sector > total_sectors || nb_sectors > total_sectors - sector) {
        return false;
    }
    return true;
}

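/* Parse and dispatch one request: copy the outhdr out of the out iovecs,
 * locate the status byte at the very end of the in iovecs, then act on the
 * request type.  Reads and writes are queued in *mrb so that consecutive
 * requests can be merged; other request types are dispatched directly. */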
void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
{
    uint32_t type;
    struct iovec *in_iov = req->elem.in_sg;
    struct iovec *iov = req->elem.out_sg;
    unsigned in_num = req->elem.in_num;
    unsigned out_num = req->elem.out_num;

    if (req->elem.out_num < 1 || req->elem.in_num < 1) {
        error_report("virtio-blk missing headers");
        exit(1);
    }

    if (unlikely(iov_to_buf(iov, out_num, 0, &req->out,
                            sizeof(req->out)) != sizeof(req->out))) {
        error_report("virtio-blk request outhdr too short");
        exit(1);
    }

    iov_discard_front(&iov, &out_num, sizeof(req->out));

    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
        error_report("virtio-blk request inhdr too short");
        exit(1);
    }

    /* We always touch the last byte, so just see how big in_iov is.  */
    req->in_len = iov_size(in_iov, in_num);
    req->in = (void *)in_iov[in_num - 1].iov_base
              + in_iov[in_num - 1].iov_len
              - sizeof(struct virtio_blk_inhdr);
    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));

    type = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);

    /* VIRTIO_BLK_T_OUT defines the command direction. VIRTIO_BLK_T_BARRIER
     * is an optional flag. Although a guest should not send this flag if
     * it has not been negotiated, we ignored it in the past, so keep
     * ignoring it. */
    switch (type & ~(VIRTIO_BLK_T_OUT | VIRTIO_BLK_T_BARRIER)) {
    case VIRTIO_BLK_T_IN:
    {
        bool is_write = type & VIRTIO_BLK_T_OUT;
        req->sector_num = virtio_ldq_p(VIRTIO_DEVICE(req->dev),
                                       &req->out.sector);

        if (is_write) {
            qemu_iovec_init_external(&req->qiov, iov, out_num);
            trace_virtio_blk_handle_write(req, req->sector_num,
                                          req->qiov.size / BDRV_SECTOR_SIZE);
        } else {
            qemu_iovec_init_external(&req->qiov, in_iov, in_num);
            trace_virtio_blk_handle_read(req, req->sector_num,
                                         req->qiov.size / BDRV_SECTOR_SIZE);
        }

        if (!virtio_blk_sect_range_ok(req->dev, req->sector_num,
                                      req->qiov.size)) {
            virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
            block_acct_invalid(blk_get_stats(req->dev->blk),
                               is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
            virtio_blk_free_request(req);
            return;
        }

        block_acct_start(blk_get_stats(req->dev->blk),
                         &req->acct, req->qiov.size,
                         is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);

        /* Submit the batch if merging this request would exceed the maximum
         * number of requests, the IO direction changes, or merging is
         * disabled. */
        if (mrb->num_reqs > 0 && (mrb->num_reqs == VIRTIO_BLK_MAX_MERGE_REQS ||
                                  is_write != mrb->is_write ||
                                  !req->dev->conf.request_merging)) {
            virtio_blk_submit_multireq(req->dev->blk, mrb);
        }

        assert(mrb->num_reqs < VIRTIO_BLK_MAX_MERGE_REQS);
        mrb->reqs[mrb->num_reqs++] = req;
        mrb->is_write = is_write;
        break;
    }
    case VIRTIO_BLK_T_FLUSH:
        virtio_blk_handle_flush(req, mrb);
        break;
    case VIRTIO_BLK_T_SCSI_CMD:
        virtio_blk_handle_scsi(req);
        break;
    case VIRTIO_BLK_T_GET_ID:
    {
        VirtIOBlock *s = req->dev;

        /*
         * NB: per existing s/n string convention the string is
         * terminated by '\0' only when shorter than buffer.
         */
        const char *serial = s->conf.serial ? s->conf.serial : "";
        size_t size = MIN(strlen(serial) + 1,
                          MIN(iov_size(in_iov, in_num),
                              VIRTIO_BLK_ID_BYTES));
        iov_from_buf(in_iov, in_num, 0, serial, size);
        virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
        virtio_blk_free_request(req);
        break;
    }
    default:
        virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
        virtio_blk_free_request(req);
    }
}

void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
{
    VirtIOBlockReq *req;
    MultiReqBuffer mrb = {};

    blk_io_plug(s->blk);

    while ((req = virtio_blk_get_request(s))) {
        virtio_blk_handle_request(req, &mrb);
    }

    if (mrb.num_reqs) {
        virtio_blk_submit_multireq(s->blk, &mrb);
    }

    blk_io_unplug(s->blk);
}

static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtIOBlock *s = (VirtIOBlock *)vdev;

    if (s->dataplane) {
        /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
         * dataplane here instead of waiting for .set_status().
         */
        virtio_blk_data_plane_start(s->dataplane);
        if (!s->dataplane_disabled) {
            return;
        }
    }
    virtio_blk_handle_vq(s, vq);
}

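/* Bottom half scheduled on VM resume: replay the requests that were parked
 * on s->rq by the rerror/werror=stop error policy. */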
static void virtio_blk_dma_restart_bh(void *opaque)
{
    VirtIOBlock *s = opaque;
    VirtIOBlockReq *req = s->rq;
    MultiReqBuffer mrb = {};

    qemu_bh_delete(s->bh);
    s->bh = NULL;

    s->rq = NULL;

    while (req) {
        VirtIOBlockReq *next = req->next;
        virtio_blk_handle_request(req, &mrb);
        req = next;
    }

    if (mrb.num_reqs) {
        virtio_blk_submit_multireq(s->blk, &mrb);
    }
}

static void virtio_blk_dma_restart_cb(void *opaque, int running,
                                      RunState state)
{
    VirtIOBlock *s = opaque;

    if (!running) {
        return;
    }

    if (!s->bh) {
        s->bh = aio_bh_new(blk_get_aio_context(s->conf.conf.blk),
                           virtio_blk_dma_restart_bh, s);
        qemu_bh_schedule(s->bh);
    }
}

static void virtio_blk_reset(VirtIODevice *vdev)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);
    AioContext *ctx;

    /*
     * This should cancel pending requests, but can't do nicely until there
     * are per-device request lists.
     */
    ctx = blk_get_aio_context(s->blk);
    aio_context_acquire(ctx);
    blk_drain(s->blk);

    if (s->dataplane) {
        virtio_blk_data_plane_stop(s->dataplane);
    }
    aio_context_release(ctx);

    blk_set_enable_write_cache(s->blk, s->original_wce);
}

/* coalesce internal state, copy to pci i/o region 0
 */
static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);
    BlockConf *conf = &s->conf.conf;
    struct virtio_blk_config blkcfg;
    uint64_t capacity;
    int blk_size = conf->logical_block_size;

    blk_get_geometry(s->blk, &capacity);
    memset(&blkcfg, 0, sizeof(blkcfg));
    virtio_stq_p(vdev, &blkcfg.capacity, capacity);
    virtio_stl_p(vdev, &blkcfg.seg_max, 128 - 2);
    virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
    virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
    virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size);
    virtio_stw_p(vdev, &blkcfg.opt_io_size, conf->opt_io_size / blk_size);
    blkcfg.geometry.heads = conf->heads;
    /*
     * We must ensure that the block device capacity is a multiple of
     * the logical block size. If that is not the case, let's use
     * sector_mask to adapt the geometry so that we have a correct picture.
     * For those devices where the capacity is ok for the given geometry
     * we don't touch the sector value of the geometry, since some devices
     * (like s390 dasd) need a specific value. Here the capacity is already
     * cyls*heads*secs*blk_size and the sector value is not block size
     * divided by 512 - instead it is the amount of blk_size blocks
     * per track (cylinder).
     */
    if (blk_getlength(s->blk) / conf->heads / conf->secs % blk_size) {
        blkcfg.geometry.sectors = conf->secs & ~s->sector_mask;
    } else {
        blkcfg.geometry.sectors = conf->secs;
    }
    blkcfg.size_max = 0;
    blkcfg.physical_block_exp = get_physical_block_exp(conf);
    blkcfg.alignment_offset = 0;
    blkcfg.wce = blk_enable_write_cache(s->blk);
    memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
}

static void virtio_blk_set_config(VirtIODevice *vdev, const uint8_t *config)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);
    struct virtio_blk_config blkcfg;

    memcpy(&blkcfg, config, sizeof(blkcfg));

    aio_context_acquire(blk_get_aio_context(s->blk));
    blk_set_enable_write_cache(s->blk, blkcfg.wce != 0);
    aio_context_release(blk_get_aio_context(s->blk));
}

static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
                                        Error **errp)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);

    virtio_add_feature(&features, VIRTIO_BLK_F_SEG_MAX);
    virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
    virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
    virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
    if (virtio_has_feature(features, VIRTIO_F_VERSION_1)) {
        if (s->conf.scsi) {
            error_setg(errp, "Please set scsi=off for virtio-blk devices in order to use virtio 1.0");
            return 0;
        }
    } else {
        virtio_clear_feature(&features, VIRTIO_F_ANY_LAYOUT);
        virtio_add_feature(&features, VIRTIO_BLK_F_SCSI);
    }

    if (s->conf.config_wce) {
        virtio_add_feature(&features, VIRTIO_BLK_F_CONFIG_WCE);
    }
    if (blk_enable_write_cache(s->blk)) {
        virtio_add_feature(&features, VIRTIO_BLK_F_WCE);
    }
    if (blk_is_read_only(s->blk)) {
        virtio_add_feature(&features, VIRTIO_BLK_F_RO);
    }

    return features;
}

static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);

    if (s->dataplane && !(status & (VIRTIO_CONFIG_S_DRIVER |
                                    VIRTIO_CONFIG_S_DRIVER_OK))) {
        virtio_blk_data_plane_stop(s->dataplane);
    }

    if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
        return;
    }

    /* A guest that supports VIRTIO_BLK_F_CONFIG_WCE must be able to send
     * cache flushes.  Thus, the "auto writethrough" behavior is never
     * necessary for guests that support the VIRTIO_BLK_F_CONFIG_WCE feature.
     * Leaving it enabled would break the following sequence:
     *
     *     Guest started with "-drive cache=writethrough"
     *     Guest sets status to 0
     *     Guest sets DRIVER bit in status field
     *     Guest reads host features (WCE=0, CONFIG_WCE=1)
     *     Guest writes guest features (WCE=0, CONFIG_WCE=1)
     *     Guest writes 1 to the WCE configuration field (writeback mode)
     *     Guest sets DRIVER_OK bit in status field
     *
     * s->blk would erroneously be placed in writethrough mode.
     */
    if (!virtio_vdev_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE)) {
        aio_context_acquire(blk_get_aio_context(s->blk));
        blk_set_enable_write_cache(s->blk,
                                   virtio_vdev_has_feature(vdev,
                                                           VIRTIO_BLK_F_WCE));
        aio_context_release(blk_get_aio_context(s->blk));
    }
}

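/* Legacy savevm entry points: virtio_blk_save/load run the common virtio
 * save/load code, which in turn calls the _device variants below for the
 * device-specific state (the requests parked on s->rq). */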
static void virtio_blk_save(QEMUFile *f, void *opaque)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(opaque);
    VirtIOBlock *s = VIRTIO_BLK(vdev);

    if (s->dataplane) {
        virtio_blk_data_plane_stop(s->dataplane);
    }

    virtio_save(vdev, f);
}

static void virtio_blk_save_device(VirtIODevice *vdev, QEMUFile *f)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);
    VirtIOBlockReq *req = s->rq;

    while (req) {
        qemu_put_sbyte(f, 1);
        qemu_put_virtqueue_element(f, &req->elem);
        req = req->next;
    }
    qemu_put_sbyte(f, 0);
}

static int virtio_blk_load(QEMUFile *f, void *opaque, int version_id)
{
    VirtIOBlock *s = opaque;
    VirtIODevice *vdev = VIRTIO_DEVICE(s);

    if (version_id != 2) {
        return -EINVAL;
    }

    return virtio_load(vdev, f, version_id);
}

static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f,
                                  int version_id)
{
    VirtIOBlock *s = VIRTIO_BLK(vdev);

    while (qemu_get_sbyte(f)) {
        VirtIOBlockReq *req;
        req = qemu_get_virtqueue_element(f, sizeof(VirtIOBlockReq));
        virtio_blk_init_request(s, req);
        req->next = s->rq;
        s->rq = req;
    }

    return 0;
}

static void virtio_blk_resize(void *opaque)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(opaque);

    virtio_notify_config(vdev);
}

static const BlockDevOps virtio_block_ops = {
    .resize_cb = virtio_blk_resize,
};

static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VirtIOBlock *s = VIRTIO_BLK(dev);
    VirtIOBlkConf *conf = &s->conf;
    Error *err = NULL;
    static int virtio_blk_id;

    if (!conf->conf.blk) {
        error_setg(errp, "drive property not set");
        return;
    }
    if (!blk_is_inserted(conf->conf.blk)) {
        error_setg(errp, "Device needs media, but drive is empty");
        return;
    }

    blkconf_serial(&conf->conf, &conf->serial);
    s->original_wce = blk_enable_write_cache(conf->conf.blk);
    blkconf_geometry(&conf->conf, NULL, 65535, 255, 255, &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }
    blkconf_blocksizes(&conf->conf);

    virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
                sizeof(struct virtio_blk_config));

    s->blk = conf->conf.blk;
    s->rq = NULL;
    s->sector_mask = (s->conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;

    s->vq = virtio_add_queue(vdev, 128, virtio_blk_handle_output);
    virtio_blk_data_plane_create(vdev, conf, &s->dataplane, &err);
    if (err != NULL) {
        error_propagate(errp, err);
        virtio_cleanup(vdev);
        return;
    }

    s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
    register_savevm(dev, "virtio-blk", virtio_blk_id++, 2,
                    virtio_blk_save, virtio_blk_load, s);
    blk_set_dev_ops(s->blk, &virtio_block_ops, s);
    blk_set_guest_block_size(s->blk, s->conf.conf.logical_block_size);

    blk_iostatus_enable(s->blk);
}

static void virtio_blk_device_unrealize(DeviceState *dev, Error **errp)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VirtIOBlock *s = VIRTIO_BLK(dev);

    virtio_blk_data_plane_destroy(s->dataplane);
    s->dataplane = NULL;
    qemu_del_vm_change_state_handler(s->change);
    unregister_savevm(dev, "virtio-blk", s);
    blockdev_mark_auto_del(s->blk);
    virtio_cleanup(vdev);
}

static void virtio_blk_instance_init(Object *obj)
{
    VirtIOBlock *s = VIRTIO_BLK(obj);

    object_property_add_link(obj, "iothread", TYPE_IOTHREAD,
                             (Object **)&s->conf.iothread,
                             qdev_prop_allow_set_link_before_realize,
                             OBJ_PROP_LINK_UNREF_ON_RELEASE, NULL);
    device_add_bootindex_property(obj, &s->conf.conf.bootindex,
                                  "bootindex", "/disk@0,0",
                                  DEVICE(obj), NULL);
}

static Property virtio_blk_properties[] = {
    DEFINE_BLOCK_PROPERTIES(VirtIOBlock, conf.conf),
    DEFINE_BLOCK_CHS_PROPERTIES(VirtIOBlock, conf.conf),
    DEFINE_PROP_STRING("serial", VirtIOBlock, conf.serial),
    DEFINE_PROP_BIT("config-wce", VirtIOBlock, conf.config_wce, 0, true),
#ifdef __linux__
    DEFINE_PROP_BIT("scsi", VirtIOBlock, conf.scsi, 0, false),
#endif
    DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
                    true),
    DEFINE_PROP_END_OF_LIST(),
};

static void virtio_blk_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);

    dc->props = virtio_blk_properties;
    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
    vdc->realize = virtio_blk_device_realize;
    vdc->unrealize = virtio_blk_device_unrealize;
    vdc->get_config = virtio_blk_update_config;
    vdc->set_config = virtio_blk_set_config;
    vdc->get_features = virtio_blk_get_features;
    vdc->set_status = virtio_blk_set_status;
    vdc->reset = virtio_blk_reset;
    vdc->save = virtio_blk_save_device;
    vdc->load = virtio_blk_load_device;
}

static const TypeInfo virtio_device_info = {
    .name = TYPE_VIRTIO_BLK,
    .parent = TYPE_VIRTIO_DEVICE,
    .instance_size = sizeof(VirtIOBlock),
    .instance_init = virtio_blk_instance_init,
    .class_init = virtio_blk_class_init,
};

static void virtio_register_types(void)
{
    type_register_static(&virtio_device_info);
}

type_init(virtio_register_types)