qemu/block/qed.c
   1/*
   2 * QEMU Enhanced Disk Format
   3 *
   4 * Copyright IBM, Corp. 2010
   5 *
   6 * Authors:
   7 *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
   8 *  Anthony Liguori   <aliguori@us.ibm.com>
   9 *
  10 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
  11 * See the COPYING.LIB file in the top-level directory.
  12 *
  13 */
  14
  15#include "qemu/osdep.h"
  16#include "qapi/error.h"
  17#include "qemu/timer.h"
  18#include "trace.h"
  19#include "qed.h"
  20#include "qapi/qmp/qerror.h"
  21#include "migration/migration.h"
  22#include "sysemu/block-backend.h"
  23
  24static const AIOCBInfo qed_aiocb_info = {
  25    .aiocb_size         = sizeof(QEDAIOCB),
  26};
  27
  28static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
  29                          const char *filename)
  30{
  31    const QEDHeader *header = (const QEDHeader *)buf;
  32
  33    if (buf_size < sizeof(*header)) {
  34        return 0;
  35    }
  36    if (le32_to_cpu(header->magic) != QED_MAGIC) {
  37        return 0;
  38    }
  39    return 100;
  40}
  41
  42/**
  43 * Check whether an image format is raw
  44 *
  45 * @fmt:    Backing file format, may be NULL
  46 */
  47static bool qed_fmt_is_raw(const char *fmt)
  48{
  49    return fmt && strcmp(fmt, "raw") == 0;
  50}
  51
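     /**
      * Convert an on-disk little-endian header into CPU byte order
      */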
  52static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
  53{
  54    cpu->magic = le32_to_cpu(le->magic);
  55    cpu->cluster_size = le32_to_cpu(le->cluster_size);
  56    cpu->table_size = le32_to_cpu(le->table_size);
  57    cpu->header_size = le32_to_cpu(le->header_size);
  58    cpu->features = le64_to_cpu(le->features);
  59    cpu->compat_features = le64_to_cpu(le->compat_features);
  60    cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
  61    cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
  62    cpu->image_size = le64_to_cpu(le->image_size);
  63    cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
  64    cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
  65}
  66
  67static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
  68{
  69    le->magic = cpu_to_le32(cpu->magic);
  70    le->cluster_size = cpu_to_le32(cpu->cluster_size);
  71    le->table_size = cpu_to_le32(cpu->table_size);
  72    le->header_size = cpu_to_le32(cpu->header_size);
  73    le->features = cpu_to_le64(cpu->features);
  74    le->compat_features = cpu_to_le64(cpu->compat_features);
  75    le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
  76    le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
  77    le->image_size = cpu_to_le64(cpu->image_size);
  78    le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
  79    le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
  80}
  81
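     /**
      * Write the header to the image file synchronously
      *
      * Returns 0 on success; on failure the bdrv_pwrite() return value is
      * passed through.
      */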
  82int qed_write_header_sync(BDRVQEDState *s)
  83{
  84    QEDHeader le;
  85    int ret;
  86
  87    qed_header_cpu_to_le(&s->header, &le);
  88    ret = bdrv_pwrite(s->bs->file->bs, 0, &le, sizeof(le));
  89    if (ret != sizeof(le)) {
  90        return ret;
  91    }
  92    return 0;
  93}
  94
  95typedef struct {
  96    GenericCB gencb;
  97    BDRVQEDState *s;
  98    struct iovec iov;
  99    QEMUIOVector qiov;
 100    int nsectors;
 101    uint8_t *buf;
 102} QEDWriteHeaderCB;
 103
 104static void qed_write_header_cb(void *opaque, int ret)
 105{
 106    QEDWriteHeaderCB *write_header_cb = opaque;
 107
 108    qemu_vfree(write_header_cb->buf);
 109    gencb_complete(write_header_cb, ret);
 110}
 111
 112static void qed_write_header_read_cb(void *opaque, int ret)
 113{
 114    QEDWriteHeaderCB *write_header_cb = opaque;
 115    BDRVQEDState *s = write_header_cb->s;
 116
 117    if (ret) {
 118        qed_write_header_cb(write_header_cb, ret);
 119        return;
 120    }
 121
 122    /* Update header */
 123    qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf);
 124
 125    bdrv_aio_writev(s->bs->file->bs, 0, &write_header_cb->qiov,
 126                    write_header_cb->nsectors, qed_write_header_cb,
 127                    write_header_cb);
 128}
 129
 130/**
 131 * Update header in-place (does not rewrite backing filename or other strings)
 132 *
 133 * This function only updates known header fields in-place and does not affect
 134 * extra data after the QED header.
 135 */
 136static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb,
 137                             void *opaque)
 138{
 139    /* We must write full sectors for O_DIRECT but cannot necessarily generate
 140     * the data following the header if an unrecognized compat feature is
 141     * active.  Therefore, first read the sectors containing the header, update
 142     * them, and write back.
 143     */
 144
 145    int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) /
 146                   BDRV_SECTOR_SIZE;
 147    size_t len = nsectors * BDRV_SECTOR_SIZE;
 148    QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb),
 149                                                    cb, opaque);
 150
 151    write_header_cb->s = s;
 152    write_header_cb->nsectors = nsectors;
 153    write_header_cb->buf = qemu_blockalign(s->bs, len);
 154    write_header_cb->iov.iov_base = write_header_cb->buf;
 155    write_header_cb->iov.iov_len = len;
 156    qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1);
 157
 158    bdrv_aio_readv(s->bs->file->bs, 0, &write_header_cb->qiov, nsectors,
 159                   qed_write_header_read_cb, write_header_cb);
 160}
 161
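     /**
      * Compute the maximum image size for a cluster size/table size pair
      *
      * Each L1 table entry maps one L2 table and each L2 table entry maps one
      * cluster, so the limit is table_entries * table_entries * cluster_size.
      */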
 162static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
 163{
 164    uint64_t table_entries;
 165    uint64_t l2_size;
 166
 167    table_entries = (table_size * cluster_size) / sizeof(uint64_t);
 168    l2_size = table_entries * cluster_size;
 169
 170    return l2_size * table_entries;
 171}
 172
 173static bool qed_is_cluster_size_valid(uint32_t cluster_size)
 174{
 175    if (cluster_size < QED_MIN_CLUSTER_SIZE ||
 176        cluster_size > QED_MAX_CLUSTER_SIZE) {
 177        return false;
 178    }
 179    if (cluster_size & (cluster_size - 1)) {
 180        return false; /* not power of 2 */
 181    }
 182    return true;
 183}
 184
 185static bool qed_is_table_size_valid(uint32_t table_size)
 186{
 187    if (table_size < QED_MIN_TABLE_SIZE ||
 188        table_size > QED_MAX_TABLE_SIZE) {
 189        return false;
 190    }
 191    if (table_size & (table_size - 1)) {
 192        return false; /* not power of 2 */
 193    }
 194    return true;
 195}
 196
 197static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
 198                                    uint32_t table_size)
 199{
 200    if (image_size % BDRV_SECTOR_SIZE != 0) {
 201        return false; /* not multiple of sector size */
 202    }
 203    if (image_size > qed_max_image_size(cluster_size, table_size)) {
 204        return false; /* image is too large */
 205    }
 206    return true;
 207}
 208
 209/**
 210 * Read a string of known length from the image file
 211 *
 212 * @file:       Image file
 213 * @offset:     File offset to start of string, in bytes
 214 * @n:          String length in bytes
 215 * @buf:        Destination buffer
 216 * @buflen:     Destination buffer length in bytes
 217 * @ret:        0 on success, -errno on failure
 218 *
 219 * The string is NUL-terminated.
 220 */
 221static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n,
 222                           char *buf, size_t buflen)
 223{
 224    int ret;
 225    if (n >= buflen) {
 226        return -EINVAL;
 227    }
 228    ret = bdrv_pread(file, offset, buf, n);
 229    if (ret < 0) {
 230        return ret;
 231    }
 232    buf[n] = '\0';
 233    return 0;
 234}
 235
 236/**
 237 * Allocate new clusters
 238 *
 239 * @s:          QED state
 240 * @n:          Number of contiguous clusters to allocate
 241 * @ret:        Offset of first allocated cluster
 242 *
 243 * This function only produces the offset where the new clusters should be
 244 * written.  It updates BDRVQEDState but does not make any changes to the image
 245 * file.
 246 */
 247static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n)
 248{
 249    uint64_t offset = s->file_size;
 250    s->file_size += n * s->header.cluster_size;
 251    return offset;
 252}
 253
 254QEDTable *qed_alloc_table(BDRVQEDState *s)
 255{
 256    /* Honor O_DIRECT memory alignment requirements */
 257    return qemu_blockalign(s->bs,
 258                           s->header.cluster_size * s->header.table_size);
 259}
 260
 261/**
 262 * Allocate a new zeroed L2 table
 263 */
 264static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
 265{
 266    CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
 267
 268    l2_table->table = qed_alloc_table(s);
 269    l2_table->offset = qed_alloc_clusters(s, s->header.table_size);
 270
 271    memset(l2_table->table->offsets, 0,
 272           s->header.cluster_size * s->header.table_size);
 273    return l2_table;
 274}
 275
 276static void qed_aio_next_io(void *opaque, int ret);
 277
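     /**
      * Prevent further allocating write requests from starting
      *
      * Queued requests are resumed by qed_unplug_allocating_write_reqs().  This
      * is used by the need-check timer while the QED_F_NEED_CHECK flag is
      * flushed out and cleared.
      */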
 278static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
 279{
 280    assert(!s->allocating_write_reqs_plugged);
 281
 282    s->allocating_write_reqs_plugged = true;
 283}
 284
 285static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
 286{
 287    QEDAIOCB *acb;
 288
 289    assert(s->allocating_write_reqs_plugged);
 290
 291    s->allocating_write_reqs_plugged = false;
 292
 293    acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
 294    if (acb) {
 295        qed_aio_next_io(acb, 0);
 296    }
 297}
 298
 299static void qed_finish_clear_need_check(void *opaque, int ret)
 300{
 301    /* Do nothing */
 302}
 303
 304static void qed_flush_after_clear_need_check(void *opaque, int ret)
 305{
 306    BDRVQEDState *s = opaque;
 307
 308    bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s);
 309
 310    /* No need to wait until flush completes */
 311    qed_unplug_allocating_write_reqs(s);
 312}
 313
 314static void qed_clear_need_check(void *opaque, int ret)
 315{
 316    BDRVQEDState *s = opaque;
 317
 318    if (ret) {
 319        qed_unplug_allocating_write_reqs(s);
 320        return;
 321    }
 322
 323    s->header.features &= ~QED_F_NEED_CHECK;
 324    qed_write_header(s, qed_flush_after_clear_need_check, s);
 325}
 326
 327static void qed_need_check_timer_cb(void *opaque)
 328{
 329    BDRVQEDState *s = opaque;
 330
 331    /* The timer should only fire when allocating writes have drained */
 332    assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs));
 333
 334    trace_qed_need_check_timer_cb(s);
 335
 336    qed_plug_allocating_write_reqs(s);
 337
 338    /* Ensure writes are on disk before clearing flag */
 339    bdrv_aio_flush(s->bs, qed_clear_need_check, s);
 340}
 341
 342static void qed_start_need_check_timer(BDRVQEDState *s)
 343{
 344    trace_qed_start_need_check_timer(s);
 345
 346    /* Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while suspended for
 347     * migration.
 348     */
 349    timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 350                   NANOSECONDS_PER_SECOND * QED_NEED_CHECK_TIMEOUT);
 351}
 352
 353/* It's okay to call this multiple times or when no timer is started */
 354static void qed_cancel_need_check_timer(BDRVQEDState *s)
 355{
 356    trace_qed_cancel_need_check_timer(s);
 357    timer_del(s->need_check_timer);
 358}
 359
 360static void bdrv_qed_detach_aio_context(BlockDriverState *bs)
 361{
 362    BDRVQEDState *s = bs->opaque;
 363
 364    qed_cancel_need_check_timer(s);
 365    timer_free(s->need_check_timer);
 366}
 367
 368static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
 369                                        AioContext *new_context)
 370{
 371    BDRVQEDState *s = bs->opaque;
 372
 373    s->need_check_timer = aio_timer_new(new_context,
 374                                        QEMU_CLOCK_VIRTUAL, SCALE_NS,
 375                                        qed_need_check_timer_cb, s);
 376    if (s->header.features & QED_F_NEED_CHECK) {
 377        qed_start_need_check_timer(s);
 378    }
 379}
 380
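     /**
      * Open a QED image
      *
      * Validates the header fields, loads the L1 table, initializes the L2
      * table cache, and runs a consistency check if the image was not closed
      * cleanly and is writable.
      */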
 381static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
 382                         Error **errp)
 383{
 384    BDRVQEDState *s = bs->opaque;
 385    QEDHeader le_header;
 386    int64_t file_size;
 387    int ret;
 388
 389    s->bs = bs;
 390    QSIMPLEQ_INIT(&s->allocating_write_reqs);
 391
 392    ret = bdrv_pread(bs->file->bs, 0, &le_header, sizeof(le_header));
 393    if (ret < 0) {
 394        return ret;
 395    }
 396    qed_header_le_to_cpu(&le_header, &s->header);
 397
 398    if (s->header.magic != QED_MAGIC) {
 399        error_setg(errp, "Image not in QED format");
 400        return -EINVAL;
 401    }
 402    if (s->header.features & ~QED_FEATURE_MASK) {
 403        /* image uses unsupported feature bits */
 404        error_setg(errp, "Unsupported QED features: %" PRIx64,
 405                   s->header.features & ~QED_FEATURE_MASK);
 406        return -ENOTSUP;
 407    }
 408    if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
 409        return -EINVAL;
 410    }
 411
 412    /* Round down file size to the last cluster */
 413    file_size = bdrv_getlength(bs->file->bs);
 414    if (file_size < 0) {
 415        return file_size;
 416    }
 417    s->file_size = qed_start_of_cluster(s, file_size);
 418
 419    if (!qed_is_table_size_valid(s->header.table_size)) {
 420        return -EINVAL;
 421    }
 422    if (!qed_is_image_size_valid(s->header.image_size,
 423                                 s->header.cluster_size,
 424                                 s->header.table_size)) {
 425        return -EINVAL;
 426    }
 427    if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
 428        return -EINVAL;
 429    }
 430
 431    s->table_nelems = (s->header.cluster_size * s->header.table_size) /
 432                      sizeof(uint64_t);
 433    s->l2_shift = ctz32(s->header.cluster_size);
 434    s->l2_mask = s->table_nelems - 1;
 435    s->l1_shift = s->l2_shift + ctz32(s->table_nelems);
 436
 437    /* Header size calculation must not overflow uint32_t */
 438    if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
 439        return -EINVAL;
 440    }
 441
 442    if ((s->header.features & QED_F_BACKING_FILE)) {
 443        if ((uint64_t)s->header.backing_filename_offset +
 444            s->header.backing_filename_size >
 445            s->header.cluster_size * s->header.header_size) {
 446            return -EINVAL;
 447        }
 448
 449        ret = qed_read_string(bs->file->bs, s->header.backing_filename_offset,
 450                              s->header.backing_filename_size, bs->backing_file,
 451                              sizeof(bs->backing_file));
 452        if (ret < 0) {
 453            return ret;
 454        }
 455
 456        if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
 457            pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
 458        }
 459    }
 460
 461    /* Reset unknown autoclear feature bits.  This is a backwards
 462     * compatibility mechanism that allows images to be opened by older
 463     * programs, which "knock out" unknown feature bits.  When an image is
 464     * opened by a newer program again it can detect that the autoclear
 465     * feature is no longer valid.
 466     */
 467    if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
 468        !bdrv_is_read_only(bs->file->bs) && !(flags & BDRV_O_INACTIVE)) {
 469        s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
 470
 471        ret = qed_write_header_sync(s);
 472        if (ret) {
 473            return ret;
 474        }
 475
 476        /* From here on only known autoclear feature bits are valid */
 477        bdrv_flush(bs->file->bs);
 478    }
 479
 480    s->l1_table = qed_alloc_table(s);
 481    qed_init_l2_cache(&s->l2_cache);
 482
 483    ret = qed_read_l1_table_sync(s);
 484    if (ret) {
 485        goto out;
 486    }
 487
 488    /* If image was not closed cleanly, check consistency */
 489    if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) {
 490        /* Read-only images cannot be fixed.  There is no risk of corruption
 491         * since write operations are not possible.  Therefore, allow
 492         * potentially inconsistent images to be opened read-only.  This can
 493         * aid data recovery from an otherwise inconsistent image.
 494         */
 495        if (!bdrv_is_read_only(bs->file->bs) &&
 496            !(flags & BDRV_O_INACTIVE)) {
 497            BdrvCheckResult result = {0};
 498
 499            ret = qed_check(s, &result, true);
 500            if (ret) {
 501                goto out;
 502            }
 503        }
 504    }
 505
 506    bdrv_qed_attach_aio_context(bs, bdrv_get_aio_context(bs));
 507
 508out:
 509    if (ret) {
 510        qed_free_l2_cache(&s->l2_cache);
 511        qemu_vfree(s->l1_table);
 512    }
 513    return ret;
 514}
 515
 516static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp)
 517{
 518    BDRVQEDState *s = bs->opaque;
 519
 520    bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS;
 521}
 522
  523/* We have nothing to do for QED reopen; the stub just returns
  524 * success */
 525static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
 526                                   BlockReopenQueue *queue, Error **errp)
 527{
 528    return 0;
 529}
 530
 531static void bdrv_qed_close(BlockDriverState *bs)
 532{
 533    BDRVQEDState *s = bs->opaque;
 534
 535    bdrv_qed_detach_aio_context(bs);
 536
 537    /* Ensure writes reach stable storage */
 538    bdrv_flush(bs->file->bs);
 539
 540    /* Clean shutdown, no check required on next open */
 541    if (s->header.features & QED_F_NEED_CHECK) {
 542        s->header.features &= ~QED_F_NEED_CHECK;
 543        qed_write_header_sync(s);
 544    }
 545
 546    qed_free_l2_cache(&s->l2_cache);
 547    qemu_vfree(s->l1_table);
 548}
 549
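     /**
      * Create a new QED image file
      *
      * Writes the header, the optional backing filename string, and a zeroed
      * L1 table.  The caller has already validated cluster_size, table_size,
      * and image_size.
      */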
 550static int qed_create(const char *filename, uint32_t cluster_size,
 551                      uint64_t image_size, uint32_t table_size,
 552                      const char *backing_file, const char *backing_fmt,
 553                      QemuOpts *opts, Error **errp)
 554{
 555    QEDHeader header = {
 556        .magic = QED_MAGIC,
 557        .cluster_size = cluster_size,
 558        .table_size = table_size,
 559        .header_size = 1,
 560        .features = 0,
 561        .compat_features = 0,
 562        .l1_table_offset = cluster_size,
 563        .image_size = image_size,
 564    };
 565    QEDHeader le_header;
 566    uint8_t *l1_table = NULL;
 567    size_t l1_size = header.cluster_size * header.table_size;
 568    Error *local_err = NULL;
 569    int ret = 0;
 570    BlockBackend *blk;
 571
 572    ret = bdrv_create_file(filename, opts, &local_err);
 573    if (ret < 0) {
 574        error_propagate(errp, local_err);
 575        return ret;
 576    }
 577
 578    blk = blk_new_open(filename, NULL, NULL,
 579                       BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
 580    if (blk == NULL) {
 581        error_propagate(errp, local_err);
 582        return -EIO;
 583    }
 584
 585    blk_set_allow_write_beyond_eof(blk, true);
 586
 587    /* File must start empty and grow, check truncate is supported */
 588    ret = blk_truncate(blk, 0);
 589    if (ret < 0) {
 590        goto out;
 591    }
 592
 593    if (backing_file) {
 594        header.features |= QED_F_BACKING_FILE;
 595        header.backing_filename_offset = sizeof(le_header);
 596        header.backing_filename_size = strlen(backing_file);
 597
 598        if (qed_fmt_is_raw(backing_fmt)) {
 599            header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
 600        }
 601    }
 602
 603    qed_header_cpu_to_le(&header, &le_header);
 604    ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header));
 605    if (ret < 0) {
 606        goto out;
 607    }
 608    ret = blk_pwrite(blk, sizeof(le_header), backing_file,
 609                     header.backing_filename_size);
 610    if (ret < 0) {
 611        goto out;
 612    }
 613
 614    l1_table = g_malloc0(l1_size);
 615    ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size);
 616    if (ret < 0) {
 617        goto out;
 618    }
 619
 620    ret = 0; /* success */
 621out:
 622    g_free(l1_table);
 623    blk_unref(blk);
 624    return ret;
 625}
 626
 627static int bdrv_qed_create(const char *filename, QemuOpts *opts, Error **errp)
 628{
 629    uint64_t image_size = 0;
 630    uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE;
 631    uint32_t table_size = QED_DEFAULT_TABLE_SIZE;
 632    char *backing_file = NULL;
 633    char *backing_fmt = NULL;
 634    int ret;
 635
 636    image_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
 637                          BDRV_SECTOR_SIZE);
 638    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
 639    backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT);
 640    cluster_size = qemu_opt_get_size_del(opts,
 641                                         BLOCK_OPT_CLUSTER_SIZE,
 642                                         QED_DEFAULT_CLUSTER_SIZE);
 643    table_size = qemu_opt_get_size_del(opts, BLOCK_OPT_TABLE_SIZE,
 644                                       QED_DEFAULT_TABLE_SIZE);
 645
 646    if (!qed_is_cluster_size_valid(cluster_size)) {
 647        error_setg(errp, "QED cluster size must be within range [%u, %u] "
 648                         "and power of 2",
 649                   QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
 650        ret = -EINVAL;
 651        goto finish;
 652    }
 653    if (!qed_is_table_size_valid(table_size)) {
 654        error_setg(errp, "QED table size must be within range [%u, %u] "
 655                         "and power of 2",
 656                   QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
 657        ret = -EINVAL;
 658        goto finish;
 659    }
 660    if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) {
 661        error_setg(errp, "QED image size must be a non-zero multiple of "
 662                         "cluster size and less than %" PRIu64 " bytes",
 663                   qed_max_image_size(cluster_size, table_size));
 664        ret = -EINVAL;
 665        goto finish;
 666    }
 667
 668    ret = qed_create(filename, cluster_size, image_size, table_size,
 669                     backing_file, backing_fmt, opts, errp);
 670
 671finish:
 672    g_free(backing_file);
 673    g_free(backing_fmt);
 674    return ret;
 675}
 676
 677typedef struct {
 678    BlockDriverState *bs;
 679    Coroutine *co;
 680    uint64_t pos;
 681    int64_t status;
 682    int *pnum;
 683    BlockDriverState **file;
 684} QEDIsAllocatedCB;
 685
 686static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len)
 687{
 688    QEDIsAllocatedCB *cb = opaque;
 689    BDRVQEDState *s = cb->bs->opaque;
 690    *cb->pnum = len / BDRV_SECTOR_SIZE;
 691    switch (ret) {
 692    case QED_CLUSTER_FOUND:
 693        offset |= qed_offset_into_cluster(s, cb->pos);
 694        cb->status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
 695        *cb->file = cb->bs->file->bs;
 696        break;
 697    case QED_CLUSTER_ZERO:
 698        cb->status = BDRV_BLOCK_ZERO;
 699        break;
 700    case QED_CLUSTER_L2:
 701    case QED_CLUSTER_L1:
 702        cb->status = 0;
 703        break;
 704    default:
 705        assert(ret < 0);
 706        cb->status = ret;
 707        break;
 708    }
 709
 710    if (cb->co) {
 711        qemu_coroutine_enter(cb->co, NULL);
 712    }
 713}
 714
 715static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs,
 716                                                 int64_t sector_num,
 717                                                 int nb_sectors, int *pnum,
 718                                                 BlockDriverState **file)
 719{
 720    BDRVQEDState *s = bs->opaque;
 721    size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE;
 722    QEDIsAllocatedCB cb = {
 723        .bs = bs,
 724        .pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE,
 725        .status = BDRV_BLOCK_OFFSET_MASK,
 726        .pnum = pnum,
 727        .file = file,
 728    };
 729    QEDRequest request = { .l2_table = NULL };
 730
 731    qed_find_cluster(s, &request, cb.pos, len, qed_is_allocated_cb, &cb);
 732
 733    /* Now sleep if the callback wasn't invoked immediately */
 734    while (cb.status == BDRV_BLOCK_OFFSET_MASK) {
 735        cb.co = qemu_coroutine_self();
 736        qemu_coroutine_yield();
 737    }
 738
 739    qed_unref_l2_cache_entry(request.l2_table);
 740
 741    return cb.status;
 742}
 743
 744static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
 745{
 746    return acb->common.bs->opaque;
 747}
 748
 749/**
 750 * Read from the backing file or zero-fill if no backing file
 751 *
 752 * @s:              QED state
 753 * @pos:            Byte position in device
 754 * @qiov:           Destination I/O vector
 755 * @backing_qiov:   Possibly shortened copy of qiov, to be allocated here
 756 * @cb:             Completion function
 757 * @opaque:         User data for completion function
 758 *
 759 * This function reads qiov->size bytes starting at pos from the backing file.
 760 * If there is no backing file then zeroes are read.
 761 */
 762static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
 763                                  QEMUIOVector *qiov,
 764                                  QEMUIOVector **backing_qiov,
 765                                  BlockCompletionFunc *cb, void *opaque)
 766{
 767    uint64_t backing_length = 0;
 768    size_t size;
 769
 770    /* If there is a backing file, get its length.  Treat the absence of a
 771     * backing file like a zero length backing file.
 772     */
 773    if (s->bs->backing) {
 774        int64_t l = bdrv_getlength(s->bs->backing->bs);
 775        if (l < 0) {
 776            cb(opaque, l);
 777            return;
 778        }
 779        backing_length = l;
 780    }
 781
 782    /* Zero all sectors if reading beyond the end of the backing file */
 783    if (pos >= backing_length ||
 784        pos + qiov->size > backing_length) {
 785        qemu_iovec_memset(qiov, 0, 0, qiov->size);
 786    }
 787
 788    /* Complete now if there are no backing file sectors to read */
 789    if (pos >= backing_length) {
 790        cb(opaque, 0);
 791        return;
 792    }
 793
 794    /* If the read straddles the end of the backing file, shorten it */
 795    size = MIN((uint64_t)backing_length - pos, qiov->size);
 796
 797    assert(*backing_qiov == NULL);
 798    *backing_qiov = g_new(QEMUIOVector, 1);
 799    qemu_iovec_init(*backing_qiov, qiov->niov);
 800    qemu_iovec_concat(*backing_qiov, qiov, 0, size);
 801
 802    BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
 803    bdrv_aio_readv(s->bs->backing->bs, pos / BDRV_SECTOR_SIZE,
 804                   *backing_qiov, size / BDRV_SECTOR_SIZE, cb, opaque);
 805}
 806
 807typedef struct {
 808    GenericCB gencb;
 809    BDRVQEDState *s;
 810    QEMUIOVector qiov;
 811    QEMUIOVector *backing_qiov;
 812    struct iovec iov;
 813    uint64_t offset;
 814} CopyFromBackingFileCB;
 815
 816static void qed_copy_from_backing_file_cb(void *opaque, int ret)
 817{
 818    CopyFromBackingFileCB *copy_cb = opaque;
 819    qemu_vfree(copy_cb->iov.iov_base);
 820    gencb_complete(&copy_cb->gencb, ret);
 821}
 822
 823static void qed_copy_from_backing_file_write(void *opaque, int ret)
 824{
 825    CopyFromBackingFileCB *copy_cb = opaque;
 826    BDRVQEDState *s = copy_cb->s;
 827
 828    if (copy_cb->backing_qiov) {
 829        qemu_iovec_destroy(copy_cb->backing_qiov);
 830        g_free(copy_cb->backing_qiov);
 831        copy_cb->backing_qiov = NULL;
 832    }
 833
 834    if (ret) {
 835        qed_copy_from_backing_file_cb(copy_cb, ret);
 836        return;
 837    }
 838
 839    BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
 840    bdrv_aio_writev(s->bs->file->bs, copy_cb->offset / BDRV_SECTOR_SIZE,
 841                    &copy_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE,
 842                    qed_copy_from_backing_file_cb, copy_cb);
 843}
 844
 845/**
 846 * Copy data from backing file into the image
 847 *
 848 * @s:          QED state
 849 * @pos:        Byte position in device
 850 * @len:        Number of bytes
 851 * @offset:     Byte offset in image file
 852 * @cb:         Completion function
 853 * @opaque:     User data for completion function
 854 */
 855static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
 856                                       uint64_t len, uint64_t offset,
 857                                       BlockCompletionFunc *cb,
 858                                       void *opaque)
 859{
 860    CopyFromBackingFileCB *copy_cb;
 861
 862    /* Skip copy entirely if there is no work to do */
 863    if (len == 0) {
 864        cb(opaque, 0);
 865        return;
 866    }
 867
 868    copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque);
 869    copy_cb->s = s;
 870    copy_cb->offset = offset;
 871    copy_cb->backing_qiov = NULL;
 872    copy_cb->iov.iov_base = qemu_blockalign(s->bs, len);
 873    copy_cb->iov.iov_len = len;
 874    qemu_iovec_init_external(&copy_cb->qiov, &copy_cb->iov, 1);
 875
 876    qed_read_backing_file(s, pos, &copy_cb->qiov, &copy_cb->backing_qiov,
 877                          qed_copy_from_backing_file_write, copy_cb);
 878}
 879
 880/**
 881 * Link one or more contiguous clusters into a table
 882 *
 883 * @s:              QED state
 884 * @table:          L2 table
 885 * @index:          First cluster index
 886 * @n:              Number of contiguous clusters
 887 * @cluster:        First cluster offset
 888 *
 889 * The cluster offset may be an allocated byte offset in the image file, the
 890 * zero cluster marker, or the unallocated cluster marker.
 891 */
 892static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
 893                                unsigned int n, uint64_t cluster)
 894{
 895    int i;
 896    for (i = index; i < index + n; i++) {
 897        table->offsets[i] = cluster;
 898        if (!qed_offset_is_unalloc_cluster(cluster) &&
 899            !qed_offset_is_zero_cluster(cluster)) {
 900            cluster += s->header.cluster_size;
 901        }
 902    }
 903}
 904
 905static void qed_aio_complete_bh(void *opaque)
 906{
 907    QEDAIOCB *acb = opaque;
 908    BlockCompletionFunc *cb = acb->common.cb;
 909    void *user_opaque = acb->common.opaque;
 910    int ret = acb->bh_ret;
 911
 912    qemu_bh_delete(acb->bh);
 913    qemu_aio_unref(acb);
 914
 915    /* Invoke callback */
 916    cb(user_opaque, ret);
 917}
 918
 919static void qed_aio_complete(QEDAIOCB *acb, int ret)
 920{
 921    BDRVQEDState *s = acb_to_s(acb);
 922
 923    trace_qed_aio_complete(s, acb, ret);
 924
 925    /* Free resources */
 926    qemu_iovec_destroy(&acb->cur_qiov);
 927    qed_unref_l2_cache_entry(acb->request.l2_table);
 928
 929    /* Free the buffer we may have allocated for zero writes */
 930    if (acb->flags & QED_AIOCB_ZERO) {
 931        qemu_vfree(acb->qiov->iov[0].iov_base);
 932        acb->qiov->iov[0].iov_base = NULL;
 933    }
 934
 935    /* Arrange for a bh to invoke the completion function */
 936    acb->bh_ret = ret;
 937    acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs),
 938                         qed_aio_complete_bh, acb);
 939    qemu_bh_schedule(acb->bh);
 940
 941    /* Start next allocating write request waiting behind this one.  Note that
 942     * requests enqueue themselves when they first hit an unallocated cluster
 943     * but they wait until the entire request is finished before waking up the
 944     * next request in the queue.  This ensures that we don't cycle through
 945     * requests multiple times but rather finish one at a time completely.
 946     */
 947    if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
 948        QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
 949        acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
 950        if (acb) {
 951            qed_aio_next_io(acb, 0);
 952        } else if (s->header.features & QED_F_NEED_CHECK) {
 953            qed_start_need_check_timer(s);
 954        }
 955    }
 956}
 957
 958/**
 959 * Commit the current L2 table to the cache
 960 */
 961static void qed_commit_l2_update(void *opaque, int ret)
 962{
 963    QEDAIOCB *acb = opaque;
 964    BDRVQEDState *s = acb_to_s(acb);
 965    CachedL2Table *l2_table = acb->request.l2_table;
 966    uint64_t l2_offset = l2_table->offset;
 967
 968    qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
 969
 970    /* This is guaranteed to succeed because we just committed the entry to the
 971     * cache.
 972     */
 973    acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
 974    assert(acb->request.l2_table != NULL);
 975
 976    qed_aio_next_io(opaque, ret);
 977}
 978
 979/**
 980 * Update L1 table with new L2 table offset and write it out
 981 */
 982static void qed_aio_write_l1_update(void *opaque, int ret)
 983{
 984    QEDAIOCB *acb = opaque;
 985    BDRVQEDState *s = acb_to_s(acb);
 986    int index;
 987
 988    if (ret) {
 989        qed_aio_complete(acb, ret);
 990        return;
 991    }
 992
 993    index = qed_l1_index(s, acb->cur_pos);
 994    s->l1_table->offsets[index] = acb->request.l2_table->offset;
 995
 996    qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb);
 997}
 998
 999/**
1000 * Update L2 table with new cluster offsets and write them out
1001 */
1002static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
1003{
1004    BDRVQEDState *s = acb_to_s(acb);
1005    bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
1006    int index;
1007
1008    if (ret) {
1009        goto err;
1010    }
1011
1012    if (need_alloc) {
1013        qed_unref_l2_cache_entry(acb->request.l2_table);
1014        acb->request.l2_table = qed_new_l2_table(s);
1015    }
1016
1017    index = qed_l2_index(s, acb->cur_pos);
1018    qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
1019                         offset);
1020
1021    if (need_alloc) {
1022        /* Write out the whole new L2 table */
1023        qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
1024                            qed_aio_write_l1_update, acb);
1025    } else {
1026        /* Write out only the updated part of the L2 table */
1027        qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
1028                            qed_aio_next_io, acb);
1029    }
1030    return;
1031
1032err:
1033    qed_aio_complete(acb, ret);
1034}
1035
1036static void qed_aio_write_l2_update_cb(void *opaque, int ret)
1037{
1038    QEDAIOCB *acb = opaque;
1039    qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
1040}
1041
1042/**
1043 * Flush new data clusters before updating the L2 table
1044 *
1045 * This flush is necessary when a backing file is in use.  A crash during an
1046 * allocating write could result in empty clusters in the image.  If the write
1047 * only touched a subregion of the cluster, then backing image sectors have
1048 * been lost in the untouched region.  The solution is to flush after writing a
1049 * new data cluster and before updating the L2 table.
1050 */
1051static void qed_aio_write_flush_before_l2_update(void *opaque, int ret)
1052{
1053    QEDAIOCB *acb = opaque;
1054    BDRVQEDState *s = acb_to_s(acb);
1055
1056    if (!bdrv_aio_flush(s->bs->file->bs, qed_aio_write_l2_update_cb, opaque)) {
1057        qed_aio_complete(acb, -EIO);
1058    }
1059}
1060
1061/**
1062 * Write data to the image file
1063 */
1064static void qed_aio_write_main(void *opaque, int ret)
1065{
1066    QEDAIOCB *acb = opaque;
1067    BDRVQEDState *s = acb_to_s(acb);
1068    uint64_t offset = acb->cur_cluster +
1069                      qed_offset_into_cluster(s, acb->cur_pos);
1070    BlockCompletionFunc *next_fn;
1071
1072    trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size);
1073
1074    if (ret) {
1075        qed_aio_complete(acb, ret);
1076        return;
1077    }
1078
1079    if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
1080        next_fn = qed_aio_next_io;
1081    } else {
1082        if (s->bs->backing) {
1083            next_fn = qed_aio_write_flush_before_l2_update;
1084        } else {
1085            next_fn = qed_aio_write_l2_update_cb;
1086        }
1087    }
1088
1089    BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
1090    bdrv_aio_writev(s->bs->file->bs, offset / BDRV_SECTOR_SIZE,
1091                    &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
1092                    next_fn, acb);
1093}
1094
1095/**
 1096 * Populate the untouched region at the back of a new data cluster
1097 */
1098static void qed_aio_write_postfill(void *opaque, int ret)
1099{
1100    QEDAIOCB *acb = opaque;
1101    BDRVQEDState *s = acb_to_s(acb);
1102    uint64_t start = acb->cur_pos + acb->cur_qiov.size;
1103    uint64_t len =
1104        qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
1105    uint64_t offset = acb->cur_cluster +
1106                      qed_offset_into_cluster(s, acb->cur_pos) +
1107                      acb->cur_qiov.size;
1108
1109    if (ret) {
1110        qed_aio_complete(acb, ret);
1111        return;
1112    }
1113
1114    trace_qed_aio_write_postfill(s, acb, start, len, offset);
1115    qed_copy_from_backing_file(s, start, len, offset,
1116                                qed_aio_write_main, acb);
1117}
1118
1119/**
 1120 * Populate the untouched region at the front of a new data cluster
1121 */
1122static void qed_aio_write_prefill(void *opaque, int ret)
1123{
1124    QEDAIOCB *acb = opaque;
1125    BDRVQEDState *s = acb_to_s(acb);
1126    uint64_t start = qed_start_of_cluster(s, acb->cur_pos);
1127    uint64_t len = qed_offset_into_cluster(s, acb->cur_pos);
1128
1129    trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
1130    qed_copy_from_backing_file(s, start, len, acb->cur_cluster,
1131                                qed_aio_write_postfill, acb);
1132}
1133
1134/**
1135 * Check if the QED_F_NEED_CHECK bit should be set during allocating write
1136 */
1137static bool qed_should_set_need_check(BDRVQEDState *s)
1138{
1139    /* The flush before L2 update path ensures consistency */
1140    if (s->bs->backing) {
1141        return false;
1142    }
1143
1144    return !(s->header.features & QED_F_NEED_CHECK);
1145}
1146
1147static void qed_aio_write_zero_cluster(void *opaque, int ret)
1148{
1149    QEDAIOCB *acb = opaque;
1150
1151    if (ret) {
1152        qed_aio_complete(acb, ret);
1153        return;
1154    }
1155
1156    qed_aio_write_l2_update(acb, 0, 1);
1157}
1158
1159/**
1160 * Write new data cluster
1161 *
1162 * @acb:        Write request
1163 * @len:        Length in bytes
1164 *
1165 * This path is taken when writing to previously unallocated clusters.
1166 */
1167static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
1168{
1169    BDRVQEDState *s = acb_to_s(acb);
1170    BlockCompletionFunc *cb;
1171
1172    /* Cancel timer when the first allocating request comes in */
1173    if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
1174        qed_cancel_need_check_timer(s);
1175    }
1176
1177    /* Freeze this request if another allocating write is in progress */
1178    if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
1179        QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next);
1180    }
1181    if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) ||
1182        s->allocating_write_reqs_plugged) {
1183        return; /* wait for existing request to finish */
1184    }
1185
1186    acb->cur_nclusters = qed_bytes_to_clusters(s,
1187            qed_offset_into_cluster(s, acb->cur_pos) + len);
1188    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1189
1190    if (acb->flags & QED_AIOCB_ZERO) {
1191        /* Skip ahead if the clusters are already zero */
1192        if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
1193            qed_aio_next_io(acb, 0);
1194            return;
1195        }
1196
1197        cb = qed_aio_write_zero_cluster;
1198    } else {
1199        cb = qed_aio_write_prefill;
1200        acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
1201    }
1202
1203    if (qed_should_set_need_check(s)) {
1204        s->header.features |= QED_F_NEED_CHECK;
1205        qed_write_header(s, cb, acb);
1206    } else {
1207        cb(acb, 0);
1208    }
1209}
1210
1211/**
1212 * Write data cluster in place
1213 *
1214 * @acb:        Write request
1215 * @offset:     Cluster offset in bytes
1216 * @len:        Length in bytes
1217 *
1218 * This path is taken when writing to already allocated clusters.
1219 */
1220static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
1221{
1222    /* Allocate buffer for zero writes */
1223    if (acb->flags & QED_AIOCB_ZERO) {
1224        struct iovec *iov = acb->qiov->iov;
1225
1226        if (!iov->iov_base) {
1227            iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len);
1228            if (iov->iov_base == NULL) {
1229                qed_aio_complete(acb, -ENOMEM);
1230                return;
1231            }
1232            memset(iov->iov_base, 0, iov->iov_len);
1233        }
1234    }
1235
1236    /* Calculate the I/O vector */
1237    acb->cur_cluster = offset;
1238    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1239
1240    /* Do the actual write */
1241    qed_aio_write_main(acb, 0);
1242}
1243
1244/**
1245 * Write data cluster
1246 *
1247 * @opaque:     Write request
1248 * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
1249 *              or -errno
1250 * @offset:     Cluster offset in bytes
1251 * @len:        Length in bytes
1252 *
1253 * Callback from qed_find_cluster().
1254 */
1255static void qed_aio_write_data(void *opaque, int ret,
1256                               uint64_t offset, size_t len)
1257{
1258    QEDAIOCB *acb = opaque;
1259
1260    trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);
1261
1262    acb->find_cluster_ret = ret;
1263
1264    switch (ret) {
1265    case QED_CLUSTER_FOUND:
1266        qed_aio_write_inplace(acb, offset, len);
1267        break;
1268
1269    case QED_CLUSTER_L2:
1270    case QED_CLUSTER_L1:
1271    case QED_CLUSTER_ZERO:
1272        qed_aio_write_alloc(acb, len);
1273        break;
1274
1275    default:
1276        qed_aio_complete(acb, ret);
1277        break;
1278    }
1279}
1280
1281/**
1282 * Read data cluster
1283 *
1284 * @opaque:     Read request
1285 * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
1286 *              or -errno
1287 * @offset:     Cluster offset in bytes
1288 * @len:        Length in bytes
1289 *
1290 * Callback from qed_find_cluster().
1291 */
1292static void qed_aio_read_data(void *opaque, int ret,
1293                              uint64_t offset, size_t len)
1294{
1295    QEDAIOCB *acb = opaque;
1296    BDRVQEDState *s = acb_to_s(acb);
1297    BlockDriverState *bs = acb->common.bs;
1298
1299    /* Adjust offset into cluster */
1300    offset += qed_offset_into_cluster(s, acb->cur_pos);
1301
1302    trace_qed_aio_read_data(s, acb, ret, offset, len);
1303
1304    if (ret < 0) {
1305        goto err;
1306    }
1307
1308    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1309
1310    /* Handle zero cluster and backing file reads */
1311    if (ret == QED_CLUSTER_ZERO) {
1312        qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
1313        qed_aio_next_io(acb, 0);
1314        return;
1315    } else if (ret != QED_CLUSTER_FOUND) {
1316        qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
1317                              &acb->backing_qiov, qed_aio_next_io, acb);
1318        return;
1319    }
1320
1321    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
1322    bdrv_aio_readv(bs->file->bs, offset / BDRV_SECTOR_SIZE,
1323                   &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
1324                   qed_aio_next_io, acb);
1325    return;
1326
1327err:
1328    qed_aio_complete(acb, ret);
1329}
1330
1331/**
1332 * Begin next I/O or complete the request
1333 */
1334static void qed_aio_next_io(void *opaque, int ret)
1335{
1336    QEDAIOCB *acb = opaque;
1337    BDRVQEDState *s = acb_to_s(acb);
1338    QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
1339                                qed_aio_write_data : qed_aio_read_data;
1340
1341    trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);
1342
1343    if (acb->backing_qiov) {
1344        qemu_iovec_destroy(acb->backing_qiov);
1345        g_free(acb->backing_qiov);
1346        acb->backing_qiov = NULL;
1347    }
1348
1349    /* Handle I/O error */
1350    if (ret) {
1351        qed_aio_complete(acb, ret);
1352        return;
1353    }
1354
1355    acb->qiov_offset += acb->cur_qiov.size;
1356    acb->cur_pos += acb->cur_qiov.size;
1357    qemu_iovec_reset(&acb->cur_qiov);
1358
1359    /* Complete request */
1360    if (acb->cur_pos >= acb->end_pos) {
1361        qed_aio_complete(acb, 0);
1362        return;
1363    }
1364
1365    /* Find next cluster and start I/O */
1366    qed_find_cluster(s, &acb->request,
1367                      acb->cur_pos, acb->end_pos - acb->cur_pos,
1368                      io_fn, acb);
1369}
1370
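     /**
      * Allocate a request and start the first I/O
      *
      * @flags:      0 for reads, QED_AIOCB_WRITE for writes, optionally
      *              combined with QED_AIOCB_ZERO for zero writes
      */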
1371static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
1372                                 int64_t sector_num,
1373                                 QEMUIOVector *qiov, int nb_sectors,
1374                                 BlockCompletionFunc *cb,
1375                                 void *opaque, int flags)
1376{
1377    QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque);
1378
1379    trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
1380                        opaque, flags);
1381
1382    acb->flags = flags;
1383    acb->qiov = qiov;
1384    acb->qiov_offset = 0;
1385    acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
1386    acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE;
1387    acb->backing_qiov = NULL;
1388    acb->request.l2_table = NULL;
1389    qemu_iovec_init(&acb->cur_qiov, qiov->niov);
1390
1391    /* Start request */
1392    qed_aio_next_io(acb, 0);
1393    return &acb->common;
1394}
1395
1396static BlockAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
1397                                      int64_t sector_num,
1398                                      QEMUIOVector *qiov, int nb_sectors,
1399                                      BlockCompletionFunc *cb,
1400                                      void *opaque)
1401{
1402    return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
1403}
1404
1405static BlockAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
1406                                       int64_t sector_num,
1407                                       QEMUIOVector *qiov, int nb_sectors,
1408                                       BlockCompletionFunc *cb,
1409                                       void *opaque)
1410{
1411    return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
1412                         opaque, QED_AIOCB_WRITE);
1413}
1414
1415typedef struct {
1416    Coroutine *co;
1417    int ret;
1418    bool done;
1419} QEDWriteZeroesCB;
1420
1421static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret)
1422{
1423    QEDWriteZeroesCB *cb = opaque;
1424
1425    cb->done = true;
1426    cb->ret = ret;
1427    if (cb->co) {
1428        qemu_coroutine_enter(cb->co, NULL);
1429    }
1430}
1431
1432static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
1433                                                 int64_t sector_num,
1434                                                 int nb_sectors,
1435                                                 BdrvRequestFlags flags)
1436{
1437    BlockAIOCB *blockacb;
1438    BDRVQEDState *s = bs->opaque;
1439    QEDWriteZeroesCB cb = { .done = false };
1440    QEMUIOVector qiov;
1441    struct iovec iov;
1442
1443    /* Refuse if there are untouched backing file sectors */
1444    if (bs->backing) {
1445        if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) {
1446            return -ENOTSUP;
1447        }
1448        if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) {
1449            return -ENOTSUP;
1450        }
1451    }
1452
1453    /* Zero writes start without an I/O buffer.  If a buffer becomes necessary
1454     * then it will be allocated during request processing.
1455     */
 1456    iov.iov_base = NULL;
 1457    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
1458
1459    qemu_iovec_init_external(&qiov, &iov, 1);
1460    blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors,
1461                             qed_co_write_zeroes_cb, &cb,
1462                             QED_AIOCB_WRITE | QED_AIOCB_ZERO);
1463    if (!blockacb) {
1464        return -EIO;
1465    }
1466    if (!cb.done) {
1467        cb.co = qemu_coroutine_self();
1468        qemu_coroutine_yield();
1469    }
1470    assert(cb.done);
1471    return cb.ret;
1472}
1473
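     /**
      * Grow the image by updating the image_size header field
      *
      * Shrinking is not supported.  No clusters are allocated here; QED
      * allocates clusters lazily when they are first written.
      */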
1474static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset)
1475{
1476    BDRVQEDState *s = bs->opaque;
1477    uint64_t old_image_size;
1478    int ret;
1479
1480    if (!qed_is_image_size_valid(offset, s->header.cluster_size,
1481                                 s->header.table_size)) {
1482        return -EINVAL;
1483    }
1484
1485    /* Shrinking is currently not supported */
1486    if ((uint64_t)offset < s->header.image_size) {
1487        return -ENOTSUP;
1488    }
1489
1490    old_image_size = s->header.image_size;
1491    s->header.image_size = offset;
1492    ret = qed_write_header_sync(s);
1493    if (ret < 0) {
1494        s->header.image_size = old_image_size;
1495    }
1496    return ret;
1497}
1498
1499static int64_t bdrv_qed_getlength(BlockDriverState *bs)
1500{
1501    BDRVQEDState *s = bs->opaque;
1502    return s->header.image_size;
1503}
1504
1505static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1506{
1507    BDRVQEDState *s = bs->opaque;
1508
1509    memset(bdi, 0, sizeof(*bdi));
1510    bdi->cluster_size = s->header.cluster_size;
1511    bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
1512    bdi->unallocated_blocks_are_zero = true;
1513    bdi->can_write_zeroes_with_unmap = true;
1514    return 0;
1515}
1516
1517static int bdrv_qed_change_backing_file(BlockDriverState *bs,
1518                                        const char *backing_file,
1519                                        const char *backing_fmt)
1520{
1521    BDRVQEDState *s = bs->opaque;
1522    QEDHeader new_header, le_header;
1523    void *buffer;
1524    size_t buffer_len, backing_file_len;
1525    int ret;
1526
1527    /* Refuse to set backing filename if unknown compat feature bits are
1528     * active.  If the image uses an unknown compat feature then we may not
1529     * know the layout of data following the header structure and cannot safely
1530     * add a new string.
1531     */
1532    if (backing_file && (s->header.compat_features &
1533                         ~QED_COMPAT_FEATURE_MASK)) {
1534        return -ENOTSUP;
1535    }
1536
1537    memcpy(&new_header, &s->header, sizeof(new_header));
1538
1539    new_header.features &= ~(QED_F_BACKING_FILE |
1540                             QED_F_BACKING_FORMAT_NO_PROBE);
1541
1542    /* Adjust feature flags */
1543    if (backing_file) {
1544        new_header.features |= QED_F_BACKING_FILE;
1545
1546        if (qed_fmt_is_raw(backing_fmt)) {
1547            new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
1548        }
1549    }
1550
1551    /* Calculate new header size */
1552    backing_file_len = 0;
1553
1554    if (backing_file) {
1555        backing_file_len = strlen(backing_file);
1556    }
1557
1558    buffer_len = sizeof(new_header);
1559    new_header.backing_filename_offset = buffer_len;
1560    new_header.backing_filename_size = backing_file_len;
1561    buffer_len += backing_file_len;
1562
1563    /* Make sure we can rewrite header without failing */
1564    if (buffer_len > new_header.header_size * new_header.cluster_size) {
1565        return -ENOSPC;
1566    }
1567
1568    /* Prepare new header */
1569    buffer = g_malloc(buffer_len);
1570
1571    qed_header_cpu_to_le(&new_header, &le_header);
1572    memcpy(buffer, &le_header, sizeof(le_header));
1573    buffer_len = sizeof(le_header);
1574
1575    if (backing_file) {
1576        memcpy(buffer + buffer_len, backing_file, backing_file_len);
1577        buffer_len += backing_file_len;
1578    }
1579
1580    /* Write new header */
1581    ret = bdrv_pwrite_sync(bs->file->bs, 0, buffer, buffer_len);
1582    g_free(buffer);
1583    if (ret == 0) {
1584        memcpy(&s->header, &new_header, sizeof(new_header));
1585    }
1586    return ret;
1587}
1588
1589static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp)
1590{
1591    BDRVQEDState *s = bs->opaque;
1592    Error *local_err = NULL;
1593    int ret;
1594
1595    bdrv_qed_close(bs);
1596
1597    bdrv_invalidate_cache(bs->file->bs, &local_err);
1598    if (local_err) {
1599        error_propagate(errp, local_err);
1600        return;
1601    }
1602
1603    memset(s, 0, sizeof(BDRVQEDState));
1604    ret = bdrv_qed_open(bs, NULL, bs->open_flags, &local_err);
1605    if (local_err) {
1606        error_propagate(errp, local_err);
1607        error_prepend(errp, "Could not reopen qed layer: ");
1608        return;
1609    } else if (ret < 0) {
1610        error_setg_errno(errp, -ret, "Could not reopen qed layer");
1611        return;
1612    }
1613}
1614
1615static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result,
1616                          BdrvCheckMode fix)
1617{
1618    BDRVQEDState *s = bs->opaque;
1619
1620    return qed_check(s, result, !!fix);
1621}
1622
1623static QemuOptsList qed_create_opts = {
1624    .name = "qed-create-opts",
1625    .head = QTAILQ_HEAD_INITIALIZER(qed_create_opts.head),
1626    .desc = {
1627        {
1628            .name = BLOCK_OPT_SIZE,
1629            .type = QEMU_OPT_SIZE,
1630            .help = "Virtual disk size"
1631        },
1632        {
1633            .name = BLOCK_OPT_BACKING_FILE,
1634            .type = QEMU_OPT_STRING,
1635            .help = "File name of a base image"
1636        },
1637        {
1638            .name = BLOCK_OPT_BACKING_FMT,
1639            .type = QEMU_OPT_STRING,
1640            .help = "Image format of the base image"
1641        },
1642        {
1643            .name = BLOCK_OPT_CLUSTER_SIZE,
1644            .type = QEMU_OPT_SIZE,
1645            .help = "Cluster size (in bytes)",
1646            .def_value_str = stringify(QED_DEFAULT_CLUSTER_SIZE)
1647        },
1648        {
1649            .name = BLOCK_OPT_TABLE_SIZE,
1650            .type = QEMU_OPT_SIZE,
1651            .help = "L1/L2 table size (in clusters)"
1652        },
1653        { /* end of list */ }
1654    }
1655};
1656
1657static BlockDriver bdrv_qed = {
1658    .format_name              = "qed",
1659    .instance_size            = sizeof(BDRVQEDState),
1660    .create_opts              = &qed_create_opts,
1661    .supports_backing         = true,
1662
1663    .bdrv_probe               = bdrv_qed_probe,
1664    .bdrv_open                = bdrv_qed_open,
1665    .bdrv_close               = bdrv_qed_close,
1666    .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare,
1667    .bdrv_create              = bdrv_qed_create,
1668    .bdrv_has_zero_init       = bdrv_has_zero_init_1,
1669    .bdrv_co_get_block_status = bdrv_qed_co_get_block_status,
1670    .bdrv_aio_readv           = bdrv_qed_aio_readv,
1671    .bdrv_aio_writev          = bdrv_qed_aio_writev,
1672    .bdrv_co_write_zeroes     = bdrv_qed_co_write_zeroes,
1673    .bdrv_truncate            = bdrv_qed_truncate,
1674    .bdrv_getlength           = bdrv_qed_getlength,
1675    .bdrv_get_info            = bdrv_qed_get_info,
1676    .bdrv_refresh_limits      = bdrv_qed_refresh_limits,
1677    .bdrv_change_backing_file = bdrv_qed_change_backing_file,
1678    .bdrv_invalidate_cache    = bdrv_qed_invalidate_cache,
1679    .bdrv_check               = bdrv_qed_check,
1680    .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
1681    .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
1682};
1683
1684static void bdrv_qed_init(void)
1685{
1686    bdrv_register(&bdrv_qed);
1687}
1688
1689block_init(bdrv_qed_init);
1690