qemu/block/qed.c
   1/*
   2 * QEMU Enhanced Disk Format
   3 *
   4 * Copyright IBM, Corp. 2010
   5 *
   6 * Authors:
   7 *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
   8 *  Anthony Liguori   <aliguori@us.ibm.com>
   9 *
  10 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
  11 * See the COPYING.LIB file in the top-level directory.
  12 *
  13 */
  14
  15#include "qemu/timer.h"
  16#include "trace.h"
  17#include "qed.h"
  18#include "qapi/qmp/qerror.h"
  19#include "migration/migration.h"
  20
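/*
 * On-disk layout (see docs/specs/qed_spec.txt): the image starts with the
 * QEDHeader in the first cluster; header_size gives the number of clusters
 * reserved for the header, including the optional backing filename string at
 * backing_filename_offset.  l1_table_offset points to the L1 table, which
 * spans table_size clusters.  L1 entries point to equally sized L2 tables and
 * L2 entries point to data clusters, with special values marking unallocated
 * and zero clusters.  All on-disk integers are little-endian.
 */
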
  21static const AIOCBInfo qed_aiocb_info = {
  22    .aiocb_size         = sizeof(QEDAIOCB),
  23};
  24
  25static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
  26                          const char *filename)
  27{
  28    const QEDHeader *header = (const QEDHeader *)buf;
  29
  30    if (buf_size < sizeof(*header)) {
  31        return 0;
  32    }
  33    if (le32_to_cpu(header->magic) != QED_MAGIC) {
  34        return 0;
  35    }
  36    return 100;
  37}
  38
  39/**
  40 * Check whether an image format is raw
  41 *
  42 * @fmt:    Backing file format, may be NULL
  43 */
  44static bool qed_fmt_is_raw(const char *fmt)
  45{
  46    return fmt && strcmp(fmt, "raw") == 0;
  47}
  48
  49static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
  50{
  51    cpu->magic = le32_to_cpu(le->magic);
  52    cpu->cluster_size = le32_to_cpu(le->cluster_size);
  53    cpu->table_size = le32_to_cpu(le->table_size);
  54    cpu->header_size = le32_to_cpu(le->header_size);
  55    cpu->features = le64_to_cpu(le->features);
  56    cpu->compat_features = le64_to_cpu(le->compat_features);
  57    cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
  58    cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
  59    cpu->image_size = le64_to_cpu(le->image_size);
  60    cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
  61    cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
  62}
  63
  64static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
  65{
  66    le->magic = cpu_to_le32(cpu->magic);
  67    le->cluster_size = cpu_to_le32(cpu->cluster_size);
  68    le->table_size = cpu_to_le32(cpu->table_size);
  69    le->header_size = cpu_to_le32(cpu->header_size);
  70    le->features = cpu_to_le64(cpu->features);
  71    le->compat_features = cpu_to_le64(cpu->compat_features);
  72    le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
  73    le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
  74    le->image_size = cpu_to_le64(cpu->image_size);
  75    le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
  76    le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
  77}
  78
  79int qed_write_header_sync(BDRVQEDState *s)
  80{
  81    QEDHeader le;
  82    int ret;
  83
  84    qed_header_cpu_to_le(&s->header, &le);
  85    ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le));
  86    if (ret != sizeof(le)) {
  87        return ret;
  88    }
  89    return 0;
  90}
  91
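/* GenericCB (see qed-gencb.c) lets a multi-step AIO operation carry the
 * caller's completion callback: gencb_alloc() embeds it at the start of an
 * operation-specific struct and gencb_complete() invokes it and frees the
 * allocation. */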
  92typedef struct {
  93    GenericCB gencb;
  94    BDRVQEDState *s;
  95    struct iovec iov;
  96    QEMUIOVector qiov;
  97    int nsectors;
  98    uint8_t *buf;
  99} QEDWriteHeaderCB;
 100
 101static void qed_write_header_cb(void *opaque, int ret)
 102{
 103    QEDWriteHeaderCB *write_header_cb = opaque;
 104
 105    qemu_vfree(write_header_cb->buf);
 106    gencb_complete(write_header_cb, ret);
 107}
 108
 109static void qed_write_header_read_cb(void *opaque, int ret)
 110{
 111    QEDWriteHeaderCB *write_header_cb = opaque;
 112    BDRVQEDState *s = write_header_cb->s;
 113
 114    if (ret) {
 115        qed_write_header_cb(write_header_cb, ret);
 116        return;
 117    }
 118
 119    /* Update header */
 120    qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf);
 121
 122    bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov,
 123                    write_header_cb->nsectors, qed_write_header_cb,
 124                    write_header_cb);
 125}
 126
 127/**
 128 * Update header in-place (does not rewrite backing filename or other strings)
 129 *
 130 * This function only updates known header fields in-place and does not affect
 131 * extra data after the QED header.
 132 */
 133static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb,
 134                             void *opaque)
 135{
 136    /* We must write full sectors for O_DIRECT but cannot necessarily generate
 137     * the data following the header if an unrecognized compat feature is
 138     * active.  Therefore, first read the sectors containing the header, update
 139     * them, and write back.
 140     */
 141
 142    int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) /
 143                   BDRV_SECTOR_SIZE;
 144    size_t len = nsectors * BDRV_SECTOR_SIZE;
 145    QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb),
 146                                                    cb, opaque);
 147
 148    write_header_cb->s = s;
 149    write_header_cb->nsectors = nsectors;
 150    write_header_cb->buf = qemu_blockalign(s->bs, len);
 151    write_header_cb->iov.iov_base = write_header_cb->buf;
 152    write_header_cb->iov.iov_len = len;
 153    qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1);
 154
 155    bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors,
 156                   qed_write_header_read_cb, write_header_cb);
 157}
 158
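/**
 * Calculate the maximum image size supported by the L1/L2 table geometry
 *
 * Each table holds (table_size * cluster_size / 8) 64-bit offsets, one L2
 * table maps table_entries * cluster_size bytes, and the L1 table can point
 * at table_entries L2 tables.  With the defaults from qed.h (64 KiB clusters,
 * table_size of 4) that is 32768 entries per table, 2 GiB per L2 table and a
 * 64 TiB maximum image size.
 */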
 159static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
 160{
 161    uint64_t table_entries;
 162    uint64_t l2_size;
 163
 164    table_entries = (table_size * cluster_size) / sizeof(uint64_t);
 165    l2_size = table_entries * cluster_size;
 166
 167    return l2_size * table_entries;
 168}
 169
 170static bool qed_is_cluster_size_valid(uint32_t cluster_size)
 171{
 172    if (cluster_size < QED_MIN_CLUSTER_SIZE ||
 173        cluster_size > QED_MAX_CLUSTER_SIZE) {
 174        return false;
 175    }
 176    if (cluster_size & (cluster_size - 1)) {
 177        return false; /* not power of 2 */
 178    }
 179    return true;
 180}
 181
 182static bool qed_is_table_size_valid(uint32_t table_size)
 183{
 184    if (table_size < QED_MIN_TABLE_SIZE ||
 185        table_size > QED_MAX_TABLE_SIZE) {
 186        return false;
 187    }
 188    if (table_size & (table_size - 1)) {
 189        return false; /* not power of 2 */
 190    }
 191    return true;
 192}
 193
 194static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
 195                                    uint32_t table_size)
 196{
 197    if (image_size % BDRV_SECTOR_SIZE != 0) {
 198        return false; /* not multiple of sector size */
 199    }
 200    if (image_size > qed_max_image_size(cluster_size, table_size)) {
 201        return false; /* image is too large */
 202    }
 203    return true;
 204}
 205
 206/**
 207 * Read a string of known length from the image file
 208 *
 209 * @file:       Image file
 210 * @offset:     File offset to start of string, in bytes
 211 * @n:          String length in bytes
 212 * @buf:        Destination buffer
 213 * @buflen:     Destination buffer length in bytes
 214 * @ret:        0 on success, -errno on failure
 215 *
 216 * The string is NUL-terminated.
 217 */
 218static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n,
 219                           char *buf, size_t buflen)
 220{
 221    int ret;
 222    if (n >= buflen) {
 223        return -EINVAL;
 224    }
 225    ret = bdrv_pread(file, offset, buf, n);
 226    if (ret < 0) {
 227        return ret;
 228    }
 229    buf[n] = '\0';
 230    return 0;
 231}
 232
 233/**
 234 * Allocate new clusters
 235 *
 236 * @s:          QED state
 237 * @n:          Number of contiguous clusters to allocate
 238 * @ret:        Offset of first allocated cluster
 239 *
 240 * This function only produces the offset where the new clusters should be
 241 * written.  It updates BDRVQEDState but does not make any changes to the image
 242 * file.
 243 */
 244static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n)
 245{
 246    uint64_t offset = s->file_size;
 247    s->file_size += n * s->header.cluster_size;
 248    return offset;
 249}
 250
 251QEDTable *qed_alloc_table(BDRVQEDState *s)
 252{
 253    /* Honor O_DIRECT memory alignment requirements */
 254    return qemu_blockalign(s->bs,
 255                           s->header.cluster_size * s->header.table_size);
 256}
 257
 258/**
 259 * Allocate a new zeroed L2 table
 260 */
 261static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
 262{
 263    CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
 264
 265    l2_table->table = qed_alloc_table(s);
 266    l2_table->offset = qed_alloc_clusters(s, s->header.table_size);
 267
 268    memset(l2_table->table->offsets, 0,
 269           s->header.cluster_size * s->header.table_size);
 270    return l2_table;
 271}
 272
 273static void qed_aio_next_io(void *opaque, int ret);
 274
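/* Allocating write requests are serialized through s->allocating_write_reqs:
 * only the request at the head of the queue makes progress and later requests
 * wait until it completes (see qed_aio_write_alloc() and qed_aio_complete()).
 * Plugging the queue holds back even the head request; it is used while
 * QED_F_NEED_CHECK is being cleared so that no new clusters are allocated
 * while the header is rewritten. */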
 275static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
 276{
 277    assert(!s->allocating_write_reqs_plugged);
 278
 279    s->allocating_write_reqs_plugged = true;
 280}
 281
 282static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
 283{
 284    QEDAIOCB *acb;
 285
 286    assert(s->allocating_write_reqs_plugged);
 287
 288    s->allocating_write_reqs_plugged = false;
 289
 290    acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
 291    if (acb) {
 292        qed_aio_next_io(acb, 0);
 293    }
 294}
 295
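/* Clearing QED_F_NEED_CHECK is a chain of callbacks that starts in
 * qed_need_check_timer_cb(): plug allocating writes, flush the image, clear
 * the flag and rewrite the header (qed_clear_need_check), then start a second
 * flush for the header update and unplug allocating writes without waiting
 * for that flush to complete (qed_flush_after_clear_need_check). */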
 296static void qed_finish_clear_need_check(void *opaque, int ret)
 297{
 298    /* Do nothing */
 299}
 300
 301static void qed_flush_after_clear_need_check(void *opaque, int ret)
 302{
 303    BDRVQEDState *s = opaque;
 304
 305    bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s);
 306
 307    /* No need to wait until flush completes */
 308    qed_unplug_allocating_write_reqs(s);
 309}
 310
 311static void qed_clear_need_check(void *opaque, int ret)
 312{
 313    BDRVQEDState *s = opaque;
 314
 315    if (ret) {
 316        qed_unplug_allocating_write_reqs(s);
 317        return;
 318    }
 319
 320    s->header.features &= ~QED_F_NEED_CHECK;
 321    qed_write_header(s, qed_flush_after_clear_need_check, s);
 322}
 323
 324static void qed_need_check_timer_cb(void *opaque)
 325{
 326    BDRVQEDState *s = opaque;
 327
 328    /* The timer should only fire when allocating writes have drained */
 329    assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs));
 330
 331    trace_qed_need_check_timer_cb(s);
 332
 333    qed_plug_allocating_write_reqs(s);
 334
 335    /* Ensure writes are on disk before clearing flag */
 336    bdrv_aio_flush(s->bs, qed_clear_need_check, s);
 337}
 338
 339static void qed_start_need_check_timer(BDRVQEDState *s)
 340{
 341    trace_qed_start_need_check_timer(s);
 342
 343    /* Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while suspended for
 344     * migration.
 345     */
 346    timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 347                   get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT);
 348}
 349
 350/* It's okay to call this multiple times or when no timer is started */
 351static void qed_cancel_need_check_timer(BDRVQEDState *s)
 352{
 353    trace_qed_cancel_need_check_timer(s);
 354    timer_del(s->need_check_timer);
 355}
 356
 357static void bdrv_qed_rebind(BlockDriverState *bs)
 358{
 359    BDRVQEDState *s = bs->opaque;
 360    s->bs = bs;
 361}
 362
 363static void bdrv_qed_detach_aio_context(BlockDriverState *bs)
 364{
 365    BDRVQEDState *s = bs->opaque;
 366
 367    qed_cancel_need_check_timer(s);
 368    timer_free(s->need_check_timer);
 369}
 370
 371static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
 372                                        AioContext *new_context)
 373{
 374    BDRVQEDState *s = bs->opaque;
 375
 376    s->need_check_timer = aio_timer_new(new_context,
 377                                        QEMU_CLOCK_VIRTUAL, SCALE_NS,
 378                                        qed_need_check_timer_cb, s);
 379    if (s->header.features & QED_F_NEED_CHECK) {
 380        qed_start_need_check_timer(s);
 381    }
 382}
 383
 384static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
 385                         Error **errp)
 386{
 387    BDRVQEDState *s = bs->opaque;
 388    QEDHeader le_header;
 389    int64_t file_size;
 390    int ret;
 391
 392    s->bs = bs;
 393    QSIMPLEQ_INIT(&s->allocating_write_reqs);
 394
 395    ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
 396    if (ret < 0) {
 397        return ret;
 398    }
 399    qed_header_le_to_cpu(&le_header, &s->header);
 400
 401    if (s->header.magic != QED_MAGIC) {
 402        error_setg(errp, "Image not in QED format");
 403        return -EINVAL;
 404    }
 405    if (s->header.features & ~QED_FEATURE_MASK) {
 406        /* image uses unsupported feature bits */
 407        char buf[64];
 408        snprintf(buf, sizeof(buf), "%" PRIx64,
 409            s->header.features & ~QED_FEATURE_MASK);
 410        error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
 411                   bdrv_get_device_or_node_name(bs), "QED", buf);
 412        return -ENOTSUP;
 413    }
 414    if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
 415        return -EINVAL;
 416    }
 417
 418    /* Round down file size to the last cluster */
 419    file_size = bdrv_getlength(bs->file);
 420    if (file_size < 0) {
 421        return file_size;
 422    }
 423    s->file_size = qed_start_of_cluster(s, file_size);
 424
 425    if (!qed_is_table_size_valid(s->header.table_size)) {
 426        return -EINVAL;
 427    }
 428    if (!qed_is_image_size_valid(s->header.image_size,
 429                                 s->header.cluster_size,
 430                                 s->header.table_size)) {
 431        return -EINVAL;
 432    }
 433    if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
 434        return -EINVAL;
 435    }
 436
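    /* Table geometry: each L1/L2 table holds table_nelems offsets.  A virtual
     * byte position decomposes into pos >> l1_shift for the L1 index,
     * (pos >> l2_shift) & l2_mask for the L2 index, and the low l2_shift bits
     * for the offset into the data cluster (see qed_l1_index() and
     * qed_l2_index() in qed.h).
     */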
 437    s->table_nelems = (s->header.cluster_size * s->header.table_size) /
 438                      sizeof(uint64_t);
 439    s->l2_shift = ctz32(s->header.cluster_size);
 440    s->l2_mask = s->table_nelems - 1;
 441    s->l1_shift = s->l2_shift + ctz32(s->table_nelems);
 442
 443    /* Header size calculation must not overflow uint32_t */
 444    if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
 445        return -EINVAL;
 446    }
 447
 448    if ((s->header.features & QED_F_BACKING_FILE)) {
 449        if ((uint64_t)s->header.backing_filename_offset +
 450            s->header.backing_filename_size >
 451            s->header.cluster_size * s->header.header_size) {
 452            return -EINVAL;
 453        }
 454
 455        ret = qed_read_string(bs->file, s->header.backing_filename_offset,
 456                              s->header.backing_filename_size, bs->backing_file,
 457                              sizeof(bs->backing_file));
 458        if (ret < 0) {
 459            return ret;
 460        }
 461
 462        if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
 463            pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
 464        }
 465    }
 466
 467    /* Reset unknown autoclear feature bits.  This is a backwards
 468     * compatibility mechanism that allows images to be opened by older
 469     * programs, which "knock out" unknown feature bits.  When an image is
 470     * opened by a newer program again it can detect that the autoclear
 471     * feature is no longer valid.
 472     */
 473    if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
 474        !bdrv_is_read_only(bs->file) && !(flags & BDRV_O_INCOMING)) {
 475        s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
 476
 477        ret = qed_write_header_sync(s);
 478        if (ret) {
 479            return ret;
 480        }
 481
 482        /* From here on only known autoclear feature bits are valid */
 483        bdrv_flush(bs->file);
 484    }
 485
 486    s->l1_table = qed_alloc_table(s);
 487    qed_init_l2_cache(&s->l2_cache);
 488
 489    ret = qed_read_l1_table_sync(s);
 490    if (ret) {
 491        goto out;
 492    }
 493
 494    /* If image was not closed cleanly, check consistency */
 495    if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) {
 496        /* Read-only images cannot be fixed.  There is no risk of corruption
 497         * since write operations are not possible.  Therefore, allow
 498         * potentially inconsistent images to be opened read-only.  This can
 499         * aid data recovery from an otherwise inconsistent image.
 500         */
 501        if (!bdrv_is_read_only(bs->file) &&
 502            !(flags & BDRV_O_INCOMING)) {
 503            BdrvCheckResult result = {0};
 504
 505            ret = qed_check(s, &result, true);
 506            if (ret) {
 507                goto out;
 508            }
 509        }
 510    }
 511
 512    bdrv_qed_attach_aio_context(bs, bdrv_get_aio_context(bs));
 513
 514out:
 515    if (ret) {
 516        qed_free_l2_cache(&s->l2_cache);
 517        qemu_vfree(s->l1_table);
 518    }
 519    return ret;
 520}
 521
 522static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp)
 523{
 524    BDRVQEDState *s = bs->opaque;
 525
 526    bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS;
 527}
 528
 529/* We have nothing to do for QED reopen, stubs just return
 530 * success */
 531static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
 532                                   BlockReopenQueue *queue, Error **errp)
 533{
 534    return 0;
 535}
 536
 537static void bdrv_qed_close(BlockDriverState *bs)
 538{
 539    BDRVQEDState *s = bs->opaque;
 540
 541    bdrv_qed_detach_aio_context(bs);
 542
 543    /* Ensure writes reach stable storage */
 544    bdrv_flush(bs->file);
 545
 546    /* Clean shutdown, no check required on next open */
 547    if (s->header.features & QED_F_NEED_CHECK) {
 548        s->header.features &= ~QED_F_NEED_CHECK;
 549        qed_write_header_sync(s);
 550    }
 551
 552    qed_free_l2_cache(&s->l2_cache);
 553    qemu_vfree(s->l1_table);
 554}
 555
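/**
 * Create a new QED image file
 *
 * Only the minimal layout is written here: the header struct at offset 0 with
 * the backing filename (if any) immediately following it, and a zeroed L1
 * table at l1_table_offset in the cluster after the header.  L2 tables and
 * data clusters are allocated on demand as the image is written to.
 */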
 556static int qed_create(const char *filename, uint32_t cluster_size,
 557                      uint64_t image_size, uint32_t table_size,
 558                      const char *backing_file, const char *backing_fmt,
 559                      QemuOpts *opts, Error **errp)
 560{
 561    QEDHeader header = {
 562        .magic = QED_MAGIC,
 563        .cluster_size = cluster_size,
 564        .table_size = table_size,
 565        .header_size = 1,
 566        .features = 0,
 567        .compat_features = 0,
 568        .l1_table_offset = cluster_size,
 569        .image_size = image_size,
 570    };
 571    QEDHeader le_header;
 572    uint8_t *l1_table = NULL;
 573    size_t l1_size = header.cluster_size * header.table_size;
 574    Error *local_err = NULL;
 575    int ret = 0;
 576    BlockDriverState *bs;
 577
 578    ret = bdrv_create_file(filename, opts, &local_err);
 579    if (ret < 0) {
 580        error_propagate(errp, local_err);
 581        return ret;
 582    }
 583
 584    bs = NULL;
 585    ret = bdrv_open(&bs, filename, NULL, NULL,
 586                    BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, NULL,
 587                    &local_err);
 588    if (ret < 0) {
 589        error_propagate(errp, local_err);
 590        return ret;
 591    }
 592
 593    /* File must start empty and grow, check truncate is supported */
 594    ret = bdrv_truncate(bs, 0);
 595    if (ret < 0) {
 596        goto out;
 597    }
 598
 599    if (backing_file) {
 600        header.features |= QED_F_BACKING_FILE;
 601        header.backing_filename_offset = sizeof(le_header);
 602        header.backing_filename_size = strlen(backing_file);
 603
 604        if (qed_fmt_is_raw(backing_fmt)) {
 605            header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
 606        }
 607    }
 608
 609    qed_header_cpu_to_le(&header, &le_header);
 610    ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header));
 611    if (ret < 0) {
 612        goto out;
 613    }
 614    ret = bdrv_pwrite(bs, sizeof(le_header), backing_file,
 615                      header.backing_filename_size);
 616    if (ret < 0) {
 617        goto out;
 618    }
 619
 620    l1_table = g_malloc0(l1_size);
 621    ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size);
 622    if (ret < 0) {
 623        goto out;
 624    }
 625
 626    ret = 0; /* success */
 627out:
 628    g_free(l1_table);
 629    bdrv_unref(bs);
 630    return ret;
 631}
 632
 633static int bdrv_qed_create(const char *filename, QemuOpts *opts, Error **errp)
 634{
 635    uint64_t image_size = 0;
 636    uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE;
 637    uint32_t table_size = QED_DEFAULT_TABLE_SIZE;
 638    char *backing_file = NULL;
 639    char *backing_fmt = NULL;
 640    int ret;
 641
 642    image_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
 643                          BDRV_SECTOR_SIZE);
 644    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
 645    backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT);
 646    cluster_size = qemu_opt_get_size_del(opts,
 647                                         BLOCK_OPT_CLUSTER_SIZE,
 648                                         QED_DEFAULT_CLUSTER_SIZE);
 649    table_size = qemu_opt_get_size_del(opts, BLOCK_OPT_TABLE_SIZE,
 650                                       QED_DEFAULT_TABLE_SIZE);
 651
 652    if (!qed_is_cluster_size_valid(cluster_size)) {
 653        error_setg(errp, "QED cluster size must be within range [%u, %u] "
 654                         "and power of 2",
 655                   QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
 656        ret = -EINVAL;
 657        goto finish;
 658    }
 659    if (!qed_is_table_size_valid(table_size)) {
 660        error_setg(errp, "QED table size must be within range [%u, %u] "
 661                         "and power of 2",
 662                   QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
 663        ret = -EINVAL;
 664        goto finish;
 665    }
 666    if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) {
 667        error_setg(errp, "QED image size must be a non-zero multiple of "
 668                         "cluster size and less than %" PRIu64 " bytes",
 669                   qed_max_image_size(cluster_size, table_size));
 670        ret = -EINVAL;
 671        goto finish;
 672    }
 673
 674    ret = qed_create(filename, cluster_size, image_size, table_size,
 675                     backing_file, backing_fmt, opts, errp);
 676
 677finish:
 678    g_free(backing_file);
 679    g_free(backing_fmt);
 680    return ret;
 681}
 682
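/* qed_find_cluster() may invoke its callback synchronously or only after an
 * L2 table read completes.  bdrv_qed_co_get_block_status() therefore seeds
 * cb.status with the sentinel BDRV_BLOCK_OFFSET_MASK and yields the coroutine
 * until qed_is_allocated_cb() has stored a real status. */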
 683typedef struct {
 684    BlockDriverState *bs;
 685    Coroutine *co;
 686    uint64_t pos;
 687    int64_t status;
 688    int *pnum;
 689} QEDIsAllocatedCB;
 690
 691static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len)
 692{
 693    QEDIsAllocatedCB *cb = opaque;
 694    BDRVQEDState *s = cb->bs->opaque;
 695    *cb->pnum = len / BDRV_SECTOR_SIZE;
 696    switch (ret) {
 697    case QED_CLUSTER_FOUND:
 698        offset |= qed_offset_into_cluster(s, cb->pos);
 699        cb->status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
 700        break;
 701    case QED_CLUSTER_ZERO:
 702        cb->status = BDRV_BLOCK_ZERO;
 703        break;
 704    case QED_CLUSTER_L2:
 705    case QED_CLUSTER_L1:
 706        cb->status = 0;
 707        break;
 708    default:
 709        assert(ret < 0);
 710        cb->status = ret;
 711        break;
 712    }
 713
 714    if (cb->co) {
 715        qemu_coroutine_enter(cb->co, NULL);
 716    }
 717}
 718
 719static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs,
 720                                                 int64_t sector_num,
 721                                                 int nb_sectors, int *pnum)
 722{
 723    BDRVQEDState *s = bs->opaque;
 724    size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE;
 725    QEDIsAllocatedCB cb = {
 726        .bs = bs,
 727        .pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE,
 728        .status = BDRV_BLOCK_OFFSET_MASK,
 729        .pnum = pnum,
 730    };
 731    QEDRequest request = { .l2_table = NULL };
 732
 733    qed_find_cluster(s, &request, cb.pos, len, qed_is_allocated_cb, &cb);
 734
 735    /* Now sleep if the callback wasn't invoked immediately */
 736    while (cb.status == BDRV_BLOCK_OFFSET_MASK) {
 737        cb.co = qemu_coroutine_self();
 738        qemu_coroutine_yield();
 739    }
 740
 741    qed_unref_l2_cache_entry(request.l2_table);
 742
 743    return cb.status;
 744}
 745
 746static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
 747{
 748    return acb->common.bs->opaque;
 749}
 750
 751/**
 752 * Read from the backing file or zero-fill if no backing file
 753 *
 754 * @s:              QED state
 755 * @pos:            Byte position in device
 756 * @qiov:           Destination I/O vector
 757 * @backing_qiov:   Possibly shortened copy of qiov, to be allocated here
 758 * @cb:             Completion function
 759 * @opaque:         User data for completion function
 760 *
 761 * This function reads qiov->size bytes starting at pos from the backing file.
 762 * If there is no backing file then zeroes are read.
 763 */
 764static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
 765                                  QEMUIOVector *qiov,
 766                                  QEMUIOVector **backing_qiov,
 767                                  BlockCompletionFunc *cb, void *opaque)
 768{
 769    uint64_t backing_length = 0;
 770    size_t size;
 771
 772    /* If there is a backing file, get its length.  Treat the absence of a
 773     * backing file like a zero length backing file.
 774     */
 775    if (s->bs->backing_hd) {
 776        int64_t l = bdrv_getlength(s->bs->backing_hd);
 777        if (l < 0) {
 778            cb(opaque, l);
 779            return;
 780        }
 781        backing_length = l;
 782    }
 783
 784    /* Zero all sectors if reading beyond the end of the backing file */
 785    if (pos >= backing_length ||
 786        pos + qiov->size > backing_length) {
 787        qemu_iovec_memset(qiov, 0, 0, qiov->size);
 788    }
 789
 790    /* Complete now if there are no backing file sectors to read */
 791    if (pos >= backing_length) {
 792        cb(opaque, 0);
 793        return;
 794    }
 795
 796    /* If the read straddles the end of the backing file, shorten it */
 797    size = MIN((uint64_t)backing_length - pos, qiov->size);
 798
 799    assert(*backing_qiov == NULL);
 800    *backing_qiov = g_new(QEMUIOVector, 1);
 801    qemu_iovec_init(*backing_qiov, qiov->niov);
 802    qemu_iovec_concat(*backing_qiov, qiov, 0, size);
 803
 804    BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
 805    bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE,
 806                   *backing_qiov, size / BDRV_SECTOR_SIZE, cb, opaque);
 807}
 808
 809typedef struct {
 810    GenericCB gencb;
 811    BDRVQEDState *s;
 812    QEMUIOVector qiov;
 813    QEMUIOVector *backing_qiov;
 814    struct iovec iov;
 815    uint64_t offset;
 816} CopyFromBackingFileCB;
 817
 818static void qed_copy_from_backing_file_cb(void *opaque, int ret)
 819{
 820    CopyFromBackingFileCB *copy_cb = opaque;
 821    qemu_vfree(copy_cb->iov.iov_base);
 822    gencb_complete(&copy_cb->gencb, ret);
 823}
 824
 825static void qed_copy_from_backing_file_write(void *opaque, int ret)
 826{
 827    CopyFromBackingFileCB *copy_cb = opaque;
 828    BDRVQEDState *s = copy_cb->s;
 829
 830    if (copy_cb->backing_qiov) {
 831        qemu_iovec_destroy(copy_cb->backing_qiov);
 832        g_free(copy_cb->backing_qiov);
 833        copy_cb->backing_qiov = NULL;
 834    }
 835
 836    if (ret) {
 837        qed_copy_from_backing_file_cb(copy_cb, ret);
 838        return;
 839    }
 840
 841    BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
 842    bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE,
 843                    &copy_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE,
 844                    qed_copy_from_backing_file_cb, copy_cb);
 845}
 846
 847/**
 848 * Copy data from backing file into the image
 849 *
 850 * @s:          QED state
 851 * @pos:        Byte position in device
 852 * @len:        Number of bytes
 853 * @offset:     Byte offset in image file
 854 * @cb:         Completion function
 855 * @opaque:     User data for completion function
 856 */
 857static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
 858                                       uint64_t len, uint64_t offset,
 859                                       BlockCompletionFunc *cb,
 860                                       void *opaque)
 861{
 862    CopyFromBackingFileCB *copy_cb;
 863
 864    /* Skip copy entirely if there is no work to do */
 865    if (len == 0) {
 866        cb(opaque, 0);
 867        return;
 868    }
 869
 870    copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque);
 871    copy_cb->s = s;
 872    copy_cb->offset = offset;
 873    copy_cb->backing_qiov = NULL;
 874    copy_cb->iov.iov_base = qemu_blockalign(s->bs, len);
 875    copy_cb->iov.iov_len = len;
 876    qemu_iovec_init_external(&copy_cb->qiov, &copy_cb->iov, 1);
 877
 878    qed_read_backing_file(s, pos, &copy_cb->qiov, &copy_cb->backing_qiov,
 879                          qed_copy_from_backing_file_write, copy_cb);
 880}
 881
 882/**
 883 * Link one or more contiguous clusters into a table
 884 *
 885 * @s:              QED state
 886 * @table:          L2 table
 887 * @index:          First cluster index
 888 * @n:              Number of contiguous clusters
 889 * @cluster:        First cluster offset
 890 *
 891 * The cluster offset may be an allocated byte offset in the image file, the
 892 * zero cluster marker, or the unallocated cluster marker.
 893 */
 894static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
 895                                unsigned int n, uint64_t cluster)
 896{
 897    int i;
 898    for (i = index; i < index + n; i++) {
 899        table->offsets[i] = cluster;
 900        if (!qed_offset_is_unalloc_cluster(cluster) &&
 901            !qed_offset_is_zero_cluster(cluster)) {
 902            cluster += s->header.cluster_size;
 903        }
 904    }
 905}
 906
 907static void qed_aio_complete_bh(void *opaque)
 908{
 909    QEDAIOCB *acb = opaque;
 910    BlockCompletionFunc *cb = acb->common.cb;
 911    void *user_opaque = acb->common.opaque;
 912    int ret = acb->bh_ret;
 913
 914    qemu_bh_delete(acb->bh);
 915    qemu_aio_unref(acb);
 916
 917    /* Invoke callback */
 918    cb(user_opaque, ret);
 919}
 920
 921static void qed_aio_complete(QEDAIOCB *acb, int ret)
 922{
 923    BDRVQEDState *s = acb_to_s(acb);
 924
 925    trace_qed_aio_complete(s, acb, ret);
 926
 927    /* Free resources */
 928    qemu_iovec_destroy(&acb->cur_qiov);
 929    qed_unref_l2_cache_entry(acb->request.l2_table);
 930
 931    /* Free the buffer we may have allocated for zero writes */
 932    if (acb->flags & QED_AIOCB_ZERO) {
 933        qemu_vfree(acb->qiov->iov[0].iov_base);
 934        acb->qiov->iov[0].iov_base = NULL;
 935    }
 936
 937    /* Arrange for a bh to invoke the completion function */
 938    acb->bh_ret = ret;
 939    acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs),
 940                         qed_aio_complete_bh, acb);
 941    qemu_bh_schedule(acb->bh);
 942
 943    /* Start next allocating write request waiting behind this one.  Note that
 944     * requests enqueue themselves when they first hit an unallocated cluster
 945     * but they wait until the entire request is finished before waking up the
 946     * next request in the queue.  This ensures that we don't cycle through
 947     * requests multiple times but rather finish one at a time completely.
 948     */
 949    if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
 950        QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
 951        acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
 952        if (acb) {
 953            qed_aio_next_io(acb, 0);
 954        } else if (s->header.features & QED_F_NEED_CHECK) {
 955            qed_start_need_check_timer(s);
 956        }
 957    }
 958}
 959
 960/**
 961 * Commit the current L2 table to the cache
 962 */
 963static void qed_commit_l2_update(void *opaque, int ret)
 964{
 965    QEDAIOCB *acb = opaque;
 966    BDRVQEDState *s = acb_to_s(acb);
 967    CachedL2Table *l2_table = acb->request.l2_table;
 968    uint64_t l2_offset = l2_table->offset;
 969
 970    qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
 971
 972    /* This is guaranteed to succeed because we just committed the entry to the
 973     * cache.
 974     */
 975    acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
 976    assert(acb->request.l2_table != NULL);
 977
 978    qed_aio_next_io(opaque, ret);
 979}
 980
 981/**
 982 * Update L1 table with new L2 table offset and write it out
 983 */
 984static void qed_aio_write_l1_update(void *opaque, int ret)
 985{
 986    QEDAIOCB *acb = opaque;
 987    BDRVQEDState *s = acb_to_s(acb);
 988    int index;
 989
 990    if (ret) {
 991        qed_aio_complete(acb, ret);
 992        return;
 993    }
 994
 995    index = qed_l1_index(s, acb->cur_pos);
 996    s->l1_table->offsets[index] = acb->request.l2_table->offset;
 997
 998    qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb);
 999}
1000
1001/**
1002 * Update L2 table with new cluster offsets and write them out
1003 */
1004static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
1005{
1006    BDRVQEDState *s = acb_to_s(acb);
1007    bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
1008    int index;
1009
1010    if (ret) {
1011        goto err;
1012    }
1013
1014    if (need_alloc) {
1015        qed_unref_l2_cache_entry(acb->request.l2_table);
1016        acb->request.l2_table = qed_new_l2_table(s);
1017    }
1018
1019    index = qed_l2_index(s, acb->cur_pos);
1020    qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
1021                         offset);
1022
1023    if (need_alloc) {
1024        /* Write out the whole new L2 table */
1025        qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
1026                            qed_aio_write_l1_update, acb);
1027    } else {
1028        /* Write out only the updated part of the L2 table */
1029        qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
1030                            qed_aio_next_io, acb);
1031    }
1032    return;
1033
1034err:
1035    qed_aio_complete(acb, ret);
1036}
1037
1038static void qed_aio_write_l2_update_cb(void *opaque, int ret)
1039{
1040    QEDAIOCB *acb = opaque;
1041    qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
1042}
1043
1044/**
1045 * Flush new data clusters before updating the L2 table
1046 *
1047 * This flush is necessary when a backing file is in use.  A crash during an
1048 * allocating write could result in empty clusters in the image.  If the write
1049 * only touched a subregion of the cluster, then backing image sectors have
1050 * been lost in the untouched region.  The solution is to flush after writing a
1051 * new data cluster and before updating the L2 table.
1052 */
1053static void qed_aio_write_flush_before_l2_update(void *opaque, int ret)
1054{
1055    QEDAIOCB *acb = opaque;
1056    BDRVQEDState *s = acb_to_s(acb);
1057
1058    if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) {
1059        qed_aio_complete(acb, -EIO);
1060    }
1061}
1062
1063/**
1064 * Write data to the image file
1065 */
1066static void qed_aio_write_main(void *opaque, int ret)
1067{
1068    QEDAIOCB *acb = opaque;
1069    BDRVQEDState *s = acb_to_s(acb);
1070    uint64_t offset = acb->cur_cluster +
1071                      qed_offset_into_cluster(s, acb->cur_pos);
1072    BlockCompletionFunc *next_fn;
1073
1074    trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size);
1075
1076    if (ret) {
1077        qed_aio_complete(acb, ret);
1078        return;
1079    }
1080
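    /* Decide what happens after the data write completes: in-place writes to
     * already allocated clusters need no metadata update, while allocating
     * writes must update the L2 table.  When a backing file is present the new
     * data is flushed first so the L2 update never points at unwritten
     * clusters (see qed_aio_write_flush_before_l2_update()).
     */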
1081    if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
1082        next_fn = qed_aio_next_io;
1083    } else {
1084        if (s->bs->backing_hd) {
1085            next_fn = qed_aio_write_flush_before_l2_update;
1086        } else {
1087            next_fn = qed_aio_write_l2_update_cb;
1088        }
1089    }
1090
1091    BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
1092    bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
1093                    &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
1094                    next_fn, acb);
1095}
1096
1097/**
1098 * Populate the untouched region at the back of a new data cluster
1099 */
1100static void qed_aio_write_postfill(void *opaque, int ret)
1101{
1102    QEDAIOCB *acb = opaque;
1103    BDRVQEDState *s = acb_to_s(acb);
1104    uint64_t start = acb->cur_pos + acb->cur_qiov.size;
1105    uint64_t len =
1106        qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
1107    uint64_t offset = acb->cur_cluster +
1108                      qed_offset_into_cluster(s, acb->cur_pos) +
1109                      acb->cur_qiov.size;
1110
1111    if (ret) {
1112        qed_aio_complete(acb, ret);
1113        return;
1114    }
1115
1116    trace_qed_aio_write_postfill(s, acb, start, len, offset);
1117    qed_copy_from_backing_file(s, start, len, offset,
1118                                qed_aio_write_main, acb);
1119}
1120
1121/**
1122 * Populate the untouched region at the front of a new data cluster
1123 */
1124static void qed_aio_write_prefill(void *opaque, int ret)
1125{
1126    QEDAIOCB *acb = opaque;
1127    BDRVQEDState *s = acb_to_s(acb);
1128    uint64_t start = qed_start_of_cluster(s, acb->cur_pos);
1129    uint64_t len = qed_offset_into_cluster(s, acb->cur_pos);
1130
1131    trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
1132    qed_copy_from_backing_file(s, start, len, acb->cur_cluster,
1133                                qed_aio_write_postfill, acb);
1134}
1135
1136/**
1137 * Check if the QED_F_NEED_CHECK bit should be set during allocating write
1138 */
1139static bool qed_should_set_need_check(BDRVQEDState *s)
1140{
1141    /* The flush before L2 update path ensures consistency */
1142    if (s->bs->backing_hd) {
1143        return false;
1144    }
1145
1146    return !(s->header.features & QED_F_NEED_CHECK);
1147}
1148
1149static void qed_aio_write_zero_cluster(void *opaque, int ret)
1150{
1151    QEDAIOCB *acb = opaque;
1152
1153    if (ret) {
1154        qed_aio_complete(acb, ret);
1155        return;
1156    }
1157
1158    qed_aio_write_l2_update(acb, 0, 1);
1159}
1160
1161/**
1162 * Write new data cluster
1163 *
1164 * @acb:        Write request
1165 * @len:        Length in bytes
1166 *
1167 * This path is taken when writing to previously unallocated clusters.
1168 */
1169static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
1170{
1171    BDRVQEDState *s = acb_to_s(acb);
1172    BlockCompletionFunc *cb;
1173
1174    /* Cancel timer when the first allocating request comes in */
1175    if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
1176        qed_cancel_need_check_timer(s);
1177    }
1178
1179    /* Freeze this request if another allocating write is in progress */
1180    if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
1181        QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next);
1182    }
1183    if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) ||
1184        s->allocating_write_reqs_plugged) {
1185        return; /* wait for existing request to finish */
1186    }
1187
1188    acb->cur_nclusters = qed_bytes_to_clusters(s,
1189            qed_offset_into_cluster(s, acb->cur_pos) + len);
1190    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1191
1192    if (acb->flags & QED_AIOCB_ZERO) {
1193        /* Skip ahead if the clusters are already zero */
1194        if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
1195            qed_aio_next_io(acb, 0);
1196            return;
1197        }
1198
1199        cb = qed_aio_write_zero_cluster;
1200    } else {
1201        cb = qed_aio_write_prefill;
1202        acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
1203    }
1204
1205    if (qed_should_set_need_check(s)) {
1206        s->header.features |= QED_F_NEED_CHECK;
1207        qed_write_header(s, cb, acb);
1208    } else {
1209        cb(acb, 0);
1210    }
1211}
1212
1213/**
1214 * Write data cluster in place
1215 *
1216 * @acb:        Write request
1217 * @offset:     Cluster offset in bytes
1218 * @len:        Length in bytes
1219 *
1220 * This path is taken when writing to already allocated clusters.
1221 */
1222static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
1223{
1224    /* Allocate buffer for zero writes */
1225    if (acb->flags & QED_AIOCB_ZERO) {
1226        struct iovec *iov = acb->qiov->iov;
1227
1228        if (!iov->iov_base) {
1229            iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len);
1230            if (iov->iov_base == NULL) {
1231                qed_aio_complete(acb, -ENOMEM);
1232                return;
1233            }
1234            memset(iov->iov_base, 0, iov->iov_len);
1235        }
1236    }
1237
1238    /* Calculate the I/O vector */
1239    acb->cur_cluster = offset;
1240    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1241
1242    /* Do the actual write */
1243    qed_aio_write_main(acb, 0);
1244}
1245
1246/**
1247 * Write data cluster
1248 *
1249 * @opaque:     Write request
1250 * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
1251 *              or -errno
1252 * @offset:     Cluster offset in bytes
1253 * @len:        Length in bytes
1254 *
1255 * Callback from qed_find_cluster().
1256 */
1257static void qed_aio_write_data(void *opaque, int ret,
1258                               uint64_t offset, size_t len)
1259{
1260    QEDAIOCB *acb = opaque;
1261
1262    trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);
1263
1264    acb->find_cluster_ret = ret;
1265
1266    switch (ret) {
1267    case QED_CLUSTER_FOUND:
1268        qed_aio_write_inplace(acb, offset, len);
1269        break;
1270
1271    case QED_CLUSTER_L2:
1272    case QED_CLUSTER_L1:
1273    case QED_CLUSTER_ZERO:
1274        qed_aio_write_alloc(acb, len);
1275        break;
1276
1277    default:
1278        qed_aio_complete(acb, ret);
1279        break;
1280    }
1281}
1282
1283/**
1284 * Read data cluster
1285 *
1286 * @opaque:     Read request
1287 * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
1288 *              or -errno
1289 * @offset:     Cluster offset in bytes
1290 * @len:        Length in bytes
1291 *
1292 * Callback from qed_find_cluster().
1293 */
1294static void qed_aio_read_data(void *opaque, int ret,
1295                              uint64_t offset, size_t len)
1296{
1297    QEDAIOCB *acb = opaque;
1298    BDRVQEDState *s = acb_to_s(acb);
1299    BlockDriverState *bs = acb->common.bs;
1300
1301    /* Adjust offset into cluster */
1302    offset += qed_offset_into_cluster(s, acb->cur_pos);
1303
1304    trace_qed_aio_read_data(s, acb, ret, offset, len);
1305
1306    if (ret < 0) {
1307        goto err;
1308    }
1309
1310    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1311
1312    /* Handle zero cluster and backing file reads */
1313    if (ret == QED_CLUSTER_ZERO) {
1314        qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
1315        qed_aio_next_io(acb, 0);
1316        return;
1317    } else if (ret != QED_CLUSTER_FOUND) {
1318        qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
1319                              &acb->backing_qiov, qed_aio_next_io, acb);
1320        return;
1321    }
1322
1323    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
1324    bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
1325                   &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
1326                   qed_aio_next_io, acb);
1327    return;
1328
1329err:
1330    qed_aio_complete(acb, ret);
1331}
1332
1333/**
1334 * Begin next I/O or complete the request
1335 */
1336static void qed_aio_next_io(void *opaque, int ret)
1337{
1338    QEDAIOCB *acb = opaque;
1339    BDRVQEDState *s = acb_to_s(acb);
1340    QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
1341                                qed_aio_write_data : qed_aio_read_data;
1342
1343    trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);
1344
1345    if (acb->backing_qiov) {
1346        qemu_iovec_destroy(acb->backing_qiov);
1347        g_free(acb->backing_qiov);
1348        acb->backing_qiov = NULL;
1349    }
1350
1351    /* Handle I/O error */
1352    if (ret) {
1353        qed_aio_complete(acb, ret);
1354        return;
1355    }
1356
1357    acb->qiov_offset += acb->cur_qiov.size;
1358    acb->cur_pos += acb->cur_qiov.size;
1359    qemu_iovec_reset(&acb->cur_qiov);
1360
1361    /* Complete request */
1362    if (acb->cur_pos >= acb->end_pos) {
1363        qed_aio_complete(acb, 0);
1364        return;
1365    }
1366
1367    /* Find next cluster and start I/O */
1368    qed_find_cluster(s, &acb->request,
1369                      acb->cur_pos, acb->end_pos - acb->cur_pos,
1370                      io_fn, acb);
1371}
1372
1373static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
1374                                 int64_t sector_num,
1375                                 QEMUIOVector *qiov, int nb_sectors,
1376                                 BlockCompletionFunc *cb,
1377                                 void *opaque, int flags)
1378{
1379    QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque);
1380
1381    trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
1382                        opaque, flags);
1383
1384    acb->flags = flags;
1385    acb->qiov = qiov;
1386    acb->qiov_offset = 0;
1387    acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
1388    acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE;
1389    acb->backing_qiov = NULL;
1390    acb->request.l2_table = NULL;
1391    qemu_iovec_init(&acb->cur_qiov, qiov->niov);
1392
1393    /* Start request */
1394    qed_aio_next_io(acb, 0);
1395    return &acb->common;
1396}
1397
1398static BlockAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
1399                                      int64_t sector_num,
1400                                      QEMUIOVector *qiov, int nb_sectors,
1401                                      BlockCompletionFunc *cb,
1402                                      void *opaque)
1403{
1404    return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
1405}
1406
1407static BlockAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
1408                                       int64_t sector_num,
1409                                       QEMUIOVector *qiov, int nb_sectors,
1410                                       BlockCompletionFunc *cb,
1411                                       void *opaque)
1412{
1413    return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
1414                         opaque, QED_AIOCB_WRITE);
1415}
1416
1417typedef struct {
1418    Coroutine *co;
1419    int ret;
1420    bool done;
1421} QEDWriteZeroesCB;
1422
1423static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret)
1424{
1425    QEDWriteZeroesCB *cb = opaque;
1426
1427    cb->done = true;
1428    cb->ret = ret;
1429    if (cb->co) {
1430        qemu_coroutine_enter(cb->co, NULL);
1431    }
1432}
1433
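/* Zero writes reuse the regular write path with QED_AIOCB_ZERO set: clusters
 * that are already zero are skipped, unallocated clusters get the zero cluster
 * marker in their L2 table entry (qed_aio_write_zero_cluster()), and clusters
 * that already contain data are overwritten with a zero-filled buffer
 * allocated in qed_aio_write_inplace(). */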
1434static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
1435                                                 int64_t sector_num,
1436                                                 int nb_sectors,
1437                                                 BdrvRequestFlags flags)
1438{
1439    BlockAIOCB *blockacb;
1440    BDRVQEDState *s = bs->opaque;
1441    QEDWriteZeroesCB cb = { .done = false };
1442    QEMUIOVector qiov;
1443    struct iovec iov;
1444
1445    /* Refuse if there are untouched backing file sectors */
1446    if (bs->backing_hd) {
1447        if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) {
1448            return -ENOTSUP;
1449        }
1450        if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) {
1451            return -ENOTSUP;
1452        }
1453    }
1454
1455    /* Zero writes start without an I/O buffer.  If a buffer becomes necessary
1456     * then it will be allocated during request processing.
1457     */
1458    iov.iov_base = NULL;
1459    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
1460
1461    qemu_iovec_init_external(&qiov, &iov, 1);
1462    blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors,
1463                             qed_co_write_zeroes_cb, &cb,
1464                             QED_AIOCB_WRITE | QED_AIOCB_ZERO);
1465    if (!blockacb) {
1466        return -EIO;
1467    }
1468    if (!cb.done) {
1469        cb.co = qemu_coroutine_self();
1470        qemu_coroutine_yield();
1471    }
1472    assert(cb.done);
1473    return cb.ret;
1474}
1475
1476static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset)
1477{
1478    BDRVQEDState *s = bs->opaque;
1479    uint64_t old_image_size;
1480    int ret;
1481
1482    if (!qed_is_image_size_valid(offset, s->header.cluster_size,
1483                                 s->header.table_size)) {
1484        return -EINVAL;
1485    }
1486
1487    /* Shrinking is currently not supported */
1488    if ((uint64_t)offset < s->header.image_size) {
1489        return -ENOTSUP;
1490    }
1491
1492    old_image_size = s->header.image_size;
1493    s->header.image_size = offset;
1494    ret = qed_write_header_sync(s);
1495    if (ret < 0) {
1496        s->header.image_size = old_image_size;
1497    }
1498    return ret;
1499}
1500
1501static int64_t bdrv_qed_getlength(BlockDriverState *bs)
1502{
1503    BDRVQEDState *s = bs->opaque;
1504    return s->header.image_size;
1505}
1506
1507static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1508{
1509    BDRVQEDState *s = bs->opaque;
1510
1511    memset(bdi, 0, sizeof(*bdi));
1512    bdi->cluster_size = s->header.cluster_size;
1513    bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
1514    bdi->unallocated_blocks_are_zero = true;
1515    bdi->can_write_zeroes_with_unmap = true;
1516    return 0;
1517}
1518
1519static int bdrv_qed_change_backing_file(BlockDriverState *bs,
1520                                        const char *backing_file,
1521                                        const char *backing_fmt)
1522{
1523    BDRVQEDState *s = bs->opaque;
1524    QEDHeader new_header, le_header;
1525    void *buffer;
1526    size_t buffer_len, backing_file_len;
1527    int ret;
1528
1529    /* Refuse to set backing filename if unknown compat feature bits are
1530     * active.  If the image uses an unknown compat feature then we may not
1531     * know the layout of data following the header structure and cannot safely
1532     * add a new string.
1533     */
1534    if (backing_file && (s->header.compat_features &
1535                         ~QED_COMPAT_FEATURE_MASK)) {
1536        return -ENOTSUP;
1537    }
1538
1539    memcpy(&new_header, &s->header, sizeof(new_header));
1540
1541    new_header.features &= ~(QED_F_BACKING_FILE |
1542                             QED_F_BACKING_FORMAT_NO_PROBE);
1543
1544    /* Adjust feature flags */
1545    if (backing_file) {
1546        new_header.features |= QED_F_BACKING_FILE;
1547
1548        if (qed_fmt_is_raw(backing_fmt)) {
1549            new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
1550        }
1551    }
1552
1553    /* Calculate new header size */
1554    backing_file_len = 0;
1555
1556    if (backing_file) {
1557        backing_file_len = strlen(backing_file);
1558    }
1559
1560    buffer_len = sizeof(new_header);
1561    new_header.backing_filename_offset = buffer_len;
1562    new_header.backing_filename_size = backing_file_len;
1563    buffer_len += backing_file_len;
1564
1565    /* Make sure we can rewrite header without failing */
1566    if (buffer_len > new_header.header_size * new_header.cluster_size) {
1567        return -ENOSPC;
1568    }
1569
1570    /* Prepare new header */
1571    buffer = g_malloc(buffer_len);
1572
1573    qed_header_cpu_to_le(&new_header, &le_header);
1574    memcpy(buffer, &le_header, sizeof(le_header));
1575    buffer_len = sizeof(le_header);
1576
1577    if (backing_file) {
1578        memcpy(buffer + buffer_len, backing_file, backing_file_len);
1579        buffer_len += backing_file_len;
1580    }
1581
1582    /* Write new header */
1583    ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len);
1584    g_free(buffer);
1585    if (ret == 0) {
1586        memcpy(&s->header, &new_header, sizeof(new_header));
1587    }
1588    return ret;
1589}
1590
1591static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp)
1592{
1593    BDRVQEDState *s = bs->opaque;
1594    Error *local_err = NULL;
1595    int ret;
1596
1597    bdrv_qed_close(bs);
1598
1599    bdrv_invalidate_cache(bs->file, &local_err);
1600    if (local_err) {
1601        error_propagate(errp, local_err);
1602        return;
1603    }
1604
1605    memset(s, 0, sizeof(BDRVQEDState));
1606    ret = bdrv_qed_open(bs, NULL, bs->open_flags, &local_err);
1607    if (local_err) {
1608        error_setg(errp, "Could not reopen qed layer: %s",
1609                   error_get_pretty(local_err));
1610        error_free(local_err);
1611        return;
1612    } else if (ret < 0) {
1613        error_setg_errno(errp, -ret, "Could not reopen qed layer");
1614        return;
1615    }
1616}
1617
1618static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result,
1619                          BdrvCheckMode fix)
1620{
1621    BDRVQEDState *s = bs->opaque;
1622
1623    return qed_check(s, result, !!fix);
1624}
1625
1626static QemuOptsList qed_create_opts = {
1627    .name = "qed-create-opts",
1628    .head = QTAILQ_HEAD_INITIALIZER(qed_create_opts.head),
1629    .desc = {
1630        {
1631            .name = BLOCK_OPT_SIZE,
1632            .type = QEMU_OPT_SIZE,
1633            .help = "Virtual disk size"
1634        },
1635        {
1636            .name = BLOCK_OPT_BACKING_FILE,
1637            .type = QEMU_OPT_STRING,
1638            .help = "File name of a base image"
1639        },
1640        {
1641            .name = BLOCK_OPT_BACKING_FMT,
1642            .type = QEMU_OPT_STRING,
1643            .help = "Image format of the base image"
1644        },
1645        {
1646            .name = BLOCK_OPT_CLUSTER_SIZE,
1647            .type = QEMU_OPT_SIZE,
1648            .help = "Cluster size (in bytes)",
1649            .def_value_str = stringify(QED_DEFAULT_CLUSTER_SIZE)
1650        },
1651        {
1652            .name = BLOCK_OPT_TABLE_SIZE,
1653            .type = QEMU_OPT_SIZE,
1654            .help = "L1/L2 table size (in clusters)"
1655        },
1656        { /* end of list */ }
1657    }
1658};
1659
1660static BlockDriver bdrv_qed = {
1661    .format_name              = "qed",
1662    .instance_size            = sizeof(BDRVQEDState),
1663    .create_opts              = &qed_create_opts,
1664    .supports_backing         = true,
1665
1666    .bdrv_probe               = bdrv_qed_probe,
1667    .bdrv_rebind              = bdrv_qed_rebind,
1668    .bdrv_open                = bdrv_qed_open,
1669    .bdrv_close               = bdrv_qed_close,
1670    .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare,
1671    .bdrv_create              = bdrv_qed_create,
1672    .bdrv_has_zero_init       = bdrv_has_zero_init_1,
1673    .bdrv_co_get_block_status = bdrv_qed_co_get_block_status,
1674    .bdrv_aio_readv           = bdrv_qed_aio_readv,
1675    .bdrv_aio_writev          = bdrv_qed_aio_writev,
1676    .bdrv_co_write_zeroes     = bdrv_qed_co_write_zeroes,
1677    .bdrv_truncate            = bdrv_qed_truncate,
1678    .bdrv_getlength           = bdrv_qed_getlength,
1679    .bdrv_get_info            = bdrv_qed_get_info,
1680    .bdrv_refresh_limits      = bdrv_qed_refresh_limits,
1681    .bdrv_change_backing_file = bdrv_qed_change_backing_file,
1682    .bdrv_invalidate_cache    = bdrv_qed_invalidate_cache,
1683    .bdrv_check               = bdrv_qed_check,
1684    .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
1685    .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
1686};
1687
1688static void bdrv_qed_init(void)
1689{
1690    bdrv_register(&bdrv_qed);
1691}
1692
1693block_init(bdrv_qed_init);
1694