qemu/block/qcow2.c
<<
>>
Prefs
   1/*
   2 * Block driver for the QCOW version 2 format
   3 *
   4 * Copyright (c) 2004-2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26
  27#define ZLIB_CONST
  28#include <zlib.h>
  29
  30#include "block/block_int.h"
  31#include "block/qdict.h"
  32#include "sysemu/block-backend.h"
  33#include "qemu/module.h"
  34#include "qcow2.h"
  35#include "qemu/error-report.h"
  36#include "qapi/error.h"
  37#include "qapi/qapi-events-block-core.h"
  38#include "qapi/qmp/qdict.h"
  39#include "qapi/qmp/qstring.h"
  40#include "trace.h"
  41#include "qemu/option_int.h"
  42#include "qemu/cutils.h"
  43#include "qemu/bswap.h"
  44#include "qapi/qobject-input-visitor.h"
  45#include "qapi/qapi-visit-block-core.h"
  46#include "crypto.h"
  47#include "block/thread-pool.h"
  48
/*
  Differences with QCOW:

  - Support for multiple incremental snapshots.
  - Memory management by reference counts.
  - Clusters which have a reference count of one have the bit
    QCOW_OFLAG_COPIED set to optimize write performance.
  - Size of compressed clusters is stored in sectors to reduce bit usage
    in the cluster offsets.
  - Support for storing additional data (such as the VM state) in the
    snapshots.
  - If a backing store is used, the cluster size is not constrained
    (could be backported to QCOW).
  - L2 tables always have a size of one cluster.
*/
  64
  65
  66typedef struct {
  67    uint32_t magic;
  68    uint32_t len;
  69} QEMU_PACKED QCowExtension;
  70
  71#define  QCOW2_EXT_MAGIC_END 0
  72#define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
  73#define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
  74#define  QCOW2_EXT_MAGIC_CRYPTO_HEADER 0x0537be77
  75#define  QCOW2_EXT_MAGIC_BITMAPS 0x23852875
  76
  77static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
  78{
  79    const QCowHeader *cow_header = (const void *)buf;
  80
  81    if (buf_size >= sizeof(QCowHeader) &&
  82        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
  83        be32_to_cpu(cow_header->version) >= 2)
  84        return 100;
  85    else
  86        return 0;
  87}
  88
  89
  90static ssize_t qcow2_crypto_hdr_read_func(QCryptoBlock *block, size_t offset,
  91                                          uint8_t *buf, size_t buflen,
  92                                          void *opaque, Error **errp)
  93{
  94    BlockDriverState *bs = opaque;
  95    BDRVQcow2State *s = bs->opaque;
  96    ssize_t ret;
  97
  98    if ((offset + buflen) > s->crypto_header.length) {
  99        error_setg(errp, "Request for data outside of extension header");
 100        return -1;
 101    }
 102
 103    ret = bdrv_pread(bs->file,
 104                     s->crypto_header.offset + offset, buf, buflen);
 105    if (ret < 0) {
 106        error_setg_errno(errp, -ret, "Could not read encryption header");
 107        return -1;
 108    }
 109    return ret;
 110}
 111
 112
 113static ssize_t qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen,
 114                                          void *opaque, Error **errp)
 115{
 116    BlockDriverState *bs = opaque;
 117    BDRVQcow2State *s = bs->opaque;
 118    int64_t ret;
 119    int64_t clusterlen;
 120
 121    ret = qcow2_alloc_clusters(bs, headerlen);
 122    if (ret < 0) {
 123        error_setg_errno(errp, -ret,
 124                         "Cannot allocate cluster for LUKS header size %zu",
 125                         headerlen);
 126        return -1;
 127    }
 128
 129    s->crypto_header.length = headerlen;
 130    s->crypto_header.offset = ret;
 131
 132    /* Zero fill remaining space in cluster so it has predictable
 133     * content in case of future spec changes */
 134    clusterlen = size_to_clusters(s, headerlen) * s->cluster_size;
 135    assert(qcow2_pre_write_overlap_check(bs, 0, ret, clusterlen) == 0);
 136    ret = bdrv_pwrite_zeroes(bs->file,
 137                             ret + headerlen,
 138                             clusterlen - headerlen, 0);
 139    if (ret < 0) {
 140        error_setg_errno(errp, -ret, "Could not zero fill encryption header");
 141        return -1;
 142    }
 143
 144    return ret;
 145}
 146
 147
 148static ssize_t qcow2_crypto_hdr_write_func(QCryptoBlock *block, size_t offset,
 149                                           const uint8_t *buf, size_t buflen,
 150                                           void *opaque, Error **errp)
 151{
 152    BlockDriverState *bs = opaque;
 153    BDRVQcow2State *s = bs->opaque;
 154    ssize_t ret;
 155
 156    if ((offset + buflen) > s->crypto_header.length) {
 157        error_setg(errp, "Request for data outside of extension header");
 158        return -1;
 159    }
 160
 161    ret = bdrv_pwrite(bs->file,
 162                      s->crypto_header.offset + offset, buf, buflen);
 163    if (ret < 0) {
 164        error_setg_errno(errp, -ret, "Could not read encryption header");
 165        return -1;
 166    }
 167    return ret;
 168}
 169
 170
 171/* 
 172 * read qcow2 extension and fill bs
 173 * start reading from start_offset
 174 * finish reading upon magic of value 0 or when end_offset reached
 175 * unknown magic is skipped (future extension this version knows nothing about)
 176 * return 0 upon success, non-0 otherwise
 177 */
 178static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
 179                                 uint64_t end_offset, void **p_feature_table,
 180                                 int flags, bool *need_update_header,
 181                                 Error **errp)
 182{
 183    BDRVQcow2State *s = bs->opaque;
 184    QCowExtension ext;
 185    uint64_t offset;
 186    int ret;
 187    Qcow2BitmapHeaderExt bitmaps_ext;
 188
 189    if (need_update_header != NULL) {
 190        *need_update_header = false;
 191    }
 192
 193#ifdef DEBUG_EXT
 194    printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
 195#endif
 196    offset = start_offset;
 197    while (offset < end_offset) {
 198
 199#ifdef DEBUG_EXT
 200        /* Sanity check */
 201        if (offset > s->cluster_size)
 202            printf("qcow2_read_extension: suspicious offset %lu\n", offset);
 203
 204        printf("attempting to read extended header in offset %lu\n", offset);
 205#endif
 206
 207        ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext));
 208        if (ret < 0) {
 209            error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: "
 210                             "pread fail from offset %" PRIu64, offset);
 211            return 1;
 212        }
 213        ext.magic = be32_to_cpu(ext.magic);
 214        ext.len = be32_to_cpu(ext.len);
 215        offset += sizeof(ext);
 216#ifdef DEBUG_EXT
 217        printf("ext.magic = 0x%x\n", ext.magic);
 218#endif
 219        if (offset > end_offset || ext.len > end_offset - offset) {
 220            error_setg(errp, "Header extension too large");
 221            return -EINVAL;
 222        }
 223
 224        switch (ext.magic) {
 225        case QCOW2_EXT_MAGIC_END:
 226            return 0;
 227
 228        case QCOW2_EXT_MAGIC_BACKING_FORMAT:
 229            if (ext.len >= sizeof(bs->backing_format)) {
 230                error_setg(errp, "ERROR: ext_backing_format: len=%" PRIu32
 231                           " too large (>=%zu)", ext.len,
 232                           sizeof(bs->backing_format));
 233                return 2;
 234            }
 235            ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len);
 236            if (ret < 0) {
 237                error_setg_errno(errp, -ret, "ERROR: ext_backing_format: "
 238                                 "Could not read format name");
 239                return 3;
 240            }
 241            bs->backing_format[ext.len] = '\0';
 242            s->image_backing_format = g_strdup(bs->backing_format);
 243#ifdef DEBUG_EXT
 244            printf("Qcow2: Got format extension %s\n", bs->backing_format);
 245#endif
 246            break;
 247
 248        case QCOW2_EXT_MAGIC_FEATURE_TABLE:
 249            if (p_feature_table != NULL) {
 250                void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
 251                ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
 252                if (ret < 0) {
 253                    error_setg_errno(errp, -ret, "ERROR: ext_feature_table: "
 254                                     "Could not read table");
 255                    return ret;
 256                }
 257
 258                *p_feature_table = feature_table;
 259            }
 260            break;
 261
 262        case QCOW2_EXT_MAGIC_CRYPTO_HEADER: {
 263            unsigned int cflags = 0;
 264            if (s->crypt_method_header != QCOW_CRYPT_LUKS) {
 265                error_setg(errp, "CRYPTO header extension only "
 266                           "expected with LUKS encryption method");
 267                return -EINVAL;
 268            }
 269            if (ext.len != sizeof(Qcow2CryptoHeaderExtension)) {
 270                error_setg(errp, "CRYPTO header extension size %u, "
 271                           "but expected size %zu", ext.len,
 272                           sizeof(Qcow2CryptoHeaderExtension));
 273                return -EINVAL;
 274            }
 275
 276            ret = bdrv_pread(bs->file, offset, &s->crypto_header, ext.len);
 277            if (ret < 0) {
 278                error_setg_errno(errp, -ret,
 279                                 "Unable to read CRYPTO header extension");
 280                return ret;
 281            }
 282            s->crypto_header.offset = be64_to_cpu(s->crypto_header.offset);
 283            s->crypto_header.length = be64_to_cpu(s->crypto_header.length);
 284
 285            if ((s->crypto_header.offset % s->cluster_size) != 0) {
 286                error_setg(errp, "Encryption header offset '%" PRIu64 "' is "
 287                           "not a multiple of cluster size '%u'",
 288                           s->crypto_header.offset, s->cluster_size);
 289                return -EINVAL;
 290            }
 291
 292            if (flags & BDRV_O_NO_IO) {
 293                cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
 294            }
 295            s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
 296                                           qcow2_crypto_hdr_read_func,
 297                                           bs, cflags, errp);
 298            if (!s->crypto) {
 299                return -EINVAL;
 300            }
 301        }   break;
 302
 303        case QCOW2_EXT_MAGIC_BITMAPS:
 304            if (ext.len != sizeof(bitmaps_ext)) {
 305                error_setg_errno(errp, -ret, "bitmaps_ext: "
 306                                 "Invalid extension length");
 307                return -EINVAL;
 308            }
 309
 310            if (!(s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS)) {
 311                if (s->qcow_version < 3) {
 312                    /* Let's be a bit more specific */
 313                    warn_report("This qcow2 v2 image contains bitmaps, but "
 314                                "they may have been modified by a program "
 315                                "without persistent bitmap support; so now "
 316                                "they must all be considered inconsistent");
 317                } else {
 318                    warn_report("a program lacking bitmap support "
 319                                "modified this file, so all bitmaps are now "
 320                                "considered inconsistent");
 321                }
 322                error_printf("Some clusters may be leaked, "
 323                             "run 'qemu-img check -r' on the image "
 324                             "file to fix.");
 325                if (need_update_header != NULL) {
 326                    /* Updating is needed to drop invalid bitmap extension. */
 327                    *need_update_header = true;
 328                }
 329                break;
 330            }
 331
 332            ret = bdrv_pread(bs->file, offset, &bitmaps_ext, ext.len);
 333            if (ret < 0) {
 334                error_setg_errno(errp, -ret, "bitmaps_ext: "
 335                                 "Could not read ext header");
 336                return ret;
 337            }
 338
 339            if (bitmaps_ext.reserved32 != 0) {
 340                error_setg_errno(errp, -ret, "bitmaps_ext: "
 341                                 "Reserved field is not zero");
 342                return -EINVAL;
 343            }
 344
 345            bitmaps_ext.nb_bitmaps = be32_to_cpu(bitmaps_ext.nb_bitmaps);
 346            bitmaps_ext.bitmap_directory_size =
 347                be64_to_cpu(bitmaps_ext.bitmap_directory_size);
 348            bitmaps_ext.bitmap_directory_offset =
 349                be64_to_cpu(bitmaps_ext.bitmap_directory_offset);
 350
 351            if (bitmaps_ext.nb_bitmaps > QCOW2_MAX_BITMAPS) {
 352                error_setg(errp,
 353                           "bitmaps_ext: Image has %" PRIu32 " bitmaps, "
 354                           "exceeding the QEMU supported maximum of %d",
 355                           bitmaps_ext.nb_bitmaps, QCOW2_MAX_BITMAPS);
 356                return -EINVAL;
 357            }
 358
 359            if (bitmaps_ext.nb_bitmaps == 0) {
 360                error_setg(errp, "found bitmaps extension with zero bitmaps");
 361                return -EINVAL;
 362            }
 363
 364            if (bitmaps_ext.bitmap_directory_offset & (s->cluster_size - 1)) {
 365                error_setg(errp, "bitmaps_ext: "
 366                                 "invalid bitmap directory offset");
 367                return -EINVAL;
 368            }
 369
 370            if (bitmaps_ext.bitmap_directory_size >
 371                QCOW2_MAX_BITMAP_DIRECTORY_SIZE) {
 372                error_setg(errp, "bitmaps_ext: "
 373                                 "bitmap directory size (%" PRIu64 ") exceeds "
 374                                 "the maximum supported size (%d)",
 375                                 bitmaps_ext.bitmap_directory_size,
 376                                 QCOW2_MAX_BITMAP_DIRECTORY_SIZE);
 377                return -EINVAL;
 378            }
 379
 380            s->nb_bitmaps = bitmaps_ext.nb_bitmaps;
 381            s->bitmap_directory_offset =
 382                    bitmaps_ext.bitmap_directory_offset;
 383            s->bitmap_directory_size =
 384                    bitmaps_ext.bitmap_directory_size;
 385
 386#ifdef DEBUG_EXT
 387            printf("Qcow2: Got bitmaps extension: "
 388                   "offset=%" PRIu64 " nb_bitmaps=%" PRIu32 "\n",
 389                   s->bitmap_directory_offset, s->nb_bitmaps);
 390#endif
 391            break;
 392
 393        default:
 394            /* unknown magic - save it in case we need to rewrite the header */
 395            /* If you add a new feature, make sure to also update the fast
 396             * path of qcow2_make_empty() to deal with it. */
 397            {
 398                Qcow2UnknownHeaderExtension *uext;
 399
 400                uext = g_malloc0(sizeof(*uext)  + ext.len);
 401                uext->magic = ext.magic;
 402                uext->len = ext.len;
 403                QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);
 404
 405                ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
 406                if (ret < 0) {
 407                    error_setg_errno(errp, -ret, "ERROR: unknown extension: "
 408                                     "Could not read data");
 409                    return ret;
 410                }
 411            }
 412            break;
 413        }
 414
 415        offset += ((ext.len + 7) & ~7);
 416    }
 417
 418    return 0;
 419}
 420
 421static void cleanup_unknown_header_ext(BlockDriverState *bs)
 422{
 423    BDRVQcow2State *s = bs->opaque;
 424    Qcow2UnknownHeaderExtension *uext, *next;
 425
 426    QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) {
 427        QLIST_REMOVE(uext, next);
 428        g_free(uext);
 429    }
 430}
 431
 432static void report_unsupported_feature(Error **errp, Qcow2Feature *table,
 433                                       uint64_t mask)
 434{
 435    char *features = g_strdup("");
 436    char *old;
 437
 438    while (table && table->name[0] != '\0') {
 439        if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) {
 440            if (mask & (1ULL << table->bit)) {
 441                old = features;
 442                features = g_strdup_printf("%s%s%.46s", old, *old ? ", " : "",
 443                                           table->name);
 444                g_free(old);
 445                mask &= ~(1ULL << table->bit);
 446            }
 447        }
 448        table++;
 449    }
 450
 451    if (mask) {
 452        old = features;
 453        features = g_strdup_printf("%s%sUnknown incompatible feature: %" PRIx64,
 454                                   old, *old ? ", " : "", mask);
 455        g_free(old);
 456    }
 457
 458    error_setg(errp, "Unsupported qcow2 feature(s): %s", features);
 459    g_free(features);
 460}
 461
 462/*
 463 * Sets the dirty bit and flushes afterwards if necessary.
 464 *
 465 * The incompatible_features bit is only set if the image file header was
 466 * updated successfully.  Therefore it is not required to check the return
 467 * value of this function.
 468 */
 469int qcow2_mark_dirty(BlockDriverState *bs)
 470{
 471    BDRVQcow2State *s = bs->opaque;
 472    uint64_t val;
 473    int ret;
 474
 475    assert(s->qcow_version >= 3);
 476
 477    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
 478        return 0; /* already dirty */
 479    }
 480
 481    val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
 482    ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
 483                      &val, sizeof(val));
 484    if (ret < 0) {
 485        return ret;
 486    }
 487    ret = bdrv_flush(bs->file->bs);
 488    if (ret < 0) {
 489        return ret;
 490    }
 491
 492    /* Only treat image as dirty if the header was updated successfully */
 493    s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
 494    return 0;
 495}
 496
 497/*
 498 * Clears the dirty bit and flushes before if necessary.  Only call this
 499 * function when there are no pending requests, it does not guard against
 500 * concurrent requests dirtying the image.
 501 */
 502static int qcow2_mark_clean(BlockDriverState *bs)
 503{
 504    BDRVQcow2State *s = bs->opaque;
 505
 506    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
 507        int ret;
 508
 509        s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
 510
 511        ret = qcow2_flush_caches(bs);
 512        if (ret < 0) {
 513            return ret;
 514        }
 515
 516        return qcow2_update_header(bs);
 517    }
 518    return 0;
 519}
 520
 521/*
 522 * Marks the image as corrupt.
 523 */
 524int qcow2_mark_corrupt(BlockDriverState *bs)
 525{
 526    BDRVQcow2State *s = bs->opaque;
 527
 528    s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT;
 529    return qcow2_update_header(bs);
 530}
 531
 532/*
 533 * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes
 534 * before if necessary.
 535 */
 536int qcow2_mark_consistent(BlockDriverState *bs)
 537{
 538    BDRVQcow2State *s = bs->opaque;
 539
 540    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
 541        int ret = qcow2_flush_caches(bs);
 542        if (ret < 0) {
 543            return ret;
 544        }
 545
 546        s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT;
 547        return qcow2_update_header(bs);
 548    }
 549    return 0;
 550}
 551
 552static int coroutine_fn qcow2_co_check_locked(BlockDriverState *bs,
 553                                              BdrvCheckResult *result,
 554                                              BdrvCheckMode fix)
 555{
 556    int ret = qcow2_check_refcounts(bs, result, fix);
 557    if (ret < 0) {
 558        return ret;
 559    }
 560
 561    if (fix && result->check_errors == 0 && result->corruptions == 0) {
 562        ret = qcow2_mark_clean(bs);
 563        if (ret < 0) {
 564            return ret;
 565        }
 566        return qcow2_mark_consistent(bs);
 567    }
 568    return ret;
 569}
 570
 571static int coroutine_fn qcow2_co_check(BlockDriverState *bs,
 572                                       BdrvCheckResult *result,
 573                                       BdrvCheckMode fix)
 574{
 575    BDRVQcow2State *s = bs->opaque;
 576    int ret;
 577
 578    qemu_co_mutex_lock(&s->lock);
 579    ret = qcow2_co_check_locked(bs, result, fix);
 580    qemu_co_mutex_unlock(&s->lock);
 581    return ret;
 582}
 583
 584int qcow2_validate_table(BlockDriverState *bs, uint64_t offset,
 585                         uint64_t entries, size_t entry_len,
 586                         int64_t max_size_bytes, const char *table_name,
 587                         Error **errp)
 588{
 589    BDRVQcow2State *s = bs->opaque;
 590
 591    if (entries > max_size_bytes / entry_len) {
 592        error_setg(errp, "%s too large", table_name);
 593        return -EFBIG;
 594    }
 595
 596    /* Use signed INT64_MAX as the maximum even for uint64_t header fields,
 597     * because values will be passed to qemu functions taking int64_t. */
 598    if ((INT64_MAX - entries * entry_len < offset) ||
 599        (offset_into_cluster(s, offset) != 0)) {
 600        error_setg(errp, "%s offset invalid", table_name);
 601        return -EINVAL;
 602    }
 603
 604    return 0;
 605}
 606
 607static QemuOptsList qcow2_runtime_opts = {
 608    .name = "qcow2",
 609    .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
 610    .desc = {
 611        {
 612            .name = QCOW2_OPT_LAZY_REFCOUNTS,
 613            .type = QEMU_OPT_BOOL,
 614            .help = "Postpone refcount updates",
 615        },
 616        {
 617            .name = QCOW2_OPT_DISCARD_REQUEST,
 618            .type = QEMU_OPT_BOOL,
 619            .help = "Pass guest discard requests to the layer below",
 620        },
 621        {
 622            .name = QCOW2_OPT_DISCARD_SNAPSHOT,
 623            .type = QEMU_OPT_BOOL,
 624            .help = "Generate discard requests when snapshot related space "
 625                    "is freed",
 626        },
 627        {
 628            .name = QCOW2_OPT_DISCARD_OTHER,
 629            .type = QEMU_OPT_BOOL,
 630            .help = "Generate discard requests when other clusters are freed",
 631        },
 632        {
 633            .name = QCOW2_OPT_OVERLAP,
 634            .type = QEMU_OPT_STRING,
 635            .help = "Selects which overlap checks to perform from a range of "
 636                    "templates (none, constant, cached, all)",
 637        },
 638        {
 639            .name = QCOW2_OPT_OVERLAP_TEMPLATE,
 640            .type = QEMU_OPT_STRING,
 641            .help = "Selects which overlap checks to perform from a range of "
 642                    "templates (none, constant, cached, all)",
 643        },
 644        {
 645            .name = QCOW2_OPT_OVERLAP_MAIN_HEADER,
 646            .type = QEMU_OPT_BOOL,
 647            .help = "Check for unintended writes into the main qcow2 header",
 648        },
 649        {
 650            .name = QCOW2_OPT_OVERLAP_ACTIVE_L1,
 651            .type = QEMU_OPT_BOOL,
 652            .help = "Check for unintended writes into the active L1 table",
 653        },
 654        {
 655            .name = QCOW2_OPT_OVERLAP_ACTIVE_L2,
 656            .type = QEMU_OPT_BOOL,
 657            .help = "Check for unintended writes into an active L2 table",
 658        },
 659        {
 660            .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
 661            .type = QEMU_OPT_BOOL,
 662            .help = "Check for unintended writes into the refcount table",
 663        },
 664        {
 665            .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
 666            .type = QEMU_OPT_BOOL,
 667            .help = "Check for unintended writes into a refcount block",
 668        },
 669        {
 670            .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
 671            .type = QEMU_OPT_BOOL,
 672            .help = "Check for unintended writes into the snapshot table",
 673        },
 674        {
 675            .name = QCOW2_OPT_OVERLAP_INACTIVE_L1,
 676            .type = QEMU_OPT_BOOL,
 677            .help = "Check for unintended writes into an inactive L1 table",
 678        },
 679        {
 680            .name = QCOW2_OPT_OVERLAP_INACTIVE_L2,
 681            .type = QEMU_OPT_BOOL,
 682            .help = "Check for unintended writes into an inactive L2 table",
 683        },
 684        {
 685            .name = QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY,
 686            .type = QEMU_OPT_BOOL,
 687            .help = "Check for unintended writes into the bitmap directory",
 688        },
 689        {
 690            .name = QCOW2_OPT_CACHE_SIZE,
 691            .type = QEMU_OPT_SIZE,
 692            .help = "Maximum combined metadata (L2 tables and refcount blocks) "
 693                    "cache size",
 694        },
 695        {
 696            .name = QCOW2_OPT_L2_CACHE_SIZE,
 697            .type = QEMU_OPT_SIZE,
 698            .help = "Maximum L2 table cache size",
 699        },
 700        {
 701            .name = QCOW2_OPT_L2_CACHE_ENTRY_SIZE,
 702            .type = QEMU_OPT_SIZE,
 703            .help = "Size of each entry in the L2 cache",
 704        },
 705        {
 706            .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE,
 707            .type = QEMU_OPT_SIZE,
 708            .help = "Maximum refcount block cache size",
 709        },
 710        {
 711            .name = QCOW2_OPT_CACHE_CLEAN_INTERVAL,
 712            .type = QEMU_OPT_NUMBER,
 713            .help = "Clean unused cache entries after this time (in seconds)",
 714        },
 715        BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",
 716            "ID of secret providing qcow2 AES key or LUKS passphrase"),
 717        { /* end of list */ }
 718    },
 719};
 720
 721static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
 722    [QCOW2_OL_MAIN_HEADER_BITNR]      = QCOW2_OPT_OVERLAP_MAIN_HEADER,
 723    [QCOW2_OL_ACTIVE_L1_BITNR]        = QCOW2_OPT_OVERLAP_ACTIVE_L1,
 724    [QCOW2_OL_ACTIVE_L2_BITNR]        = QCOW2_OPT_OVERLAP_ACTIVE_L2,
 725    [QCOW2_OL_REFCOUNT_TABLE_BITNR]   = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
 726    [QCOW2_OL_REFCOUNT_BLOCK_BITNR]   = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
 727    [QCOW2_OL_SNAPSHOT_TABLE_BITNR]   = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
 728    [QCOW2_OL_INACTIVE_L1_BITNR]      = QCOW2_OPT_OVERLAP_INACTIVE_L1,
 729    [QCOW2_OL_INACTIVE_L2_BITNR]      = QCOW2_OPT_OVERLAP_INACTIVE_L2,
 730    [QCOW2_OL_BITMAP_DIRECTORY_BITNR] = QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY,
 731};
 732
 733static void cache_clean_timer_cb(void *opaque)
 734{
 735    BlockDriverState *bs = opaque;
 736    BDRVQcow2State *s = bs->opaque;
 737    qcow2_cache_clean_unused(s->l2_table_cache);
 738    qcow2_cache_clean_unused(s->refcount_block_cache);
 739    timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
 740              (int64_t) s->cache_clean_interval * 1000);
 741}
 742
 743static void cache_clean_timer_init(BlockDriverState *bs, AioContext *context)
 744{
 745    BDRVQcow2State *s = bs->opaque;
 746    if (s->cache_clean_interval > 0) {
 747        s->cache_clean_timer = aio_timer_new(context, QEMU_CLOCK_VIRTUAL,
 748                                             SCALE_MS, cache_clean_timer_cb,
 749                                             bs);
 750        timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
 751                  (int64_t) s->cache_clean_interval * 1000);
 752    }
 753}
 754
 755static void cache_clean_timer_del(BlockDriverState *bs)
 756{
 757    BDRVQcow2State *s = bs->opaque;
 758    if (s->cache_clean_timer) {
 759        timer_del(s->cache_clean_timer);
 760        timer_free(s->cache_clean_timer);
 761        s->cache_clean_timer = NULL;
 762    }
 763}
 764
 765static void qcow2_detach_aio_context(BlockDriverState *bs)
 766{
 767    cache_clean_timer_del(bs);
 768}
 769
 770static void qcow2_attach_aio_context(BlockDriverState *bs,
 771                                     AioContext *new_context)
 772{
 773    cache_clean_timer_init(bs, new_context);
 774}
 775
 776static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
 777                             uint64_t *l2_cache_size,
 778                             uint64_t *l2_cache_entry_size,
 779                             uint64_t *refcount_cache_size, Error **errp)
 780{
 781    BDRVQcow2State *s = bs->opaque;
 782    uint64_t combined_cache_size, l2_cache_max_setting;
 783    bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set;
 784    int min_refcount_cache = MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;
 785    uint64_t virtual_disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
 786    uint64_t max_l2_cache = virtual_disk_size / (s->cluster_size / 8);
 787
 788    combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE);
 789    l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE);
 790    refcount_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
 791
 792    combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0);
 793    l2_cache_max_setting = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE,
 794                                             DEFAULT_L2_CACHE_MAX_SIZE);
 795    *refcount_cache_size = qemu_opt_get_size(opts,
 796                                             QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0);
 797
 798    *l2_cache_entry_size = qemu_opt_get_size(
 799        opts, QCOW2_OPT_L2_CACHE_ENTRY_SIZE, s->cluster_size);
 800
 801    *l2_cache_size = MIN(max_l2_cache, l2_cache_max_setting);
 802
 803    if (combined_cache_size_set) {
 804        if (l2_cache_size_set && refcount_cache_size_set) {
 805            error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE
 806                       " and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set "
 807                       "at the same time");
 808            return;
 809        } else if (l2_cache_size_set &&
 810                   (l2_cache_max_setting > combined_cache_size)) {
 811            error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed "
 812                       QCOW2_OPT_CACHE_SIZE);
 813            return;
 814        } else if (*refcount_cache_size > combined_cache_size) {
 815            error_setg(errp, QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not exceed "
 816                       QCOW2_OPT_CACHE_SIZE);
 817            return;
 818        }
 819
 820        if (l2_cache_size_set) {
 821            *refcount_cache_size = combined_cache_size - *l2_cache_size;
 822        } else if (refcount_cache_size_set) {
 823            *l2_cache_size = combined_cache_size - *refcount_cache_size;
 824        } else {
 825            /* Assign as much memory as possible to the L2 cache, and
 826             * use the remainder for the refcount cache */
 827            if (combined_cache_size >= max_l2_cache + min_refcount_cache) {
 828                *l2_cache_size = max_l2_cache;
 829                *refcount_cache_size = combined_cache_size - *l2_cache_size;
 830            } else {
 831                *refcount_cache_size =
 832                    MIN(combined_cache_size, min_refcount_cache);
 833                *l2_cache_size = combined_cache_size - *refcount_cache_size;
 834            }
 835        }
 836    }
 837    /* l2_cache_size and refcount_cache_size are ensured to have at least
 838     * their minimum values in qcow2_update_options_prepare() */
 839
 840    if (*l2_cache_entry_size < (1 << MIN_CLUSTER_BITS) ||
 841        *l2_cache_entry_size > s->cluster_size ||
 842        !is_power_of_2(*l2_cache_entry_size)) {
 843        error_setg(errp, "L2 cache entry size must be a power of two "
 844                   "between %d and the cluster size (%d)",
 845                   1 << MIN_CLUSTER_BITS, s->cluster_size);
 846        return;
 847    }
 848}
 849
 /*
  * Scratch state for a change of the qcow2 runtime options.  It is filled in
  * by qcow2_update_options_prepare() and afterwards either installed into
  * BDRVQcow2State by qcow2_update_options_commit() or released again by
  * qcow2_update_options_abort().
  */
 850typedef struct Qcow2ReopenState {
 851    Qcow2Cache *l2_table_cache; /* Newly created L2 table cache */
 852    Qcow2Cache *refcount_block_cache; /* Newly created refcount block cache */
 853    int l2_slice_size; /* Number of entries in a slice of the L2 table */
 854    bool use_lazy_refcounts; /* Requested lazy-refcounts setting */
 855    int overlap_check; /* Bitmask of the enabled overlap checks */
 856    bool discard_passthrough[QCOW2_DISCARD_MAX]; /* Indexed by QCOW2_DISCARD_* */
 857    uint64_t cache_clean_interval; /* Requested cache-clean-interval value */
 858    QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */
 859} Qcow2ReopenState;
 860
 /*
  * Parse the qcow2 runtime options contained in @options and stage the
  * resulting settings in @r.
  *
  * Nothing is installed into BDRVQcow2State here; the new caches, flags and
  * crypto options are only stored in @r.  As side effects, the currently
  * active L2 table and refcount block caches are flushed, and
  * qcow2_mark_clean() is called when lazy refcounts go from enabled to
  * disabled.
  *
  * @flags: open flags; BDRV_O_UNMAP provides the default value of the
  *         discard-request option.
  *
  * Returns 0 on success.  On failure an error code is returned and @errp is
  * set; @r may then already hold newly created caches, which
  * qcow2_update_options() releases through qcow2_update_options_abort().
  */
 861static int qcow2_update_options_prepare(BlockDriverState *bs,
 862                                        Qcow2ReopenState *r,
 863                                        QDict *options, int flags,
 864                                        Error **errp)
 865{
 866    BDRVQcow2State *s = bs->opaque;
 867    QemuOpts *opts = NULL;
 868    const char *opt_overlap_check, *opt_overlap_check_template;
 869    int overlap_check_template = 0;
 870    uint64_t l2_cache_size, l2_cache_entry_size, refcount_cache_size;
 871    int i;
 872    const char *encryptfmt;
 873    QDict *encryptopts = NULL;
 874    Error *local_err = NULL;
 875    int ret;
 876
 877    qdict_extract_subqdict(options, &encryptopts, "encrypt.");
 878    encryptfmt = qdict_get_try_str(encryptopts, "format");
 879
 880    opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort);
 881    qemu_opts_absorb_qdict(opts, options, &local_err);
 882    if (local_err) {
 883        error_propagate(errp, local_err);
 884        ret = -EINVAL;
 885        goto fail;
 886    }
 887
 888    /* get L2 table/refcount block cache size from command line options */
 889    read_cache_sizes(bs, opts, &l2_cache_size, &l2_cache_entry_size,
 890                     &refcount_cache_size, &local_err);
 891    if (local_err) {
 892        error_propagate(errp, local_err);
 893        ret = -EINVAL;
 894        goto fail;
 895    }
 896
        /* Convert the L2 cache size from bytes to a number of cache entries
         * and enforce the minimum; the count must also fit into an int */
 897    l2_cache_size /= l2_cache_entry_size;
 898    if (l2_cache_size < MIN_L2_CACHE_SIZE) {
 899        l2_cache_size = MIN_L2_CACHE_SIZE;
 900    }
 901    if (l2_cache_size > INT_MAX) {
 902        error_setg(errp, "L2 cache size too big");
 903        ret = -EINVAL;
 904        goto fail;
 905    }
 906
        /* Same for the refcount block cache, whose entries are whole clusters */
 907    refcount_cache_size /= s->cluster_size;
 908    if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) {
 909        refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE;
 910    }
 911    if (refcount_cache_size > INT_MAX) {
 912        error_setg(errp, "Refcount cache size too big");
 913        ret = -EINVAL;
 914        goto fail;
 915    }
 916
 917    /* alloc new L2 table/refcount block cache, flush old one */
 918    if (s->l2_table_cache) {
 919        ret = qcow2_cache_flush(bs, s->l2_table_cache);
 920        if (ret) {
 921            error_setg_errno(errp, -ret, "Failed to flush the L2 table cache");
 922            goto fail;
 923        }
 924    }
 925
 926    if (s->refcount_block_cache) {
 927        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 928        if (ret) {
 929            error_setg_errno(errp, -ret,
 930                             "Failed to flush the refcount block cache");
 931            goto fail;
 932        }
 933    }
 934
 935    r->l2_slice_size = l2_cache_entry_size / sizeof(uint64_t);
 936    r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size,
 937                                           l2_cache_entry_size);
 938    r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size,
 939                                                 s->cluster_size);
 940    if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) {
 941        error_setg(errp, "Could not allocate metadata caches");
 942        ret = -ENOMEM;
 943        goto fail;
 944    }
 945
 946    /* New interval for cache cleanup timer */
 947    r->cache_clean_interval =
 948        qemu_opt_get_number(opts, QCOW2_OPT_CACHE_CLEAN_INTERVAL,
 949                            DEFAULT_CACHE_CLEAN_INTERVAL);
 950#ifndef CONFIG_LINUX
 951    if (r->cache_clean_interval != 0) {
 952        error_setg(errp, QCOW2_OPT_CACHE_CLEAN_INTERVAL
 953                   " not supported on this host");
 954        ret = -EINVAL;
 955        goto fail;
 956    }
 957#endif
 958    if (r->cache_clean_interval > UINT_MAX) {
 959        error_setg(errp, "Cache clean interval too big");
 960        ret = -EINVAL;
 961        goto fail;
 962    }
 963
 964    /* lazy-refcounts; flush if going from enabled to disabled */
 965    r->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
 966        (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
 967    if (r->use_lazy_refcounts && s->qcow_version < 3) {
 968        error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
 969                   "qemu 1.1 compatibility level");
 970        ret = -EINVAL;
 971        goto fail;
 972    }
 973
 974    if (s->use_lazy_refcounts && !r->use_lazy_refcounts) {
 975        ret = qcow2_mark_clean(bs);
 976        if (ret < 0) {
 977            error_setg_errno(errp, -ret, "Failed to disable lazy refcounts");
 978            goto fail;
 979        }
 980    }
 981
 982    /* Overlap check options */
 983    opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP);
 984    opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE);
 985    if (opt_overlap_check_template && opt_overlap_check &&
 986        strcmp(opt_overlap_check_template, opt_overlap_check))
 987    {
 988        error_setg(errp, "Conflicting values for qcow2 options '"
 989                   QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE
 990                   "' ('%s')", opt_overlap_check, opt_overlap_check_template);
 991        ret = -EINVAL;
 992        goto fail;
 993    }
        /* Either option may carry the template name; "cached" is the default */
 994    if (!opt_overlap_check) {
 995        opt_overlap_check = opt_overlap_check_template ?: "cached";
 996    }
 997
 998    if (!strcmp(opt_overlap_check, "none")) {
 999        overlap_check_template = 0;
1000    } else if (!strcmp(opt_overlap_check, "constant")) {
1001        overlap_check_template = QCOW2_OL_CONSTANT;
1002    } else if (!strcmp(opt_overlap_check, "cached")) {
1003        overlap_check_template = QCOW2_OL_CACHED;
1004    } else if (!strcmp(opt_overlap_check, "all")) {
1005        overlap_check_template = QCOW2_OL_ALL;
1006    } else {
1007        error_setg(errp, "Unsupported value '%s' for qcow2 option "
1008                   "'overlap-check'. Allowed are any of the following: "
1009                   "none, constant, cached, all", opt_overlap_check);
1010        ret = -EINVAL;
1011        goto fail;
1012    }
1013
1014    r->overlap_check = 0;
1015    for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) {
1016        /* overlap-check defines a template bitmask, but every flag may be
1017         * overwritten through the associated boolean option */
1018        r->overlap_check |=
1019            qemu_opt_get_bool(opts, overlap_bool_option_names[i],
1020                              overlap_check_template & (1 << i)) << i;
1021    }
1022
        /* Discard passthrough: NEVER and ALWAYS are fixed, the other kinds are
         * configurable through their respective options */
1023    r->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
1024    r->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
1025    r->discard_passthrough[QCOW2_DISCARD_REQUEST] =
1026        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
1027                          flags & BDRV_O_UNMAP);
1028    r->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
1029        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
1030    r->discard_passthrough[QCOW2_DISCARD_OTHER] =
1031        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);
1032
        /* The encryption format requested in the options, if any, must agree
         * with the crypt method recorded in the image header */
1033    switch (s->crypt_method_header) {
1034    case QCOW_CRYPT_NONE:
1035        if (encryptfmt) {
1036            error_setg(errp, "No encryption in image header, but options "
1037                       "specified format '%s'", encryptfmt);
1038            ret = -EINVAL;
1039            goto fail;
1040        }
1041        break;
1042
1043    case QCOW_CRYPT_AES:
1044        if (encryptfmt && !g_str_equal(encryptfmt, "aes")) {
1045            error_setg(errp,
1046                       "Header reported 'aes' encryption format but "
1047                       "options specify '%s'", encryptfmt);
1048            ret = -EINVAL;
1049            goto fail;
1050        }
1051        qdict_put_str(encryptopts, "format", "qcow");
1052        r->crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
1053        break;
1054
1055    case QCOW_CRYPT_LUKS:
1056        if (encryptfmt && !g_str_equal(encryptfmt, "luks")) {
1057            error_setg(errp,
1058                       "Header reported 'luks' encryption format but "
1059                       "options specify '%s'", encryptfmt);
1060            ret = -EINVAL;
1061            goto fail;
1062        }
1063        qdict_put_str(encryptopts, "format", "luks");
1064        r->crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
1065        break;
1066
1067    default:
            /* No goto here: this case does not set r->crypto_opts, so the
             * check after the switch fails the call (assuming the caller
             * passed a zero-initialised @r) */
1068        error_setg(errp, "Unsupported encryption method %d",
1069                   s->crypt_method_header);
1070        break;
1071    }
1072    if (s->crypt_method_header != QCOW_CRYPT_NONE && !r->crypto_opts) {
1073        ret = -EINVAL;
1074        goto fail;
1075    }
1076
1077    ret = 0;
        /* Reached on success as well: the temporaries are always released */
1078fail:
1079    qobject_unref(encryptopts);
1080    qemu_opts_del(opts);
1081    opts = NULL;
1082    return ret;
1083}
1084
1085static void qcow2_update_options_commit(BlockDriverState *bs,
1086                                        Qcow2ReopenState *r)
1087{
1088    BDRVQcow2State *s = bs->opaque;
1089    int i;
1090
1091    if (s->l2_table_cache) {
1092        qcow2_cache_destroy(s->l2_table_cache);
1093    }
1094    if (s->refcount_block_cache) {
1095        qcow2_cache_destroy(s->refcount_block_cache);
1096    }
1097    s->l2_table_cache = r->l2_table_cache;
1098    s->refcount_block_cache = r->refcount_block_cache;
1099    s->l2_slice_size = r->l2_slice_size;
1100
1101    s->overlap_check = r->overlap_check;
1102    s->use_lazy_refcounts = r->use_lazy_refcounts;
1103
1104    for (i = 0; i < QCOW2_DISCARD_MAX; i++) {
1105        s->discard_passthrough[i] = r->discard_passthrough[i];
1106    }
1107
1108    if (s->cache_clean_interval != r->cache_clean_interval) {
1109        cache_clean_timer_del(bs);
1110        s->cache_clean_interval = r->cache_clean_interval;
1111        cache_clean_timer_init(bs, bdrv_get_aio_context(bs));
1112    }
1113
1114    qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
1115    s->crypto_opts = r->crypto_opts;
1116}
1117
1118static void qcow2_update_options_abort(BlockDriverState *bs,
1119                                       Qcow2ReopenState *r)
1120{
1121    if (r->l2_table_cache) {
1122        qcow2_cache_destroy(r->l2_table_cache);
1123    }
1124    if (r->refcount_block_cache) {
1125        qcow2_cache_destroy(r->refcount_block_cache);
1126    }
1127    qapi_free_QCryptoBlockOpenOptions(r->crypto_opts);
1128}
1129
1130static int qcow2_update_options(BlockDriverState *bs, QDict *options,
1131                                int flags, Error **errp)
1132{
1133    Qcow2ReopenState r = {};
1134    int ret;
1135
1136    ret = qcow2_update_options_prepare(bs, &r, options, flags, errp);
1137    if (ret >= 0) {
1138        qcow2_update_options_commit(bs, &r);
1139    } else {
1140        qcow2_update_options_abort(bs, &r);
1141    }
1142
1143    return ret;
1144}
1145
1146/*
 * Open a qcow2 image on top of bs->file: read and validate the image header,
 * derive the cluster/L2/refcount geometry, load the L1 table, apply the
 * runtime options from @options, then read the header extensions, the
 * backing file name, the internal snapshots and (unless the image is opened
 * inactive) the persistent dirty bitmaps.  An image marked dirty is repaired
 * when it is opened writable and neither BDRV_O_CHECK nor BDRV_O_INACTIVE is
 * set.
 *
 * Returns a non-negative value on success.  On failure a negative errno is
 * returned, @errp is set and the state allocated so far is released under
 * the fail label.
 *
 * Called with s->lock held.
 */
1147static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
1148                                      int flags, Error **errp)
1149{
1150    BDRVQcow2State *s = bs->opaque;
1151    unsigned int len, i;
1152    int ret = 0;
1153    QCowHeader header;
1154    Error *local_err = NULL;
1155    uint64_t ext_end;
1156    uint64_t l1_vm_state_index;
1157    bool update_header = false;
1158
1159    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
1160    if (ret < 0) {
1161        error_setg_errno(errp, -ret, "Could not read qcow2 header");
1162        goto fail;
1163    }
        /* The header is stored big-endian; convert the fields common to all
         * versions (the version 3 fields are converted further below) */
1164    header.magic = be32_to_cpu(header.magic);
1165    header.version = be32_to_cpu(header.version);
1166    header.backing_file_offset = be64_to_cpu(header.backing_file_offset);
1167    header.backing_file_size = be32_to_cpu(header.backing_file_size);
1168    header.size = be64_to_cpu(header.size);
1169    header.cluster_bits = be32_to_cpu(header.cluster_bits);
1170    header.crypt_method = be32_to_cpu(header.crypt_method);
1171    header.l1_table_offset = be64_to_cpu(header.l1_table_offset);
1172    header.l1_size = be32_to_cpu(header.l1_size);
1173    header.refcount_table_offset = be64_to_cpu(header.refcount_table_offset);
1174    header.refcount_table_clusters =
1175        be32_to_cpu(header.refcount_table_clusters);
1176    header.snapshots_offset = be64_to_cpu(header.snapshots_offset);
1177    header.nb_snapshots = be32_to_cpu(header.nb_snapshots);
1178
1179    if (header.magic != QCOW_MAGIC) {
1180        error_setg(errp, "Image is not in qcow2 format");
1181        ret = -EINVAL;
1182        goto fail;
1183    }
1184    if (header.version < 2 || header.version > 3) {
1185        error_setg(errp, "Unsupported qcow2 version %" PRIu32, header.version);
1186        ret = -ENOTSUP;
1187        goto fail;
1188    }
1189
1190    s->qcow_version = header.version;
1191
1192    /* Initialise cluster size */
1193    if (header.cluster_bits < MIN_CLUSTER_BITS ||
1194        header.cluster_bits > MAX_CLUSTER_BITS) {
1195        error_setg(errp, "Unsupported cluster size: 2^%" PRIu32,
1196                   header.cluster_bits);
1197        ret = -EINVAL;
1198        goto fail;
1199    }
1200
1201    s->cluster_bits = header.cluster_bits;
1202    s->cluster_size = 1 << s->cluster_bits;
1203    s->cluster_sectors = 1 << (s->cluster_bits - BDRV_SECTOR_BITS);
1204
1205    /* Initialise version 3 header fields */
1206    if (header.version == 2) {
            /* Version 2 images get fixed values for these fields */
1207        header.incompatible_features    = 0;
1208        header.compatible_features      = 0;
1209        header.autoclear_features       = 0;
1210        header.refcount_order           = 4;
1211        header.header_length            = 72;
1212    } else {
1213        header.incompatible_features =
1214            be64_to_cpu(header.incompatible_features);
1215        header.compatible_features = be64_to_cpu(header.compatible_features);
1216        header.autoclear_features = be64_to_cpu(header.autoclear_features);
1217        header.refcount_order = be32_to_cpu(header.refcount_order);
1218        header.header_length = be32_to_cpu(header.header_length);
1219
1220        if (header.header_length < 104) {
1221            error_setg(errp, "qcow2 header too short");
1222            ret = -EINVAL;
1223            goto fail;
1224        }
1225    }
1226
1227    if (header.header_length > s->cluster_size) {
1228        error_setg(errp, "qcow2 header exceeds cluster size");
1229        ret = -EINVAL;
1230        goto fail;
1231    }
1232
        /* Keep a copy of header bytes beyond the fields known to QCowHeader */
1233    if (header.header_length > sizeof(header)) {
1234        s->unknown_header_fields_size = header.header_length - sizeof(header);
1235        s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
1236        ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
1237                         s->unknown_header_fields_size);
1238        if (ret < 0) {
1239            error_setg_errno(errp, -ret, "Could not read unknown qcow2 header "
1240                             "fields");
1241            goto fail;
1242        }
1243    }
1244
1245    if (header.backing_file_offset > s->cluster_size) {
1246        error_setg(errp, "Invalid backing file offset");
1247        ret = -EINVAL;
1248        goto fail;
1249    }
1250
        /* Header extensions end where the backing file name starts or, without
         * a backing file name, at the end of the first cluster */
1251    if (header.backing_file_offset) {
1252        ext_end = header.backing_file_offset;
1253    } else {
1254        ext_end = 1 << header.cluster_bits;
1255    }
1256
1257    /* Handle feature bits */
1258    s->incompatible_features    = header.incompatible_features;
1259    s->compatible_features      = header.compatible_features;
1260    s->autoclear_features       = header.autoclear_features;
1261
1262    if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
            /* Read the extensions only to obtain the feature table for the
             * error message; errors of this read are ignored */
1263        void *feature_table = NULL;
1264        qcow2_read_extensions(bs, header.header_length, ext_end,
1265                              &feature_table, flags, NULL, NULL);
1266        report_unsupported_feature(errp, feature_table,
1267                                   s->incompatible_features &
1268                                   ~QCOW2_INCOMPAT_MASK);
1269        ret = -ENOTSUP;
1270        g_free(feature_table);
1271        goto fail;
1272    }
1273
1274    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
1275        /* Corrupt images may not be written to unless they are being repaired
1276         */
1277        if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
1278            error_setg(errp, "qcow2: Image is corrupt; cannot be opened "
1279                       "read/write");
1280            ret = -EACCES;
1281            goto fail;
1282        }
1283    }
1284
1285    /* Check support for various header values */
1286    if (header.refcount_order > 6) {
1287        error_setg(errp, "Reference count entry width too large; may not "
1288                   "exceed 64 bits");
1289        ret = -EINVAL;
1290        goto fail;
1291    }
1292    s->refcount_order = header.refcount_order;
1293    s->refcount_bits = 1 << s->refcount_order;
        /* Computes 2^refcount_bits - 1 in two steps so that the shift count
         * stays below 64 even for 64-bit refcounts */
1294    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
1295    s->refcount_max += s->refcount_max - 1;
1296
1297    s->crypt_method_header = header.crypt_method;
1298    if (s->crypt_method_header) {
1299        if (bdrv_uses_whitelist() &&
1300            s->crypt_method_header == QCOW_CRYPT_AES) {
1301            error_setg(errp,
1302                       "Use of AES-CBC encrypted qcow2 images is no longer "
1303                       "supported in system emulators");
1304            error_append_hint(errp,
1305                              "You can use 'qemu-img convert' to convert your "
1306                              "image to an alternative supported format, such "
1307                              "as unencrypted qcow2, or raw with the LUKS "
1308                              "format instead.\n");
1309            ret = -ENOSYS;
1310            goto fail;
1311        }
1312
1313        if (s->crypt_method_header == QCOW_CRYPT_AES) {
1314            s->crypt_physical_offset = false;
1315        } else {
1316            /* Assuming LUKS and any future crypt methods we
1317             * add will all use physical offsets, due to the
1318             * fact that the alternative is insecure...  */
1319            s->crypt_physical_offset = true;
1320        }
1321
1322        bs->encrypted = true;
1323    }
1324
1325    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
1326    s->l2_size = 1 << s->l2_bits;
1327    /* 2^(s->refcount_order - 3) is the refcount width in bytes */
1328    s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3);
1329    s->refcount_block_size = 1 << s->refcount_block_bits;
1330    bs->total_sectors = header.size / BDRV_SECTOR_SIZE;
1331    s->csize_shift = (62 - (s->cluster_bits - 8));
1332    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
1333    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
1334
1335    s->refcount_table_offset = header.refcount_table_offset;
1336    s->refcount_table_size =
1337        header.refcount_table_clusters << (s->cluster_bits - 3);
1338
1339    if (header.refcount_table_clusters == 0 && !(flags & BDRV_O_CHECK)) {
1340        error_setg(errp, "Image does not contain a reference count table");
1341        ret = -EINVAL;
1342        goto fail;
1343    }
1344
1345    ret = qcow2_validate_table(bs, s->refcount_table_offset,
1346                               header.refcount_table_clusters,
1347                               s->cluster_size, QCOW_MAX_REFTABLE_SIZE,
1348                               "Reference count table", errp);
1349    if (ret < 0) {
1350        goto fail;
1351    }
1352
1353    /* The total size in bytes of the snapshot table is checked in
1354     * qcow2_read_snapshots() because the size of each snapshot is
1355     * variable and we don't know it yet.
1356     * Here we only check the offset and number of snapshots. */
1357    ret = qcow2_validate_table(bs, header.snapshots_offset,
1358                               header.nb_snapshots,
1359                               sizeof(QCowSnapshotHeader),
1360                               sizeof(QCowSnapshotHeader) * QCOW_MAX_SNAPSHOTS,
1361                               "Snapshot table", errp);
1362    if (ret < 0) {
1363        goto fail;
1364    }
1365
1366    /* read the level 1 table */
1367    ret = qcow2_validate_table(bs, header.l1_table_offset,
1368                               header.l1_size, sizeof(uint64_t),
1369                               QCOW_MAX_L1_SIZE, "Active L1 table", errp);
1370    if (ret < 0) {
1371        goto fail;
1372    }
1373    s->l1_size = header.l1_size;
1374    s->l1_table_offset = header.l1_table_offset;
1375
1376    l1_vm_state_index = size_to_l1(s, header.size);
1377    if (l1_vm_state_index > INT_MAX) {
1378        error_setg(errp, "Image is too big");
1379        ret = -EFBIG;
1380        goto fail;
1381    }
1382    s->l1_vm_state_index = l1_vm_state_index;
1383
1384    /* the L1 table must contain at least enough entries to put
1385       header.size bytes */
1386    if (s->l1_size < s->l1_vm_state_index) {
1387        error_setg(errp, "L1 table is too small");
1388        ret = -EINVAL;
1389        goto fail;
1390    }
1391
1392    if (s->l1_size > 0) {
1393        s->l1_table = qemu_try_blockalign(bs->file->bs,
1394            ROUND_UP(s->l1_size * sizeof(uint64_t), 512));
1395        if (s->l1_table == NULL) {
1396            error_setg(errp, "Could not allocate L1 table");
1397            ret = -ENOMEM;
1398            goto fail;
1399        }
1400        ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
1401                         s->l1_size * sizeof(uint64_t));
1402        if (ret < 0) {
1403            error_setg_errno(errp, -ret, "Could not read L1 table");
1404            goto fail;
1405        }
            /* L1 entries are stored big-endian; convert them in place */
1406        for(i = 0;i < s->l1_size; i++) {
1407            s->l1_table[i] = be64_to_cpu(s->l1_table[i]);
1408        }
1409    }
1410
1411    /* Parse driver-specific options */
1412    ret = qcow2_update_options(bs, options, flags, errp);
1413    if (ret < 0) {
1414        goto fail;
1415    }
1416
1417    s->cluster_cache_offset = -1;
1418    s->flags = flags;
1419
1420    ret = qcow2_refcount_init(bs);
1421    if (ret != 0) {
1422        error_setg_errno(errp, -ret, "Could not initialize refcount handling");
1423        goto fail;
1424    }
1425
1426    QLIST_INIT(&s->cluster_allocs);
1427    QTAILQ_INIT(&s->discards);
1428
1429    /* read qcow2 extensions */
1430    if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL,
1431                              flags, &update_header, &local_err)) {
1432        error_propagate(errp, local_err);
1433        ret = -EINVAL;
1434        goto fail;
1435    }
1436
1437    /* qcow2_read_extension may have set up the crypto context
1438     * if the crypt method needs a header region, some methods
1439     * don't need header extensions, so must check here
1440     */
1441    if (s->crypt_method_header && !s->crypto) {
1442        if (s->crypt_method_header == QCOW_CRYPT_AES) {
1443            unsigned int cflags = 0;
1444            if (flags & BDRV_O_NO_IO) {
1445                cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
1446            }
1447            s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
1448                                           NULL, NULL, cflags, errp);
1449            if (!s->crypto) {
1450                ret = -EINVAL;
1451                goto fail;
1452            }
1453        } else if (!(flags & BDRV_O_NO_IO)) {
1454            error_setg(errp, "Missing CRYPTO header for crypt method %d",
1455                       s->crypt_method_header);
1456            ret = -EINVAL;
1457            goto fail;
1458        }
1459    }
1460
1461    /* read the backing file name */
1462    if (header.backing_file_offset != 0) {
1463        len = header.backing_file_size;
            /* The name must fit into the first cluster and, including the
             * terminating NUL, into bs->backing_file */
1464        if (len > MIN(1023, s->cluster_size - header.backing_file_offset) ||
1465            len >= sizeof(bs->backing_file)) {
1466            error_setg(errp, "Backing file name too long");
1467            ret = -EINVAL;
1468            goto fail;
1469        }
1470        ret = bdrv_pread(bs->file, header.backing_file_offset,
1471                         bs->backing_file, len);
1472        if (ret < 0) {
1473            error_setg_errno(errp, -ret, "Could not read backing file name");
1474            goto fail;
1475        }
1476        bs->backing_file[len] = '\0';
1477        s->image_backing_file = g_strdup(bs->backing_file);
1478    }
1479
1480    /* Internal snapshots */
1481    s->snapshots_offset = header.snapshots_offset;
1482    s->nb_snapshots = header.nb_snapshots;
1483
1484    ret = qcow2_read_snapshots(bs);
1485    if (ret < 0) {
1486        error_setg_errno(errp, -ret, "Could not read snapshots");
1487        goto fail;
1488    }
1489
1490    /* Clear unknown autoclear feature bits */
1491    update_header |= s->autoclear_features & ~QCOW2_AUTOCLEAR_MASK;
1492    update_header =
1493        update_header && !bs->read_only && !(flags & BDRV_O_INACTIVE);
1494    if (update_header) {
1495        s->autoclear_features &= QCOW2_AUTOCLEAR_MASK;
1496    }
1497
1498    /* == Handle persistent dirty bitmaps ==
1499     *
1500     * We want to load dirty bitmaps in three cases:
1501     *
1502     * 1. Normal open of the disk in active mode, not related to invalidation
1503     *    after migration.
1504     *
1505     * 2. Invalidation of the target vm after pre-copy phase of migration, if
1506     *    bitmaps are _not_ migrating through migration channel, i.e.
1507     *    'dirty-bitmaps' capability is disabled.
1508     *
1509     * 3. Invalidation of source vm after failed or canceled migration.
1510     *    This is a very interesting case. There are two possible types of
1511     *    bitmaps:
1512     *
1513     *    A. Stored on inactivation and removed. They should be loaded from the
1514     *       image.
1515     *
1516     *    B. Not stored: not-persistent bitmaps and bitmaps, migrated through
1517     *       the migration channel (with dirty-bitmaps capability).
1518     *
1519     *    On the other hand, there are two possible sub-cases:
1520     *
1521     *    3.1 disk was changed by somebody else while we were inactive. In this
1522     *        case all in-RAM dirty bitmaps (both persistent and not) are
1523     *        definitely invalid. And we don't have any method to determine
1524     *        this.
1525     *
1526     *        Simple and safe thing is to just drop all the bitmaps of type B on
1527     *        inactivation. But in this case we lose bitmaps in valid 3.2 case.
1528     *
1529     *        On the other hand, resuming source vm, if disk was already changed
1530     *        is a bad thing anyway: not only bitmaps, the whole vm state is
1531     *        out of sync with disk.
1532     *
1533     *        This means, that user or management tool, who for some reason
1534     *        decided to resume source vm, after disk was already changed by
1535     *        target vm, should at least drop all dirty bitmaps by hand.
1536     *
1537     *        So, we can ignore this case for now, but TODO: "generation"
1538     *        extension for qcow2, to determine, that image was changed after
1539     *        last inactivation. And if it is changed, we will drop (or at least
1540     *        mark as 'invalid' all the bitmaps of type B, both persistent
1541     *        and not).
1542     *
1543     *    3.2 disk was _not_ changed while we were inactive. Bitmaps may be
         *        saved
1544     *        to disk ('dirty-bitmaps' capability disabled), or not saved
1545     *        ('dirty-bitmaps' capability enabled), but we don't need to care
1546     *        of: let's load bitmaps as always: stored bitmaps will be loaded,
1547     *        and not stored has flag IN_USE=1 in the image and will be skipped
1548     *        on loading.
1549     *
1550     * One remaining possible case when we don't want to load bitmaps:
1551     *
1552     * 4. Open disk in inactive mode in target vm (bitmaps are migrating or
1553     *    will be loaded on invalidation, no need to try loading them before)
1554     */
1555
1556    if (!(bdrv_get_flags(bs) & BDRV_O_INACTIVE)) {
1557        /* It's case 1, 2 or 3.2. Or 3.1 which is BUG in management layer. */
1558        bool header_updated = qcow2_load_dirty_bitmaps(bs, &local_err);
1559
            /* No second header update is needed if loading already did one */
1560        update_header = update_header && !header_updated;
1561    }
1562    if (local_err != NULL) {
1563        error_propagate(errp, local_err);
1564        ret = -EINVAL;
1565        goto fail;
1566    }
1567
1568    if (update_header) {
1569        ret = qcow2_update_header(bs);
1570        if (ret < 0) {
1571            error_setg_errno(errp, -ret, "Could not update qcow2 header");
1572            goto fail;
1573        }
1574    }
1575
1576    bs->supported_zero_flags = header.version >= 3 ? BDRV_REQ_MAY_UNMAP : 0;
1577
1578    /* Repair image if dirty */
1579    if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only &&
1580        (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
1581        BdrvCheckResult result = {0};
1582
1583        ret = qcow2_co_check_locked(bs, &result,
1584                                    BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
1585        if (ret < 0 || result.check_errors) {
                /* The check itself ran but reported errors: fail with -EIO */
1586            if (ret >= 0) {
1587                ret = -EIO;
1588            }
1589            error_setg_errno(errp, -ret, "Could not repair dirty image");
1590            goto fail;
1591        }
1592    }
1593
1594#ifdef DEBUG_ALLOC
1595    {
1596        BdrvCheckResult result = {0};
1597        qcow2_check_refcounts(bs, &result, 0);
1598    }
1599#endif
1600
1601    qemu_co_queue_init(&s->compress_wait_queue);
1602
1603    return ret;
1604
        /* Error exit: release everything that may have been set up above */
1605 fail:
1606    g_free(s->unknown_header_fields);
1607    cleanup_unknown_header_ext(bs);
1608    qcow2_free_snapshots(bs);
1609    qcow2_refcount_close(bs);
1610    qemu_vfree(s->l1_table);
1611    /* else pre-write overlap checks in cache_destroy may crash */
1612    s->l1_table = NULL;
1613    cache_clean_timer_del(bs);
1614    if (s->l2_table_cache) {
1615        qcow2_cache_destroy(s->l2_table_cache);
1616    }
1617    if (s->refcount_block_cache) {
1618        qcow2_cache_destroy(s->refcount_block_cache);
1619    }
1620    qcrypto_block_free(s->crypto);
1621    qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
1622    return ret;
1623}
1624
/*
 * Argument/result bundle passed to qcow2_open_entry() so that the open can
 * run as a coroutine.
 */
typedef struct QCow2OpenCo {
    BlockDriverState *bs;   /* node being opened */
    QDict *options;         /* forwarded unchanged to qcow2_do_open() */
    int flags;              /* forwarded unchanged to qcow2_do_open() */
    Error **errp;           /* forwarded unchanged to qcow2_do_open() */
    int ret;                /* result of qcow2_do_open(); qcow2_open() seeds it
                             * with -EINPROGRESS and polls until it changes */
} QCow2OpenCo;
1632
1633static void coroutine_fn qcow2_open_entry(void *opaque)
1634{
1635    QCow2OpenCo *qoc = opaque;
1636    BDRVQcow2State *s = qoc->bs->opaque;
1637
1638    qemu_co_mutex_lock(&s->lock);
1639    qoc->ret = qcow2_do_open(qoc->bs, qoc->options, qoc->flags, qoc->errp);
1640    qemu_co_mutex_unlock(&s->lock);
1641}
1642
1643static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
1644                      Error **errp)
1645{
1646    BDRVQcow2State *s = bs->opaque;
1647    QCow2OpenCo qoc = {
1648        .bs = bs,
1649        .options = options,
1650        .flags = flags,
1651        .errp = errp,
1652        .ret = -EINPROGRESS
1653    };
1654
1655    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
1656                               false, errp);
1657    if (!bs->file) {
1658        return -EINVAL;
1659    }
1660
1661    /* Initialise locks */
1662    qemu_co_mutex_init(&s->lock);
1663
1664    if (qemu_in_coroutine()) {
1665        /* From bdrv_co_create.  */
1666        qcow2_open_entry(&qoc);
1667    } else {
1668        qemu_coroutine_enter(qemu_coroutine_create(qcow2_open_entry, &qoc));
1669        BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
1670    }
1671    return qoc.ret;
1672}
1673
1674static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
1675{
1676    BDRVQcow2State *s = bs->opaque;
1677
1678    if (bs->encrypted) {
1679        /* Encryption works on a sector granularity */
1680        bs->bl.request_alignment = qcrypto_block_get_sector_size(s->crypto);
1681    }
1682    bs->bl.pwrite_zeroes_alignment = s->cluster_size;
1683    bs->bl.pdiscard_alignment = s->cluster_size;
1684}
1685
1686static int qcow2_reopen_prepare(BDRVReopenState *state,
1687                                BlockReopenQueue *queue, Error **errp)
1688{
1689    Qcow2ReopenState *r;
1690    int ret;
1691
1692    r = g_new0(Qcow2ReopenState, 1);
1693    state->opaque = r;
1694
1695    ret = qcow2_update_options_prepare(state->bs, r, state->options,
1696                                       state->flags, errp);
1697    if (ret < 0) {
1698        goto fail;
1699    }
1700
1701    /* We need to write out any unwritten data if we reopen read-only. */
1702    if ((state->flags & BDRV_O_RDWR) == 0) {
1703        ret = qcow2_reopen_bitmaps_ro(state->bs, errp);
1704        if (ret < 0) {
1705            goto fail;
1706        }
1707
1708        ret = bdrv_flush(state->bs);
1709        if (ret < 0) {
1710            goto fail;
1711        }
1712
1713        ret = qcow2_mark_clean(state->bs);
1714        if (ret < 0) {
1715            goto fail;
1716        }
1717    }
1718
1719    return 0;
1720
1721fail:
1722    qcow2_update_options_abort(state->bs, r);
1723    g_free(r);
1724    return ret;
1725}
1726
1727static void qcow2_reopen_commit(BDRVReopenState *state)
1728{
1729    qcow2_update_options_commit(state->bs, state->opaque);
1730    g_free(state->opaque);
1731}
1732
1733static void qcow2_reopen_abort(BDRVReopenState *state)
1734{
1735    qcow2_update_options_abort(state->bs, state->opaque);
1736    g_free(state->opaque);
1737}
1738
1739static void qcow2_join_options(QDict *options, QDict *old_options)
1740{
1741    bool has_new_overlap_template =
1742        qdict_haskey(options, QCOW2_OPT_OVERLAP) ||
1743        qdict_haskey(options, QCOW2_OPT_OVERLAP_TEMPLATE);
1744    bool has_new_total_cache_size =
1745        qdict_haskey(options, QCOW2_OPT_CACHE_SIZE);
1746    bool has_all_cache_options;
1747
1748    /* New overlap template overrides all old overlap options */
1749    if (has_new_overlap_template) {
1750        qdict_del(old_options, QCOW2_OPT_OVERLAP);
1751        qdict_del(old_options, QCOW2_OPT_OVERLAP_TEMPLATE);
1752        qdict_del(old_options, QCOW2_OPT_OVERLAP_MAIN_HEADER);
1753        qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L1);
1754        qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L2);
1755        qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_TABLE);
1756        qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK);
1757        qdict_del(old_options, QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE);
1758        qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L1);
1759        qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L2);
1760    }
1761
1762    /* New total cache size overrides all old options */
1763    if (qdict_haskey(options, QCOW2_OPT_CACHE_SIZE)) {
1764        qdict_del(old_options, QCOW2_OPT_L2_CACHE_SIZE);
1765        qdict_del(old_options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
1766    }
1767
1768    qdict_join(options, old_options, false);
1769
1770    /*
1771     * If after merging all cache size options are set, an old total size is
1772     * overwritten. Do keep all options, however, if all three are new. The
1773     * resulting error message is what we want to happen.
1774     */
1775    has_all_cache_options =
1776        qdict_haskey(options, QCOW2_OPT_CACHE_SIZE) ||
1777        qdict_haskey(options, QCOW2_OPT_L2_CACHE_SIZE) ||
1778        qdict_haskey(options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
1779
1780    if (has_all_cache_options && !has_new_total_cache_size) {
1781        qdict_del(options, QCOW2_OPT_CACHE_SIZE);
1782    }
1783}
1784
1785static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
1786                                              bool want_zero,
1787                                              int64_t offset, int64_t count,
1788                                              int64_t *pnum, int64_t *map,
1789                                              BlockDriverState **file)
1790{
1791    BDRVQcow2State *s = bs->opaque;
1792    uint64_t cluster_offset;
1793    int index_in_cluster, ret;
1794    unsigned int bytes;
1795    int status = 0;
1796
1797    bytes = MIN(INT_MAX, count);
1798    qemu_co_mutex_lock(&s->lock);
1799    ret = qcow2_get_cluster_offset(bs, offset, &bytes, &cluster_offset);
1800    qemu_co_mutex_unlock(&s->lock);
1801    if (ret < 0) {
1802        return ret;
1803    }
1804
1805    *pnum = bytes;
1806
1807    if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED &&
1808        !s->crypto) {
1809        index_in_cluster = offset & (s->cluster_size - 1);
1810        *map = cluster_offset | index_in_cluster;
1811        *file = bs->file->bs;
1812        status |= BDRV_BLOCK_OFFSET_VALID;
1813    }
1814    if (ret == QCOW2_CLUSTER_ZERO_PLAIN || ret == QCOW2_CLUSTER_ZERO_ALLOC) {
1815        status |= BDRV_BLOCK_ZERO;
1816    } else if (ret != QCOW2_CLUSTER_UNALLOCATED) {
1817        status |= BDRV_BLOCK_DATA;
1818    }
1819    return status;
1820}
1821
1822static coroutine_fn int qcow2_handle_l2meta(BlockDriverState *bs,
1823                                            QCowL2Meta **pl2meta,
1824                                            bool link_l2)
1825{
1826    int ret = 0;
1827    QCowL2Meta *l2meta = *pl2meta;
1828
1829    while (l2meta != NULL) {
1830        QCowL2Meta *next;
1831
1832        if (link_l2) {
1833            ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
1834            if (ret) {
1835                goto out;
1836            }
1837        } else {
1838            qcow2_alloc_cluster_abort(bs, l2meta);
1839        }
1840
1841        /* Take the request off the list of running requests */
1842        if (l2meta->nb_clusters != 0) {
1843            QLIST_REMOVE(l2meta, next_in_flight);
1844        }
1845
1846        qemu_co_queue_restart_all(&l2meta->dependent_requests);
1847
1848        next = l2meta->next;
1849        g_free(l2meta);
1850        l2meta = next;
1851    }
1852out:
1853    *pl2meta = l2meta;
1854    return ret;
1855}
1856
/*
 * Read @bytes bytes starting at guest offset @offset into @qiov.
 *
 * The request is processed in chunks. For each chunk the cluster type is
 * looked up with qcow2_get_cluster_offset() (which also receives cur_bytes
 * by pointer and so may adjust the chunk length) and the data is produced
 * according to that type: from the backing file or zeroes for unallocated
 * clusters, zeroes for zero clusters, from the decompressed cluster cache
 * for compressed clusters, and from bs->file for normal clusters (through
 * a bounce buffer plus decryption for encrypted images).
 *
 * s->lock is held throughout except while bdrv_co_preadv() is in progress.
 * @flags is not used.
 *
 * Returns 0 on success, a negative errno value on failure.
 */
static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
                                        uint64_t bytes, QEMUIOVector *qiov,
                                        int flags)
{
    BDRVQcow2State *s = bs->opaque;
    int offset_in_cluster;
    int ret;
    unsigned int cur_bytes; /* number of bytes in current iteration */
    uint64_t cluster_offset = 0;
    uint64_t bytes_done = 0;
    QEMUIOVector hd_qiov;   /* view of the part of @qiov for this iteration */
    uint8_t *cluster_data = NULL; /* bounce buffer, encrypted images only */

    qemu_iovec_init(&hd_qiov, qiov->niov);

    qemu_co_mutex_lock(&s->lock);

    while (bytes != 0) {

        /* prepare next request */
        cur_bytes = MIN(bytes, INT_MAX);
        if (s->crypto) {
            /* never exceed the size of the bounce buffer allocated below */
            cur_bytes = MIN(cur_bytes,
                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
        }

        ret = qcow2_get_cluster_offset(bs, offset, &cur_bytes, &cluster_offset);
        if (ret < 0) {
            goto fail;
        }

        offset_in_cluster = offset_into_cluster(s, offset);

        /* point hd_qiov at the slice of @qiov this iteration fills */
        qemu_iovec_reset(&hd_qiov);
        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes);

        switch (ret) {
        case QCOW2_CLUSTER_UNALLOCATED:

            if (bs->backing) {
                BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
                /* drop the lock for the duration of the backing file read */
                qemu_co_mutex_unlock(&s->lock);
                ret = bdrv_co_preadv(bs->backing, offset, cur_bytes,
                                     &hd_qiov, 0);
                qemu_co_mutex_lock(&s->lock);
                if (ret < 0) {
                    goto fail;
                }
            } else {
                /* Note: in this case, no need to wait */
                qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes);
            }
            break;

        case QCOW2_CLUSTER_ZERO_PLAIN:
        case QCOW2_CLUSTER_ZERO_ALLOC:
            qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes);
            break;

        case QCOW2_CLUSTER_COMPRESSED:
            /* add AIO support for compressed blocks ? */
            ret = qcow2_decompress_cluster(bs, cluster_offset);
            if (ret < 0) {
                goto fail;
            }

            /* copy out of s->cluster_cache (presumably filled by the
             * decompression call above) */
            qemu_iovec_from_buf(&hd_qiov, 0,
                                s->cluster_cache + offset_in_cluster,
                                cur_bytes);
            break;

        case QCOW2_CLUSTER_NORMAL:
            /* a host offset that is not 512-byte aligned is an I/O error */
            if ((cluster_offset & 511) != 0) {
                ret = -EIO;
                goto fail;
            }

            if (bs->encrypted) {
                assert(s->crypto);

                /*
                 * For encrypted images, read everything into a temporary
                 * contiguous buffer on which the AES functions can work.
                 */
                if (!cluster_data) {
                    cluster_data =
                        qemu_try_blockalign(bs->file->bs,
                                            QCOW_MAX_CRYPT_CLUSTERS
                                            * s->cluster_size);
                    if (cluster_data == NULL) {
                        ret = -ENOMEM;
                        goto fail;
                    }
                }

                assert(cur_bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
                /* redirect the read into the bounce buffer */
                qemu_iovec_reset(&hd_qiov);
                qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes);
            }

            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
            /* drop the lock for the duration of the image file read */
            qemu_co_mutex_unlock(&s->lock);
            ret = bdrv_co_preadv(bs->file,
                                 cluster_offset + offset_in_cluster,
                                 cur_bytes, &hd_qiov, 0);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                goto fail;
            }
            if (bs->encrypted) {
                assert(s->crypto);
                assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
                assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
                /* the offset passed to the crypto layer is the host offset
                 * or the guest offset, selected by s->crypt_physical_offset */
                if (qcrypto_block_decrypt(s->crypto,
                                          (s->crypt_physical_offset ?
                                           cluster_offset + offset_in_cluster :
                                           offset),
                                          cluster_data,
                                          cur_bytes,
                                          NULL) < 0) {
                    ret = -EIO;
                    goto fail;
                }
                /* hand the decrypted data to the caller's vector */
                qemu_iovec_from_buf(qiov, bytes_done, cluster_data, cur_bytes);
            }
            break;

        default:
            g_assert_not_reached();
            ret = -EIO;
            goto fail;
        }

        bytes -= cur_bytes;
        offset += cur_bytes;
        bytes_done += cur_bytes;
    }
    ret = 0;

    /* reached on success as well as on failure; ret holds the result */
fail:
    qemu_co_mutex_unlock(&s->lock);

    qemu_iovec_destroy(&hd_qiov);
    qemu_vfree(cluster_data);

    return ret;
}
2004
2005/* Check if it's possible to merge a write request with the writing of
2006 * the data from the COW regions */
2007static bool merge_cow(uint64_t offset, unsigned bytes,
2008                      QEMUIOVector *hd_qiov, QCowL2Meta *l2meta)
2009{
2010    QCowL2Meta *m;
2011
2012    for (m = l2meta; m != NULL; m = m->next) {
2013        /* If both COW regions are empty then there's nothing to merge */
2014        if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) {
2015            continue;
2016        }
2017
2018        /* The data (middle) region must be immediately after the
2019         * start region */
2020        if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
2021            continue;
2022        }
2023
2024        /* The end region must be immediately after the data (middle)
2025         * region */
2026        if (m->offset + m->cow_end.offset != offset + bytes) {
2027            continue;
2028        }
2029
2030        /* Make sure that adding both COW regions to the QEMUIOVector
2031         * does not exceed IOV_MAX */
2032        if (hd_qiov->niov > IOV_MAX - 2) {
2033            continue;
2034        }
2035
2036        m->data_qiov = hd_qiov;
2037        return true;
2038    }
2039
2040    return false;
2041}
2042
/*
 * Write @bytes bytes from @qiov to guest offset @offset.
 *
 * The request is processed in chunks. For each chunk clusters are
 * allocated with qcow2_alloc_cluster_offset() (which receives cur_bytes by
 * pointer and so may adjust the chunk length), the data is encrypted into
 * a bounce buffer for encrypted images, the target range is checked with
 * qcow2_pre_write_overlap_check(), and the data is written to bs->file
 * unless merge_cow() attached it to a pending COW operation. Finally the
 * allocation is completed through qcow2_handle_l2meta().
 *
 * s->lock is held throughout except while bdrv_co_pwritev() is in
 * progress. @flags is not used.
 *
 * Returns 0 on success, a negative errno value on failure.
 */
static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
                                         uint64_t bytes, QEMUIOVector *qiov,
                                         int flags)
{
    BDRVQcow2State *s = bs->opaque;
    int offset_in_cluster;
    int ret;
    unsigned int cur_bytes; /* number of bytes in current iteration */
    uint64_t cluster_offset;
    QEMUIOVector hd_qiov;   /* data to be written in this iteration */
    uint64_t bytes_done = 0;
    uint8_t *cluster_data = NULL; /* bounce buffer, encrypted images only */
    QCowL2Meta *l2meta = NULL;

    trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);

    qemu_iovec_init(&hd_qiov, qiov->niov);

    s->cluster_cache_offset = -1; /* disable compressed cache */

    qemu_co_mutex_lock(&s->lock);

    while (bytes != 0) {

        l2meta = NULL;

        trace_qcow2_writev_start_part(qemu_coroutine_self());
        offset_in_cluster = offset_into_cluster(s, offset);
        cur_bytes = MIN(bytes, INT_MAX);
        if (bs->encrypted) {
            /* keep the chunk within the bounce buffer allocated below */
            cur_bytes = MIN(cur_bytes,
                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
                            - offset_in_cluster);
        }

        ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
                                         &cluster_offset, &l2meta);
        if (ret < 0) {
            goto fail;
        }

        assert((cluster_offset & 511) == 0);

        /* point hd_qiov at the slice of @qiov this iteration writes */
        qemu_iovec_reset(&hd_qiov);
        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes);

        if (bs->encrypted) {
            assert(s->crypto);
            if (!cluster_data) {
                cluster_data = qemu_try_blockalign(bs->file->bs,
                                                   QCOW_MAX_CRYPT_CLUSTERS
                                                   * s->cluster_size);
                if (cluster_data == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
            }

            assert(hd_qiov.size <=
                   QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
            /* copy the plaintext out of the caller's vector ... */
            qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);

            /* ... and encrypt it in place; the offset passed to the crypto
             * layer is the host offset or the guest offset, selected by
             * s->crypt_physical_offset */
            if (qcrypto_block_encrypt(s->crypto,
                                      (s->crypt_physical_offset ?
                                       cluster_offset + offset_in_cluster :
                                       offset),
                                      cluster_data,
                                      cur_bytes, NULL) < 0) {
                ret = -EIO;
                goto fail;
            }

            /* from here on write the bounce buffer, not the guest data */
            qemu_iovec_reset(&hd_qiov);
            qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes);
        }

        ret = qcow2_pre_write_overlap_check(bs, 0,
                cluster_offset + offset_in_cluster, cur_bytes);
        if (ret < 0) {
            goto fail;
        }

        /* If we need to do COW, check if it's possible to merge the
         * writing of the guest data together with that of the COW regions.
         * If it's not possible (or not necessary) then write the
         * guest data now. */
        if (!merge_cow(offset, cur_bytes, &hd_qiov, l2meta)) {
            /* drop the lock for the duration of the image file write */
            qemu_co_mutex_unlock(&s->lock);
            BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
            trace_qcow2_writev_data(qemu_coroutine_self(),
                                    cluster_offset + offset_in_cluster);
            ret = bdrv_co_pwritev(bs->file,
                                  cluster_offset + offset_in_cluster,
                                  cur_bytes, &hd_qiov, 0);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                goto fail;
            }
        }

        /* link the new clusters; on success this leaves l2meta == NULL */
        ret = qcow2_handle_l2meta(bs, &l2meta, true);
        if (ret) {
            goto fail;
        }

        bytes -= cur_bytes;
        offset += cur_bytes;
        bytes_done += cur_bytes;
        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
    }
    ret = 0;

    /* reached on success as well as on failure; ret holds the result */
fail:
    /* abort whatever allocations are still pending (none if l2meta is NULL) */
    qcow2_handle_l2meta(bs, &l2meta, false);

    qemu_co_mutex_unlock(&s->lock);

    qemu_iovec_destroy(&hd_qiov);
    qemu_vfree(cluster_data);
    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);

    return ret;
}
2166
2167static int qcow2_inactivate(BlockDriverState *bs)
2168{
2169    BDRVQcow2State *s = bs->opaque;
2170    int ret, result = 0;
2171    Error *local_err = NULL;
2172
2173    qcow2_store_persistent_dirty_bitmaps(bs, &local_err);
2174    if (local_err != NULL) {
2175        result = -EINVAL;
2176        error_reportf_err(local_err, "Lost persistent bitmaps during "
2177                          "inactivation of node '%s': ",
2178                          bdrv_get_device_or_node_name(bs));
2179    }
2180
2181    ret = qcow2_cache_flush(bs, s->l2_table_cache);
2182    if (ret) {
2183        result = ret;
2184        error_report("Failed to flush the L2 table cache: %s",
2185                     strerror(-ret));
2186    }
2187
2188    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
2189    if (ret) {
2190        result = ret;
2191        error_report("Failed to flush the refcount block cache: %s",
2192                     strerror(-ret));
2193    }
2194
2195    if (result == 0) {
2196        qcow2_mark_clean(bs);
2197    }
2198
2199    return result;
2200}
2201
/*
 * Release the driver state of @bs: inactivate the image unless it already
 * is inactive, then free the L1 table, caches, crypto context, unknown
 * header data, backing file strings, cluster buffers, refcount structures
 * and snapshots. The BDRVQcow2State itself is not freed here.
 */
static void qcow2_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    qemu_vfree(s->l1_table);
    /* else pre-write overlap checks in cache_destroy may crash */
    s->l1_table = NULL;

    /* write out bitmaps and caches while the caches still exist */
    if (!(s->flags & BDRV_O_INACTIVE)) {
        qcow2_inactivate(bs);
    }

    cache_clean_timer_del(bs);
    qcow2_cache_destroy(s->l2_table_cache);
    qcow2_cache_destroy(s->refcount_block_cache);

    qcrypto_block_free(s->crypto);
    s->crypto = NULL;

    g_free(s->unknown_header_fields);
    cleanup_unknown_header_ext(bs);

    g_free(s->image_backing_file);
    g_free(s->image_backing_format);

    g_free(s->cluster_cache);
    qemu_vfree(s->cluster_data);
    qcow2_refcount_close(bs);
    qcow2_free_snapshots(bs);
}
2231
/*
 * Drop all in-memory state of @bs and rebuild it from the image file by
 * closing the qcow2 layer, zeroing the BDRVQcow2State and running
 * qcow2_do_open() again with BDRV_O_INACTIVE cleared.
 *
 * The crypto context is detached before the close and re-attached after a
 * successful reopen. On failure an error is set in @errp and bs->drv is
 * set to NULL.
 */
static void coroutine_fn qcow2_co_invalidate_cache(BlockDriverState *bs,
                                                   Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    int flags = s->flags;
    QCryptoBlock *crypto = NULL;
    QDict *options;
    Error *local_err = NULL;
    int ret;

    /*
     * Backing files are read-only which makes all of their metadata immutable,
     * that means we don't have to worry about reopening them here.
     */

    /* hide the crypto context from qcow2_close() so it is not freed there */
    crypto = s->crypto;
    s->crypto = NULL;

    qcow2_close(bs);

    /*
     * NOTE(review): this also zeroes s->lock, which is locked below without
     * a visible re-initialisation; presumably an all-zero CoMutex is a valid
     * unlocked mutex -- TODO confirm.
     */
    memset(s, 0, sizeof(BDRVQcow2State));
    options = qdict_clone_shallow(bs->options);

    flags &= ~BDRV_O_INACTIVE;
    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_do_open(bs, options, flags, &local_err);
    qemu_co_mutex_unlock(&s->lock);
    qobject_unref(options);
    /*
     * NOTE(review): on both failure paths below the saved crypto pointer is
     * neither restored nor freed in this function.
     */
    if (local_err) {
        error_propagate_prepend(errp, local_err,
                                "Could not reopen qcow2 layer: ");
        bs->drv = NULL;
        return;
    } else if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not reopen qcow2 layer");
        bs->drv = NULL;
        return;
    }

    /* re-attach the context saved above, replacing whatever the reopen set */
    s->crypto = crypto;
}
2273
2274static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
2275    size_t len, size_t buflen)
2276{
2277    QCowExtension *ext_backing_fmt = (QCowExtension*) buf;
2278    size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);
2279
2280    if (buflen < ext_len) {
2281        return -ENOSPC;
2282    }
2283
2284    *ext_backing_fmt = (QCowExtension) {
2285        .magic  = cpu_to_be32(magic),
2286        .len    = cpu_to_be32(len),
2287    };
2288
2289    if (len) {
2290        memcpy(buf + sizeof(QCowExtension), s, len);
2291    }
2292
2293    return ext_len;
2294}
2295
/*
 * Updates the qcow2 header, including the variable length parts of it, i.e.
 * the backing file name and all extensions. qcow2 was not designed to allow
 * such changes, so if we run out of space (we can only use the first cluster)
 * this function may fail.
 *
 * The new header is assembled in a cluster-sized buffer in this order:
 * fixed header, unknown header fields, header extensions (backing format,
 * crypto header pointer, feature table, bitmaps, unknown extensions, end
 * marker) and finally the backing file name. The whole buffer is then
 * written to offset 0 of bs->file.
 *
 * Returns 0 on success, -errno in error cases.
 */
int qcow2_update_header(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    QCowHeader *header;
    char *buf;                          /* write cursor into the buffer */
    size_t buflen = s->cluster_size;    /* bytes left behind the cursor */
    int ret;
    uint64_t total_size;
    uint32_t refcount_table_clusters;
    size_t header_length;
    Qcow2UnknownHeaderExtension *uext;

    buf = qemu_blockalign(bs, buflen);

    /* Header structure */
    /* header keeps pointing at the start of the buffer while buf advances */
    header = (QCowHeader*) buf;

    if (buflen < sizeof(*header)) {
        ret = -ENOSPC;
        goto fail;
    }

    header_length = sizeof(*header) + s->unknown_header_fields_size;
    total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);

    /* backing_file_offset/size start as 0 and are filled in at the end */
    *header = (QCowHeader) {
        /* Version 2 fields */
        .magic                  = cpu_to_be32(QCOW_MAGIC),
        .version                = cpu_to_be32(s->qcow_version),
        .backing_file_offset    = 0,
        .backing_file_size      = 0,
        .cluster_bits           = cpu_to_be32(s->cluster_bits),
        .size                   = cpu_to_be64(total_size),
        .crypt_method           = cpu_to_be32(s->crypt_method_header),
        .l1_size                = cpu_to_be32(s->l1_size),
        .l1_table_offset        = cpu_to_be64(s->l1_table_offset),
        .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset),
        .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
        .nb_snapshots           = cpu_to_be32(s->nb_snapshots),
        .snapshots_offset       = cpu_to_be64(s->snapshots_offset),

        /* Version 3 fields */
        .incompatible_features  = cpu_to_be64(s->incompatible_features),
        .compatible_features    = cpu_to_be64(s->compatible_features),
        .autoclear_features     = cpu_to_be64(s->autoclear_features),
        .refcount_order         = cpu_to_be32(s->refcount_order),
        .header_length          = cpu_to_be32(header_length),
    };

    /* For older versions, write a shorter header */
    /* ret temporarily holds the size of the fixed header part */
    switch (s->qcow_version) {
    case 2:
        ret = offsetof(QCowHeader, incompatible_features);
        break;
    case 3:
        ret = sizeof(*header);
        break;
    default:
        ret = -EINVAL;
        goto fail;
    }

    /* skip the fixed header and zero everything behind it */
    buf += ret;
    buflen -= ret;
    memset(buf, 0, buflen);

    /* Preserve any unknown field in the header */
    if (s->unknown_header_fields_size) {
        if (buflen < s->unknown_header_fields_size) {
            ret = -ENOSPC;
            goto fail;
        }

        memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
        buf += s->unknown_header_fields_size;
        buflen -= s->unknown_header_fields_size;
    }

    /*
     * For every header_ext_add() call below: its size_t result is stored in
     * the int ret, so its -ENOSPC failure value shows up as ret < 0;
     * otherwise ret is the number of bytes consumed.
     */

    /* Backing file format header extension */
    if (s->image_backing_format) {
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
                             s->image_backing_format,
                             strlen(s->image_backing_format),
                             buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

    /* Full disk encryption header pointer extension */
    if (s->crypto_header.offset != 0) {
        /* byte-swap the in-memory copy in place just for serialisation ... */
        s->crypto_header.offset = cpu_to_be64(s->crypto_header.offset);
        s->crypto_header.length = cpu_to_be64(s->crypto_header.length);
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_CRYPTO_HEADER,
                             &s->crypto_header, sizeof(s->crypto_header),
                             buflen);
        /* ... and swap it back before looking at the result */
        s->crypto_header.offset = be64_to_cpu(s->crypto_header.offset);
        s->crypto_header.length = be64_to_cpu(s->crypto_header.length);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

    /* Feature table */
    if (s->qcow_version >= 3) {
        Qcow2Feature features[] = {
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_DIRTY_BITNR,
                .name = "dirty bit",
            },
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_CORRUPT_BITNR,
                .name = "corrupt bit",
            },
            {
                .type = QCOW2_FEAT_TYPE_COMPATIBLE,
                .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
                .name = "lazy refcounts",
            },
        };

        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
                             features, sizeof(features), buflen);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

    /* Bitmap extension */
    if (s->nb_bitmaps > 0) {
        Qcow2BitmapHeaderExt bitmaps_header = {
            .nb_bitmaps = cpu_to_be32(s->nb_bitmaps),
            .bitmap_directory_size =
                    cpu_to_be64(s->bitmap_directory_size),
            .bitmap_directory_offset =
                    cpu_to_be64(s->bitmap_directory_offset)
        };
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BITMAPS,
                             &bitmaps_header, sizeof(bitmaps_header),
                             buflen);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

    /* Keep unknown header extensions */
    QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
        ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

    /* End of header extensions */
    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
    if (ret < 0) {
        goto fail;
    }

    buf += ret;
    buflen -= ret;

    /* Backing file name */
    if (s->image_backing_file) {
        size_t backing_file_len = strlen(s->image_backing_file);

        if (buflen < backing_file_len) {
            ret = -ENOSPC;
            goto fail;
        }

        /* Using strncpy is ok here, since buf is not NUL-terminated. */
        strncpy(buf, s->image_backing_file, buflen);

        /* the name's offset is its distance from the start of the header */
        header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
        header->backing_file_size   = cpu_to_be32(backing_file_len);
    }

    /* Write the new header */
    ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
    /* reached on success as well as on failure; ret holds the result */
fail:
    /* header still points at the start of the allocation; buf has moved */
    qemu_vfree(header);
    return ret;
}
2508
2509static int qcow2_change_backing_file(BlockDriverState *bs,
2510    const char *backing_file, const char *backing_fmt)
2511{
2512    BDRVQcow2State *s = bs->opaque;
2513
2514    if (backing_file && strlen(backing_file) > 1023) {
2515        return -EINVAL;
2516    }
2517
2518    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2519    pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2520
2521    g_free(s->image_backing_file);
2522    g_free(s->image_backing_format);
2523
2524    s->image_backing_file = backing_file ? g_strdup(bs->backing_file) : NULL;
2525    s->image_backing_format = backing_fmt ? g_strdup(bs->backing_format) : NULL;
2526
2527    return qcow2_update_header(bs);
2528}
2529
2530static int qcow2_crypt_method_from_format(const char *encryptfmt)
2531{
2532    if (g_str_equal(encryptfmt, "luks")) {
2533        return QCOW_CRYPT_LUKS;
2534    } else if (g_str_equal(encryptfmt, "aes")) {
2535        return QCOW_CRYPT_AES;
2536    } else {
2537        return -EINVAL;
2538    }
2539}
2540
2541static int qcow2_set_up_encryption(BlockDriverState *bs,
2542                                   QCryptoBlockCreateOptions *cryptoopts,
2543                                   Error **errp)
2544{
2545    BDRVQcow2State *s = bs->opaque;
2546    QCryptoBlock *crypto = NULL;
2547    int fmt, ret;
2548
2549    switch (cryptoopts->format) {
2550    case Q_CRYPTO_BLOCK_FORMAT_LUKS:
2551        fmt = QCOW_CRYPT_LUKS;
2552        break;
2553    case Q_CRYPTO_BLOCK_FORMAT_QCOW:
2554        fmt = QCOW_CRYPT_AES;
2555        break;
2556    default:
2557        error_setg(errp, "Crypto format not supported in qcow2");
2558        return -EINVAL;
2559    }
2560
2561    s->crypt_method_header = fmt;
2562
2563    crypto = qcrypto_block_create(cryptoopts, "encrypt.",
2564                                  qcow2_crypto_hdr_init_func,
2565                                  qcow2_crypto_hdr_write_func,
2566                                  bs, errp);
2567    if (!crypto) {
2568        return -EINVAL;
2569    }
2570
2571    ret = qcow2_update_header(bs);
2572    if (ret < 0) {
2573        error_setg_errno(errp, -ret, "Could not write encryption header");
2574        goto out;
2575    }
2576
2577    ret = 0;
2578 out:
2579    qcrypto_block_free(crypto);
2580    return ret;
2581}
2582
2583/**
2584 * Preallocates metadata structures for data clusters between @offset (in the
2585 * guest disk) and @new_length (which is thus generally the new guest disk
2586 * size).
2587 *
2588 * Returns: 0 on success, -errno on failure.
2589 */
2590static int coroutine_fn preallocate_co(BlockDriverState *bs, uint64_t offset,
2591                                       uint64_t new_length)
2592{
2593    uint64_t bytes;
2594    uint64_t host_offset = 0;
2595    unsigned int cur_bytes;
2596    int ret;
2597    QCowL2Meta *meta;
2598
2599    assert(offset <= new_length);
2600    bytes = new_length - offset;
2601
2602    while (bytes) {
2603        cur_bytes = MIN(bytes, INT_MAX);
2604        ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
2605                                         &host_offset, &meta);
2606        if (ret < 0) {
2607            return ret;
2608        }
2609
2610        while (meta) {
2611            QCowL2Meta *next = meta->next;
2612
2613            ret = qcow2_alloc_cluster_link_l2(bs, meta);
2614            if (ret < 0) {
2615                qcow2_free_any_clusters(bs, meta->alloc_offset,
2616                                        meta->nb_clusters, QCOW2_DISCARD_NEVER);
2617                return ret;
2618            }
2619
2620            /* There are no dependent requests, but we need to remove our
2621             * request from the list of in-flight requests */
2622            QLIST_REMOVE(meta, next_in_flight);
2623
2624            g_free(meta);
2625            meta = next;
2626        }
2627
2628        /* TODO Preallocate data if requested */
2629
2630        bytes -= cur_bytes;
2631        offset += cur_bytes;
2632    }
2633
2634    /*
2635     * It is expected that the image file is large enough to actually contain
2636     * all of the allocated clusters (otherwise we get failing reads after
2637     * EOF). Extend the image to the last allocated sector.
2638     */
2639    if (host_offset != 0) {
2640        uint8_t data = 0;
2641        ret = bdrv_pwrite(bs->file, (host_offset + cur_bytes) - 1,
2642                          &data, 1);
2643        if (ret < 0) {
2644            return ret;
2645        }
2646    }
2647
2648    return 0;
2649}
2650
/* qcow2_refcount_metadata_size:
 * @clusters: number of clusters to refcount (including data and L1/L2 tables)
 * @cluster_size: size of a cluster, in bytes
 * @refcount_order: refcount bits power-of-2 exponent
 * @generous_increase: allow for the refcount table to be 1.5x as large as it
 *                     needs to be
 * @refblock_count: if not NULL, receives the number of refcount block
 *                  clusters that were computed
 *
 * Returns: Number of bytes required for refcount blocks and table metadata.
 */
int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
                                     int refcount_order, bool generous_increase,
                                     uint64_t *refblock_count)
{
    /*
     * Refcount metadata is itself reference-counted, so the amount needed
     * depends on its own size.  Rather than deriving a closed formula, the
     * block and table counts are recomputed until the total number of
     * clusters no longer changes between two rounds (a fixed point).
     */
    int64_t table_entries_per_cluster = cluster_size / sizeof(uint64_t);
    int64_t refcounts_per_block = cluster_size * 8 / (1 << refcount_order);
    int64_t table = 0;   /* refcount table clusters */
    int64_t blocks = 0;  /* refcount block clusters */
    int64_t total = 0;   /* clusters + blocks + table of the current round */
    int64_t prev_total;

    for (;;) {
        prev_total = total;

        blocks = DIV_ROUND_UP(clusters + table + blocks, refcounts_per_block);
        table = DIV_ROUND_UP(blocks, table_entries_per_cluster);
        total = clusters + blocks + table;

        if (total == prev_total && generous_increase) {
            /*
             * Converged once: pad the input by half the table size and
             * invalidate the total so that one more pass is attempted.
             */
            clusters += DIV_ROUND_UP(table, 2);
            generous_increase = false;
            total = 0;
        }

        if (total == prev_total) {
            break;
        }
    }

    if (refblock_count != NULL) {
        *refblock_count = blocks;
    }

    return (blocks + table) * cluster_size;
}
2699
/**
 * qcow2_calc_prealloc_size:
 * @total_size: virtual disk size in bytes
 * @cluster_size: cluster size in bytes
 * @refcount_order: refcount bits power-of-2 exponent
 *
 * Adds up the header cluster, the L2 tables, the L1 table, the refcount
 * metadata and the (cluster-aligned) guest data.
 *
 * Returns: Total number of bytes required for the fully allocated image
 * (including metadata).
 */
static int64_t qcow2_calc_prealloc_size(int64_t total_size,
                                        size_t cluster_size,
                                        int refcount_order)
{
    int64_t aligned_total_size = ROUND_UP(total_size, cluster_size);
    int64_t refcounted_clusters;
    uint64_t nl2e;
    uint64_t nl1e;
    int64_t meta_size;

    /* The header occupies exactly one cluster */
    meta_size = cluster_size;

    /* L2 tables: one entry per data cluster, padded to whole tables */
    nl2e = aligned_total_size / cluster_size;
    nl2e = ROUND_UP(nl2e, cluster_size / sizeof(uint64_t));
    meta_size += nl2e * sizeof(uint64_t);

    /* L1 table: one entry per L2 table, padded to whole clusters */
    nl1e = nl2e * sizeof(uint64_t) / cluster_size;
    nl1e = ROUND_UP(nl1e, cluster_size / sizeof(uint64_t));
    meta_size += nl1e * sizeof(uint64_t);

    /* Refcount table and blocks covering everything counted so far */
    refcounted_clusters = (meta_size + aligned_total_size) / cluster_size;
    meta_size += qcow2_refcount_metadata_size(refcounted_clusters,
                                              cluster_size, refcount_order,
                                              false, NULL);

    return meta_size + aligned_total_size;
}
2737
2738static bool validate_cluster_size(size_t cluster_size, Error **errp)
2739{
2740    int cluster_bits = ctz32(cluster_size);
2741    if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
2742        (1 << cluster_bits) != cluster_size)
2743    {
2744        error_setg(errp, "Cluster size must be a power of two between %d and "
2745                   "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
2746        return false;
2747    }
2748    return true;
2749}
2750
2751static size_t qcow2_opt_get_cluster_size_del(QemuOpts *opts, Error **errp)
2752{
2753    size_t cluster_size;
2754
2755    cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE,
2756                                         DEFAULT_CLUSTER_SIZE);
2757    if (!validate_cluster_size(cluster_size, errp)) {
2758        return 0;
2759    }
2760    return cluster_size;
2761}
2762
2763static int qcow2_opt_get_version_del(QemuOpts *opts, Error **errp)
2764{
2765    char *buf;
2766    int ret;
2767
2768    buf = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL);
2769    if (!buf) {
2770        ret = 3; /* default */
2771    } else if (!strcmp(buf, "0.10")) {
2772        ret = 2;
2773    } else if (!strcmp(buf, "1.1")) {
2774        ret = 3;
2775    } else {
2776        error_setg(errp, "Invalid compatibility level: '%s'", buf);
2777        ret = -EINVAL;
2778    }
2779    g_free(buf);
2780    return ret;
2781}
2782
2783static uint64_t qcow2_opt_get_refcount_bits_del(QemuOpts *opts, int version,
2784                                                Error **errp)
2785{
2786    uint64_t refcount_bits;
2787
2788    refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, 16);
2789    if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) {
2790        error_setg(errp, "Refcount width must be a power of two and may not "
2791                   "exceed 64 bits");
2792        return 0;
2793    }
2794
2795    if (version < 3 && refcount_bits != 16) {
2796        error_setg(errp, "Different refcount widths than 16 bits require "
2797                   "compatibility level 1.1 or above (use compat=1.1 or "
2798                   "greater)");
2799        return 0;
2800    }
2801
2802    return refcount_bits;
2803}
2804
2805static int coroutine_fn
2806qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
2807{
2808    BlockdevCreateOptionsQcow2 *qcow2_opts;
2809    QDict *options;
2810
2811    /*
2812     * Open the image file and write a minimal qcow2 header.
2813     *
2814     * We keep things simple and start with a zero-sized image. We also
2815     * do without refcount blocks or a L1 table for now. We'll fix the
2816     * inconsistency later.
2817     *
2818     * We do need a refcount table because growing the refcount table means
2819     * allocating two new refcount blocks - the seconds of which would be at
2820     * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
2821     * size for any qcow2 image.
2822     */
2823    BlockBackend *blk = NULL;
2824    BlockDriverState *bs = NULL;
2825    QCowHeader *header;
2826    size_t cluster_size;
2827    int version;
2828    int refcount_order;
2829    uint64_t* refcount_table;
2830    Error *local_err = NULL;
2831    int ret;
2832
2833    assert(create_options->driver == BLOCKDEV_DRIVER_QCOW2);
2834    qcow2_opts = &create_options->u.qcow2;
2835
2836    bs = bdrv_open_blockdev_ref(qcow2_opts->file, errp);
2837    if (bs == NULL) {
2838        return -EIO;
2839    }
2840
2841    /* Validate options and set default values */
2842    if (!QEMU_IS_ALIGNED(qcow2_opts->size, BDRV_SECTOR_SIZE)) {
2843        error_setg(errp, "Image size must be a multiple of 512 bytes");
2844        ret = -EINVAL;
2845        goto out;
2846    }
2847
2848    if (qcow2_opts->has_version) {
2849        switch (qcow2_opts->version) {
2850        case BLOCKDEV_QCOW2_VERSION_V2:
2851            version = 2;
2852            break;
2853        case BLOCKDEV_QCOW2_VERSION_V3:
2854            version = 3;
2855            break;
2856        default:
2857            g_assert_not_reached();
2858        }
2859    } else {
2860        version = 3;
2861    }
2862
2863    if (qcow2_opts->has_cluster_size) {
2864        cluster_size = qcow2_opts->cluster_size;
2865    } else {
2866        cluster_size = DEFAULT_CLUSTER_SIZE;
2867    }
2868
2869    if (!validate_cluster_size(cluster_size, errp)) {
2870        ret = -EINVAL;
2871        goto out;
2872    }
2873
2874    if (!qcow2_opts->has_preallocation) {
2875        qcow2_opts->preallocation = PREALLOC_MODE_OFF;
2876    }
2877    if (qcow2_opts->has_backing_file &&
2878        qcow2_opts->preallocation != PREALLOC_MODE_OFF)
2879    {
2880        error_setg(errp, "Backing file and preallocation cannot be used at "
2881                   "the same time");
2882        ret = -EINVAL;
2883        goto out;
2884    }
2885    if (qcow2_opts->has_backing_fmt && !qcow2_opts->has_backing_file) {
2886        error_setg(errp, "Backing format cannot be used without backing file");
2887        ret = -EINVAL;
2888        goto out;
2889    }
2890
2891    if (!qcow2_opts->has_lazy_refcounts) {
2892        qcow2_opts->lazy_refcounts = false;
2893    }
2894    if (version < 3 && qcow2_opts->lazy_refcounts) {
2895        error_setg(errp, "Lazy refcounts only supported with compatibility "
2896                   "level 1.1 and above (use version=v3 or greater)");
2897        ret = -EINVAL;
2898        goto out;
2899    }
2900
2901    if (!qcow2_opts->has_refcount_bits) {
2902        qcow2_opts->refcount_bits = 16;
2903    }
2904    if (qcow2_opts->refcount_bits > 64 ||
2905        !is_power_of_2(qcow2_opts->refcount_bits))
2906    {
2907        error_setg(errp, "Refcount width must be a power of two and may not "
2908                   "exceed 64 bits");
2909        ret = -EINVAL;
2910        goto out;
2911    }
2912    if (version < 3 && qcow2_opts->refcount_bits != 16) {
2913        error_setg(errp, "Different refcount widths than 16 bits require "
2914                   "compatibility level 1.1 or above (use version=v3 or "
2915                   "greater)");
2916        ret = -EINVAL;
2917        goto out;
2918    }
2919    refcount_order = ctz32(qcow2_opts->refcount_bits);
2920
2921
2922    /* Create BlockBackend to write to the image */
2923    blk = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
2924    ret = blk_insert_bs(blk, bs, errp);
2925    if (ret < 0) {
2926        goto out;
2927    }
2928    blk_set_allow_write_beyond_eof(blk, true);
2929
2930    /* Clear the protocol layer and preallocate it if necessary */
2931    ret = blk_truncate(blk, 0, PREALLOC_MODE_OFF, errp);
2932    if (ret < 0) {
2933        goto out;
2934    }
2935
2936    if (qcow2_opts->preallocation == PREALLOC_MODE_FULL ||
2937        qcow2_opts->preallocation == PREALLOC_MODE_FALLOC)
2938    {
2939        int64_t prealloc_size =
2940            qcow2_calc_prealloc_size(qcow2_opts->size, cluster_size,
2941                                     refcount_order);
2942
2943        ret = blk_truncate(blk, prealloc_size, qcow2_opts->preallocation, errp);
2944        if (ret < 0) {
2945            goto out;
2946        }
2947    }
2948
2949    /* Write the header */
2950    QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header));
2951    header = g_malloc0(cluster_size);
2952    *header = (QCowHeader) {
2953        .magic                      = cpu_to_be32(QCOW_MAGIC),
2954        .version                    = cpu_to_be32(version),
2955        .cluster_bits               = cpu_to_be32(ctz32(cluster_size)),
2956        .size                       = cpu_to_be64(0),
2957        .l1_table_offset            = cpu_to_be64(0),
2958        .l1_size                    = cpu_to_be32(0),
2959        .refcount_table_offset      = cpu_to_be64(cluster_size),
2960        .refcount_table_clusters    = cpu_to_be32(1),
2961        .refcount_order             = cpu_to_be32(refcount_order),
2962        .header_length              = cpu_to_be32(sizeof(*header)),
2963    };
2964
2965    /* We'll update this to correct value later */
2966    header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
2967
2968    if (qcow2_opts->lazy_refcounts) {
2969        header->compatible_features |=
2970            cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
2971    }
2972
2973    ret = blk_pwrite(blk, 0, header, cluster_size, 0);
2974    g_free(header);
2975    if (ret < 0) {
2976        error_setg_errno(errp, -ret, "Could not write qcow2 header");
2977        goto out;
2978    }
2979
2980    /* Write a refcount table with one refcount block */
2981    refcount_table = g_malloc0(2 * cluster_size);
2982    refcount_table[0] = cpu_to_be64(2 * cluster_size);
2983    ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size, 0);
2984    g_free(refcount_table);
2985
2986    if (ret < 0) {
2987        error_setg_errno(errp, -ret, "Could not write refcount table");
2988        goto out;
2989    }
2990
2991    blk_unref(blk);
2992    blk = NULL;
2993
2994    /*
2995     * And now open the image and make it consistent first (i.e. increase the
2996     * refcount of the cluster that is occupied by the header and the refcount
2997     * table)
2998     */
2999    options = qdict_new();
3000    qdict_put_str(options, "driver", "qcow2");
3001    qdict_put_str(options, "file", bs->node_name);
3002    blk = blk_new_open(NULL, NULL, options,
3003                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH,
3004                       &local_err);
3005    if (blk == NULL) {
3006        error_propagate(errp, local_err);
3007        ret = -EIO;
3008        goto out;
3009    }
3010
3011    ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size);
3012    if (ret < 0) {
3013        error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 "
3014                         "header and refcount table");
3015        goto out;
3016
3017    } else if (ret != 0) {
3018        error_report("Huh, first cluster in empty image is already in use?");
3019        abort();
3020    }
3021
3022    /* Create a full header (including things like feature table) */
3023    ret = qcow2_update_header(blk_bs(blk));
3024    if (ret < 0) {
3025        error_setg_errno(errp, -ret, "Could not update qcow2 header");
3026        goto out;
3027    }
3028
3029    /* Okay, now that we have a valid image, let's give it the right size */
3030    ret = blk_truncate(blk, qcow2_opts->size, PREALLOC_MODE_OFF, errp);
3031    if (ret < 0) {
3032        error_prepend(errp, "Could not resize image: ");
3033        goto out;
3034    }
3035
3036    /* Want a backing file? There you go.*/
3037    if (qcow2_opts->has_backing_file) {
3038        const char *backing_format = NULL;
3039
3040        if (qcow2_opts->has_backing_fmt) {
3041            backing_format = BlockdevDriver_str(qcow2_opts->backing_fmt);
3042        }
3043
3044        ret = bdrv_change_backing_file(blk_bs(blk), qcow2_opts->backing_file,
3045                                       backing_format);
3046        if (ret < 0) {
3047            error_setg_errno(errp, -ret, "Could not assign backing file '%s' "
3048                             "with format '%s'", qcow2_opts->backing_file,
3049                             backing_format);
3050            goto out;
3051        }
3052    }
3053
3054    /* Want encryption? There you go. */
3055    if (qcow2_opts->has_encrypt) {
3056        ret = qcow2_set_up_encryption(blk_bs(blk), qcow2_opts->encrypt, errp);
3057        if (ret < 0) {
3058            goto out;
3059        }
3060    }
3061
3062    /* And if we're supposed to preallocate metadata, do that now */
3063    if (qcow2_opts->preallocation != PREALLOC_MODE_OFF) {
3064        BDRVQcow2State *s = blk_bs(blk)->opaque;
3065        qemu_co_mutex_lock(&s->lock);
3066        ret = preallocate_co(blk_bs(blk), 0, qcow2_opts->size);
3067        qemu_co_mutex_unlock(&s->lock);
3068
3069        if (ret < 0) {
3070            error_setg_errno(errp, -ret, "Could not preallocate metadata");
3071            goto out;
3072        }
3073    }
3074
3075    blk_unref(blk);
3076    blk = NULL;
3077
3078    /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning.
3079     * Using BDRV_O_NO_IO, since encryption is now setup we don't want to
3080     * have to setup decryption context. We're not doing any I/O on the top
3081     * level BlockDriverState, only lower layers, where BDRV_O_NO_IO does
3082     * not have effect.
3083     */
3084    options = qdict_new();
3085    qdict_put_str(options, "driver", "qcow2");
3086    qdict_put_str(options, "file", bs->node_name);
3087    blk = blk_new_open(NULL, NULL, options,
3088                       BDRV_O_RDWR | BDRV_O_NO_BACKING | BDRV_O_NO_IO,
3089                       &local_err);
3090    if (blk == NULL) {
3091        error_propagate(errp, local_err);
3092        ret = -EIO;
3093        goto out;
3094    }
3095
3096    ret = 0;
3097out:
3098    blk_unref(blk);
3099    bdrv_unref(bs);
3100    return ret;
3101}
3102
3103static int coroutine_fn qcow2_co_create_opts(const char *filename, QemuOpts *opts,
3104                                             Error **errp)
3105{
3106    BlockdevCreateOptions *create_options = NULL;
3107    QDict *qdict;
3108    Visitor *v;
3109    BlockDriverState *bs = NULL;
3110    Error *local_err = NULL;
3111    const char *val;
3112    int ret;
3113
3114    /* Only the keyval visitor supports the dotted syntax needed for
3115     * encryption, so go through a QDict before getting a QAPI type. Ignore
3116     * options meant for the protocol layer so that the visitor doesn't
3117     * complain. */
3118    qdict = qemu_opts_to_qdict_filtered(opts, NULL, bdrv_qcow2.create_opts,
3119                                        true);
3120
3121    /* Handle encryption options */
3122    val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT);
3123    if (val && !strcmp(val, "on")) {
3124        qdict_put_str(qdict, BLOCK_OPT_ENCRYPT, "qcow");
3125    } else if (val && !strcmp(val, "off")) {
3126        qdict_del(qdict, BLOCK_OPT_ENCRYPT);
3127    }
3128
3129    val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT);
3130    if (val && !strcmp(val, "aes")) {
3131        qdict_put_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT, "qcow");
3132    }
3133
3134    /* Convert compat=0.10/1.1 into compat=v2/v3, to be renamed into
3135     * version=v2/v3 below. */
3136    val = qdict_get_try_str(qdict, BLOCK_OPT_COMPAT_LEVEL);
3137    if (val && !strcmp(val, "0.10")) {
3138        qdict_put_str(qdict, BLOCK_OPT_COMPAT_LEVEL, "v2");
3139    } else if (val && !strcmp(val, "1.1")) {
3140        qdict_put_str(qdict, BLOCK_OPT_COMPAT_LEVEL, "v3");
3141    }
3142
3143    /* Change legacy command line options into QMP ones */
3144    static const QDictRenames opt_renames[] = {
3145        { BLOCK_OPT_BACKING_FILE,       "backing-file" },
3146        { BLOCK_OPT_BACKING_FMT,        "backing-fmt" },
3147        { BLOCK_OPT_CLUSTER_SIZE,       "cluster-size" },
3148        { BLOCK_OPT_LAZY_REFCOUNTS,     "lazy-refcounts" },
3149        { BLOCK_OPT_REFCOUNT_BITS,      "refcount-bits" },
3150        { BLOCK_OPT_ENCRYPT,            BLOCK_OPT_ENCRYPT_FORMAT },
3151        { BLOCK_OPT_COMPAT_LEVEL,       "version" },
3152        { NULL, NULL },
3153    };
3154
3155    if (!qdict_rename_keys(qdict, opt_renames, errp)) {
3156        ret = -EINVAL;
3157        goto finish;
3158    }
3159
3160    /* Create and open the file (protocol layer) */
3161    ret = bdrv_create_file(filename, opts, errp);
3162    if (ret < 0) {
3163        goto finish;
3164    }
3165
3166    bs = bdrv_open(filename, NULL, NULL,
3167                   BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
3168    if (bs == NULL) {
3169        ret = -EIO;
3170        goto finish;
3171    }
3172
3173    /* Set 'driver' and 'node' options */
3174    qdict_put_str(qdict, "driver", "qcow2");
3175    qdict_put_str(qdict, "file", bs->node_name);
3176
3177    /* Now get the QAPI type BlockdevCreateOptions */
3178    v = qobject_input_visitor_new_flat_confused(qdict, errp);
3179    if (!v) {
3180        ret = -EINVAL;
3181        goto finish;
3182    }
3183
3184    visit_type_BlockdevCreateOptions(v, NULL, &create_options, &local_err);
3185    visit_free(v);
3186
3187    if (local_err) {
3188        error_propagate(errp, local_err);
3189        ret = -EINVAL;
3190        goto finish;
3191    }
3192
3193    /* Silently round up size */
3194    create_options->u.qcow2.size = ROUND_UP(create_options->u.qcow2.size,
3195                                            BDRV_SECTOR_SIZE);
3196
3197    /* Create the qcow2 image (format layer) */
3198    ret = qcow2_co_create(create_options, errp);
3199    if (ret < 0) {
3200        goto finish;
3201    }
3202
3203    ret = 0;
3204finish:
3205    qobject_unref(qdict);
3206    bdrv_unref(bs);
3207    qapi_free_BlockdevCreateOptions(create_options);
3208    return ret;
3209}
3210
3211
3212static bool is_zero(BlockDriverState *bs, int64_t offset, int64_t bytes)
3213{
3214    int64_t nr;
3215    int res;
3216
3217    /* Clamp to image length, before checking status of underlying sectors */
3218    if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
3219        bytes = bs->total_sectors * BDRV_SECTOR_SIZE - offset;
3220    }
3221
3222    if (!bytes) {
3223        return true;
3224    }
3225    res = bdrv_block_status_above(bs, NULL, offset, bytes, &nr, NULL, NULL);
3226    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == bytes;
3227}
3228
3229static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
3230    int64_t offset, int bytes, BdrvRequestFlags flags)
3231{
3232    int ret;
3233    BDRVQcow2State *s = bs->opaque;
3234
3235    uint32_t head = offset % s->cluster_size;
3236    uint32_t tail = (offset + bytes) % s->cluster_size;
3237
3238    trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, bytes);
3239    if (offset + bytes == bs->total_sectors * BDRV_SECTOR_SIZE) {
3240        tail = 0;
3241    }
3242
3243    if (head || tail) {
3244        uint64_t off;
3245        unsigned int nr;
3246
3247        assert(head + bytes <= s->cluster_size);
3248
3249        /* check whether remainder of cluster already reads as zero */
3250        if (!(is_zero(bs, offset - head, head) &&
3251              is_zero(bs, offset + bytes,
3252                      tail ? s->cluster_size - tail : 0))) {
3253            return -ENOTSUP;
3254        }
3255
3256        qemu_co_mutex_lock(&s->lock);
3257        /* We can have new write after previous check */
3258        offset = QEMU_ALIGN_DOWN(offset, s->cluster_size);
3259        bytes = s->cluster_size;
3260        nr = s->cluster_size;
3261        ret = qcow2_get_cluster_offset(bs, offset, &nr, &off);
3262        if (ret != QCOW2_CLUSTER_UNALLOCATED &&
3263            ret != QCOW2_CLUSTER_ZERO_PLAIN &&
3264            ret != QCOW2_CLUSTER_ZERO_ALLOC) {
3265            qemu_co_mutex_unlock(&s->lock);
3266            return -ENOTSUP;
3267        }
3268    } else {
3269        qemu_co_mutex_lock(&s->lock);
3270    }
3271
3272    trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, bytes);
3273
3274    /* Whatever is left can use real zero clusters */
3275    ret = qcow2_cluster_zeroize(bs, offset, bytes, flags);
3276    qemu_co_mutex_unlock(&s->lock);
3277
3278    return ret;
3279}
3280
3281static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
3282                                          int64_t offset, int bytes)
3283{
3284    int ret;
3285    BDRVQcow2State *s = bs->opaque;
3286
3287    if (!QEMU_IS_ALIGNED(offset | bytes, s->cluster_size)) {
3288        assert(bytes < s->cluster_size);
3289        /* Ignore partial clusters, except for the special case of the
3290         * complete partial cluster at the end of an unaligned file */
3291        if (!QEMU_IS_ALIGNED(offset, s->cluster_size) ||
3292            offset + bytes != bs->total_sectors * BDRV_SECTOR_SIZE) {
3293            return -ENOTSUP;
3294        }
3295    }
3296
3297    qemu_co_mutex_lock(&s->lock);
3298    ret = qcow2_cluster_discard(bs, offset, bytes, QCOW2_DISCARD_REQUEST,
3299                                false);
3300    qemu_co_mutex_unlock(&s->lock);
3301    return ret;
3302}
3303
3304static int coroutine_fn
3305qcow2_co_copy_range_from(BlockDriverState *bs,
3306                         BdrvChild *src, uint64_t src_offset,
3307                         BdrvChild *dst, uint64_t dst_offset,
3308                         uint64_t bytes, BdrvRequestFlags read_flags,
3309                         BdrvRequestFlags write_flags)
3310{
3311    BDRVQcow2State *s = bs->opaque;
3312    int ret;
3313    unsigned int cur_bytes; /* number of bytes in current iteration */
3314    BdrvChild *child = NULL;
3315    BdrvRequestFlags cur_write_flags;
3316
3317    assert(!bs->encrypted);
3318    qemu_co_mutex_lock(&s->lock);
3319
3320    while (bytes != 0) {
3321        uint64_t copy_offset = 0;
3322        /* prepare next request */
3323        cur_bytes = MIN(bytes, INT_MAX);
3324        cur_write_flags = write_flags;
3325
3326        ret = qcow2_get_cluster_offset(bs, src_offset, &cur_bytes, &copy_offset);
3327        if (ret < 0) {
3328            goto out;
3329        }
3330
3331        switch (ret) {
3332        case QCOW2_CLUSTER_UNALLOCATED:
3333            if (bs->backing && bs->backing->bs) {
3334                int64_t backing_length = bdrv_getlength(bs->backing->bs);
3335                if (src_offset >= backing_length) {
3336                    cur_write_flags |= BDRV_REQ_ZERO_WRITE;
3337                } else {
3338                    child = bs->backing;
3339                    cur_bytes = MIN(cur_bytes, backing_length - src_offset);
3340                    copy_offset = src_offset;
3341                }
3342            } else {
3343                cur_write_flags |= BDRV_REQ_ZERO_WRITE;
3344            }
3345            break;
3346
3347        case QCOW2_CLUSTER_ZERO_PLAIN:
3348        case QCOW2_CLUSTER_ZERO_ALLOC:
3349            cur_write_flags |= BDRV_REQ_ZERO_WRITE;
3350            break;
3351
3352        case QCOW2_CLUSTER_COMPRESSED:
3353            ret = -ENOTSUP;
3354            goto out;
3355
3356        case QCOW2_CLUSTER_NORMAL:
3357            child = bs->file;
3358            copy_offset += offset_into_cluster(s, src_offset);
3359            if ((copy_offset & 511) != 0) {
3360                ret = -EIO;
3361                goto out;
3362            }
3363            break;
3364
3365        default:
3366            abort();
3367        }
3368        qemu_co_mutex_unlock(&s->lock);
3369        ret = bdrv_co_copy_range_from(child,
3370                                      copy_offset,
3371                                      dst, dst_offset,
3372                                      cur_bytes, read_flags, cur_write_flags);
3373        qemu_co_mutex_lock(&s->lock);
3374        if (ret < 0) {
3375            goto out;
3376        }
3377
3378        bytes -= cur_bytes;
3379        src_offset += cur_bytes;
3380        dst_offset += cur_bytes;
3381    }
3382    ret = 0;
3383
3384out:
3385    qemu_co_mutex_unlock(&s->lock);
3386    return ret;
3387}
3388
3389static int coroutine_fn
3390qcow2_co_copy_range_to(BlockDriverState *bs,
3391                       BdrvChild *src, uint64_t src_offset,
3392                       BdrvChild *dst, uint64_t dst_offset,
3393                       uint64_t bytes, BdrvRequestFlags read_flags,
3394                       BdrvRequestFlags write_flags)
3395{
3396    BDRVQcow2State *s = bs->opaque;
3397    int offset_in_cluster;
3398    int ret;
3399    unsigned int cur_bytes; /* number of sectors in current iteration */
3400    uint64_t cluster_offset;
3401    QCowL2Meta *l2meta = NULL;
3402
3403    assert(!bs->encrypted);
3404    s->cluster_cache_offset = -1; /* disable compressed cache */
3405
3406    qemu_co_mutex_lock(&s->lock);
3407
3408    while (bytes != 0) {
3409
3410        l2meta = NULL;
3411
3412        offset_in_cluster = offset_into_cluster(s, dst_offset);
3413        cur_bytes = MIN(bytes, INT_MAX);
3414
3415        /* TODO:
3416         * If src->bs == dst->bs, we could simply copy by incrementing
3417         * the refcnt, without copying user data.
3418         * Or if src->bs == dst->bs->backing->bs, we could copy by discarding. */
3419        ret = qcow2_alloc_cluster_offset(bs, dst_offset, &cur_bytes,
3420                                         &cluster_offset, &l2meta);
3421        if (ret < 0) {
3422            goto fail;
3423        }
3424
3425        assert((cluster_offset & 511) == 0);
3426
3427        ret = qcow2_pre_write_overlap_check(bs, 0,
3428                cluster_offset + offset_in_cluster, cur_bytes);
3429        if (ret < 0) {
3430            goto fail;
3431        }
3432
3433        qemu_co_mutex_unlock(&s->lock);
3434        ret = bdrv_co_copy_range_to(src, src_offset,
3435                                    bs->file,
3436                                    cluster_offset + offset_in_cluster,
3437                                    cur_bytes, read_flags, write_flags);
3438        qemu_co_mutex_lock(&s->lock);
3439        if (ret < 0) {
3440            goto fail;
3441        }
3442
3443        ret = qcow2_handle_l2meta(bs, &l2meta, true);
3444        if (ret) {
3445            goto fail;
3446        }
3447
3448        bytes -= cur_bytes;
3449        src_offset += cur_bytes;
3450        dst_offset += cur_bytes;
3451    }
3452    ret = 0;
3453
3454fail:
3455    qcow2_handle_l2meta(bs, &l2meta, false);
3456
3457    qemu_co_mutex_unlock(&s->lock);
3458
3459    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
3460
3461    return ret;
3462}
3463
3464static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
3465                                          PreallocMode prealloc, Error **errp)
3466{
3467    BDRVQcow2State *s = bs->opaque;
3468    uint64_t old_length;
3469    int64_t new_l1_size;
3470    int ret;
3471    QDict *options;
3472
3473    if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_METADATA &&
3474        prealloc != PREALLOC_MODE_FALLOC && prealloc != PREALLOC_MODE_FULL)
3475    {
3476        error_setg(errp, "Unsupported preallocation mode '%s'",
3477                   PreallocMode_str(prealloc));
3478        return -ENOTSUP;
3479    }
3480
3481    if (offset & 511) {
3482        error_setg(errp, "The new size must be a multiple of 512");
3483        return -EINVAL;
3484    }
3485
3486    qemu_co_mutex_lock(&s->lock);
3487
3488    /* cannot proceed if image has snapshots */
3489    if (s->nb_snapshots) {
3490        error_setg(errp, "Can't resize an image which has snapshots");
3491        ret = -ENOTSUP;
3492        goto fail;
3493    }
3494
3495    /* cannot proceed if image has bitmaps */
3496    if (s->nb_bitmaps) {
3497        /* TODO: resize bitmaps in the image */
3498        error_setg(errp, "Can't resize an image which has bitmaps");
3499        ret = -ENOTSUP;
3500        goto fail;
3501    }
3502
3503    old_length = bs->total_sectors * BDRV_SECTOR_SIZE;
3504    new_l1_size = size_to_l1(s, offset);
3505
3506    if (offset < old_length) {
3507        int64_t last_cluster, old_file_size;
3508        if (prealloc != PREALLOC_MODE_OFF) {
3509            error_setg(errp,
3510                       "Preallocation can't be used for shrinking an image");
3511            ret = -EINVAL;
3512            goto fail;
3513        }
3514
3515        ret = qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_size),
3516                                    old_length - ROUND_UP(offset,
3517                                                          s->cluster_size),
3518                                    QCOW2_DISCARD_ALWAYS, true);
3519        if (ret < 0) {
3520            error_setg_errno(errp, -ret, "Failed to discard cropped clusters");
3521            goto fail;
3522        }
3523
3524        ret = qcow2_shrink_l1_table(bs, new_l1_size);
3525        if (ret < 0) {
3526            error_setg_errno(errp, -ret,
3527                             "Failed to reduce the number of L2 tables");
3528            goto fail;
3529        }
3530
3531        ret = qcow2_shrink_reftable(bs);
3532        if (ret < 0) {
3533            error_setg_errno(errp, -ret,
3534                             "Failed to discard unused refblocks");
3535            goto fail;
3536        }
3537
3538        old_file_size = bdrv_getlength(bs->file->bs);
3539        if (old_file_size < 0) {
3540            error_setg_errno(errp, -old_file_size,
3541                             "Failed to inquire current file length");
3542            ret = old_file_size;
3543            goto fail;
3544        }
3545        last_cluster = qcow2_get_last_cluster(bs, old_file_size);
3546        if (last_cluster < 0) {
3547            error_setg_errno(errp, -last_cluster,
3548                             "Failed to find the last cluster");
3549            ret = last_cluster;
3550            goto fail;
3551        }
3552        if ((last_cluster + 1) * s->cluster_size < old_file_size) {
3553            Error *local_err = NULL;
3554
3555            bdrv_co_truncate(bs->file, (last_cluster + 1) * s->cluster_size,
3556                             PREALLOC_MODE_OFF, &local_err);
3557            if (local_err) {
3558                warn_reportf_err(local_err,
3559                                 "Failed to truncate the tail of the image: ");
3560            }
3561        }
3562    } else {
3563        ret = qcow2_grow_l1_table(bs, new_l1_size, true);
3564        if (ret < 0) {
3565            error_setg_errno(errp, -ret, "Failed to grow the L1 table");
3566            goto fail;
3567        }
3568    }
3569
3570    switch (prealloc) {
3571    case PREALLOC_MODE_OFF:
3572        break;
3573
3574    case PREALLOC_MODE_METADATA:
3575        ret = preallocate_co(bs, old_length, offset);
3576        if (ret < 0) {
3577            error_setg_errno(errp, -ret, "Preallocation failed");
3578            goto fail;
3579        }
3580        break;
3581
3582    case PREALLOC_MODE_FALLOC:
3583    case PREALLOC_MODE_FULL:
3584    {
3585        int64_t allocation_start, host_offset, guest_offset;
3586        int64_t clusters_allocated;
3587        int64_t old_file_size, new_file_size;
3588        uint64_t nb_new_data_clusters, nb_new_l2_tables;
3589
3590        old_file_size = bdrv_getlength(bs->file->bs);
3591        if (old_file_size < 0) {
3592            error_setg_errno(errp, -old_file_size,
3593                             "Failed to inquire current file length");
3594            ret = old_file_size;
3595            goto fail;
3596        }
3597        old_file_size = ROUND_UP(old_file_size, s->cluster_size);
3598
3599        nb_new_data_clusters = DIV_ROUND_UP(offset - old_length,
3600                                            s->cluster_size);
3601
3602        /* This is an overestimation; we will not actually allocate space for
3603         * these in the file but just make sure the new refcount structures are
3604         * able to cover them so we will not have to allocate new refblocks
3605         * while entering the data blocks in the potentially new L2 tables.
3606         * (We do not actually care where the L2 tables are placed. Maybe they
3607         *  are already allocated or they can be placed somewhere before
3608         *  @old_file_size. It does not matter because they will be fully
3609         *  allocated automatically, so they do not need to be covered by the
3610         *  preallocation. All that matters is that we will not have to allocate
3611         *  new refcount structures for them.) */
3612        nb_new_l2_tables = DIV_ROUND_UP(nb_new_data_clusters,
3613                                        s->cluster_size / sizeof(uint64_t));
3614        /* The cluster range may not be aligned to L2 boundaries, so add one L2
3615         * table for a potential head/tail */
3616        nb_new_l2_tables++;
3617
3618        allocation_start = qcow2_refcount_area(bs, old_file_size,
3619                                               nb_new_data_clusters +
3620                                               nb_new_l2_tables,
3621                                               true, 0, 0);
3622        if (allocation_start < 0) {
3623            error_setg_errno(errp, -allocation_start,
3624                             "Failed to resize refcount structures");
3625            ret = allocation_start;
3626            goto fail;
3627        }
3628
3629        clusters_allocated = qcow2_alloc_clusters_at(bs, allocation_start,
3630                                                     nb_new_data_clusters);
3631        if (clusters_allocated < 0) {
3632            error_setg_errno(errp, -clusters_allocated,
3633                             "Failed to allocate data clusters");
3634            ret = clusters_allocated;
3635            goto fail;
3636        }
3637
3638        assert(clusters_allocated == nb_new_data_clusters);
3639
3640        /* Allocate the data area */
3641        new_file_size = allocation_start +
3642                        nb_new_data_clusters * s->cluster_size;
3643        ret = bdrv_co_truncate(bs->file, new_file_size, prealloc, errp);
3644        if (ret < 0) {
3645            error_prepend(errp, "Failed to resize underlying file: ");
3646            qcow2_free_clusters(bs, allocation_start,
3647                                nb_new_data_clusters * s->cluster_size,
3648                                QCOW2_DISCARD_OTHER);
3649            goto fail;
3650        }
3651
3652        /* Create the necessary L2 entries */
3653        host_offset = allocation_start;
3654        guest_offset = old_length;
3655        while (nb_new_data_clusters) {
3656            int64_t nb_clusters = MIN(
3657                nb_new_data_clusters,
3658                s->l2_slice_size - offset_to_l2_slice_index(s, guest_offset));
3659            QCowL2Meta allocation = {
3660                .offset       = guest_offset,
3661                .alloc_offset = host_offset,
3662                .nb_clusters  = nb_clusters,
3663            };
3664            qemu_co_queue_init(&allocation.dependent_requests);
3665
3666            ret = qcow2_alloc_cluster_link_l2(bs, &allocation);
3667            if (ret < 0) {
3668                error_setg_errno(errp, -ret, "Failed to update L2 tables");
3669                qcow2_free_clusters(bs, host_offset,
3670                                    nb_new_data_clusters * s->cluster_size,
3671                                    QCOW2_DISCARD_OTHER);
3672                goto fail;
3673            }
3674
3675            guest_offset += nb_clusters * s->cluster_size;
3676            host_offset += nb_clusters * s->cluster_size;
3677            nb_new_data_clusters -= nb_clusters;
3678        }
3679        break;
3680    }
3681
3682    default:
3683        g_assert_not_reached();
3684    }
3685
3686    if (prealloc != PREALLOC_MODE_OFF) {
3687        /* Flush metadata before actually changing the image size */
3688        ret = qcow2_write_caches(bs);
3689        if (ret < 0) {
3690            error_setg_errno(errp, -ret,
3691                             "Failed to flush the preallocated area to disk");
3692            goto fail;
3693        }
3694    }
3695
3696    bs->total_sectors = offset / BDRV_SECTOR_SIZE;
3697
3698    /* write updated header.size */
3699    offset = cpu_to_be64(offset);
3700    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
3701                           &offset, sizeof(uint64_t));
3702    if (ret < 0) {
3703        error_setg_errno(errp, -ret, "Failed to update the image size");
3704        goto fail;
3705    }
3706
3707    s->l1_vm_state_index = new_l1_size;
3708
3709    /* Update cache sizes */
3710    options = qdict_clone_shallow(bs->options);
3711    ret = qcow2_update_options(bs, options, s->flags, errp);
3712    qobject_unref(options);
3713    if (ret < 0) {
3714        goto fail;
3715    }
3716    ret = 0;
3717fail:
3718    qemu_co_mutex_unlock(&s->lock);
3719    return ret;
3720}
3721
3722/*
3723 * qcow2_compress()
3724 *
3725 * @dest - destination buffer, at least of @size-1 bytes
3726 * @src - source buffer, @size bytes
3727 *
3728 * Returns: compressed size on success
3729 *          -1 if compression is inefficient
3730 *          -2 on any other error
3731 */
3732static ssize_t qcow2_compress(void *dest, const void *src, size_t size)
3733{
3734    ssize_t ret;
3735    z_stream strm;
3736
3737    /* best compression, small window, no zlib header */
3738    memset(&strm, 0, sizeof(strm));
3739    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
3740                       -12, 9, Z_DEFAULT_STRATEGY);
3741    if (ret != 0) {
3742        return -2;
3743    }
3744
3745    /* strm.next_in is not const in old zlib versions, such as those used on
3746     * OpenBSD/NetBSD, so cast the const away */
3747    strm.avail_in = size;
3748    strm.next_in = (void *) src;
3749    strm.avail_out = size - 1;
3750    strm.next_out = dest;
3751
3752    ret = deflate(&strm, Z_FINISH);
3753    if (ret == Z_STREAM_END) {
3754        ret = size - 1 - strm.avail_out;
3755    } else {
3756        ret = (ret == Z_OK ? -1 : -2);
3757    }
3758
3759    deflateEnd(&strm);
3760
3761    return ret;
3762}
3763
/* Upper bound for s->nb_compress_threads, i.e. for the number of compression
 * jobs that may be in flight at the same time */
#define MAX_COMPRESS_THREADS 4

typedef struct Qcow2CompressData Qcow2CompressData;

/* One compression request as handed to qcow2_compress_pool_func() */
struct Qcow2CompressData {
    void *dest;         /* output buffer passed to qcow2_compress() */
    const void *src;    /* input buffer passed to qcow2_compress() */
    size_t size;        /* size argument passed to qcow2_compress() */
    ssize_t ret;        /* result of qcow2_compress(), filled in by the worker */
};
3772
3773static int qcow2_compress_pool_func(void *opaque)
3774{
3775    Qcow2CompressData *data = opaque;
3776
3777    data->ret = qcow2_compress(data->dest, data->src, data->size);
3778
3779    return 0;
3780}
3781
/*
 * Completion callback for the compression job: re-enters the coroutine that
 * was passed as @opaque.  The job's return code @ret is not used here.
 */
static void qcow2_compress_complete(void *opaque, int ret)
{
    void *waiting_co = opaque;

    qemu_coroutine_enter(waiting_co);
}
3786
3787/* See qcow2_compress definition for parameters description */
3788static ssize_t qcow2_co_compress(BlockDriverState *bs,
3789                                 void *dest, const void *src, size_t size)
3790{
3791    BDRVQcow2State *s = bs->opaque;
3792    BlockAIOCB *acb;
3793    ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
3794    Qcow2CompressData arg = {
3795        .dest = dest,
3796        .src = src,
3797        .size = size,
3798    };
3799
3800    while (s->nb_compress_threads >= MAX_COMPRESS_THREADS) {
3801        qemu_co_queue_wait(&s->compress_wait_queue, NULL);
3802    }
3803
3804    s->nb_compress_threads++;
3805    acb = thread_pool_submit_aio(pool, qcow2_compress_pool_func, &arg,
3806                                 qcow2_compress_complete,
3807                                 qemu_coroutine_self());
3808
3809    if (!acb) {
3810        s->nb_compress_threads--;
3811        return -EINVAL;
3812    }
3813    qemu_coroutine_yield();
3814    s->nb_compress_threads--;
3815    qemu_co_queue_next(&s->compress_wait_queue);
3816
3817    return arg.ret;
3818}
3819
3820/* XXX: put compressed sectors first, then all the cluster aligned
3821   tables to avoid losing bytes in alignment */
3822static coroutine_fn int
3823qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
3824                            uint64_t bytes, QEMUIOVector *qiov)
3825{
3826    BDRVQcow2State *s = bs->opaque;
3827    QEMUIOVector hd_qiov;
3828    struct iovec iov;
3829    int ret;
3830    size_t out_len;
3831    uint8_t *buf, *out_buf;
3832    int64_t cluster_offset;
3833
3834    if (bytes == 0) {
3835        /* align end of file to a sector boundary to ease reading with
3836           sector based I/Os */
3837        cluster_offset = bdrv_getlength(bs->file->bs);
3838        if (cluster_offset < 0) {
3839            return cluster_offset;
3840        }
3841        return bdrv_co_truncate(bs->file, cluster_offset, PREALLOC_MODE_OFF,
3842                                NULL);
3843    }
3844
3845    if (offset_into_cluster(s, offset)) {
3846        return -EINVAL;
3847    }
3848
3849    buf = qemu_blockalign(bs, s->cluster_size);
3850    if (bytes != s->cluster_size) {
3851        if (bytes > s->cluster_size ||
3852            offset + bytes != bs->total_sectors << BDRV_SECTOR_BITS)
3853        {
3854            qemu_vfree(buf);
3855            return -EINVAL;
3856        }
3857        /* Zero-pad last write if image size is not cluster aligned */
3858        memset(buf + bytes, 0, s->cluster_size - bytes);
3859    }
3860    qemu_iovec_to_buf(qiov, 0, buf, bytes);
3861
3862    out_buf = g_malloc(s->cluster_size);
3863
3864    out_len = qcow2_co_compress(bs, out_buf, buf, s->cluster_size);
3865    if (out_len == -2) {
3866        ret = -EINVAL;
3867        goto fail;
3868    } else if (out_len == -1) {
3869        /* could not compress: write normal cluster */
3870        ret = qcow2_co_pwritev(bs, offset, bytes, qiov, 0);
3871        if (ret < 0) {
3872            goto fail;
3873        }
3874        goto success;
3875    }
3876
3877    qemu_co_mutex_lock(&s->lock);
3878    cluster_offset =
3879        qcow2_alloc_compressed_cluster_offset(bs, offset, out_len);
3880    if (!cluster_offset) {
3881        qemu_co_mutex_unlock(&s->lock);
3882        ret = -EIO;
3883        goto fail;
3884    }
3885    cluster_offset &= s->cluster_offset_mask;
3886
3887    ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len);
3888    qemu_co_mutex_unlock(&s->lock);
3889    if (ret < 0) {
3890        goto fail;
3891    }
3892
3893    iov = (struct iovec) {
3894        .iov_base   = out_buf,
3895        .iov_len    = out_len,
3896    };
3897    qemu_iovec_init_external(&hd_qiov, &iov, 1);
3898
3899    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
3900    ret = bdrv_co_pwritev(bs->file, cluster_offset, out_len, &hd_qiov, 0);
3901    if (ret < 0) {
3902        goto fail;
3903    }
3904success:
3905    ret = 0;
3906fail:
3907    qemu_vfree(buf);
3908    g_free(out_buf);
3909    return ret;
3910}
3911
3912static int make_completely_empty(BlockDriverState *bs)
3913{
3914    BDRVQcow2State *s = bs->opaque;
3915    Error *local_err = NULL;
3916    int ret, l1_clusters;
3917    int64_t offset;
3918    uint64_t *new_reftable = NULL;
3919    uint64_t rt_entry, l1_size2;
3920    struct {
3921        uint64_t l1_offset;
3922        uint64_t reftable_offset;
3923        uint32_t reftable_clusters;
3924    } QEMU_PACKED l1_ofs_rt_ofs_cls;
3925
3926    ret = qcow2_cache_empty(bs, s->l2_table_cache);
3927    if (ret < 0) {
3928        goto fail;
3929    }
3930
3931    ret = qcow2_cache_empty(bs, s->refcount_block_cache);
3932    if (ret < 0) {
3933        goto fail;
3934    }
3935
3936    /* Refcounts will be broken utterly */
3937    ret = qcow2_mark_dirty(bs);
3938    if (ret < 0) {
3939        goto fail;
3940    }
3941
3942    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
3943
3944    l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
3945    l1_size2 = (uint64_t)s->l1_size * sizeof(uint64_t);
3946
3947    /* After this call, neither the in-memory nor the on-disk refcount
3948     * information accurately describe the actual references */
3949
3950    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset,
3951                             l1_clusters * s->cluster_size, 0);
3952    if (ret < 0) {
3953        goto fail_broken_refcounts;
3954    }
3955    memset(s->l1_table, 0, l1_size2);
3956
3957    BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);
3958
3959    /* Overwrite enough clusters at the beginning of the sectors to place
3960     * the refcount table, a refcount block and the L1 table in; this may
3961     * overwrite parts of the existing refcount and L1 table, which is not
3962     * an issue because the dirty flag is set, complete data loss is in fact
3963     * desired and partial data loss is consequently fine as well */
3964    ret = bdrv_pwrite_zeroes(bs->file, s->cluster_size,
3965                             (2 + l1_clusters) * s->cluster_size, 0);
3966    /* This call (even if it failed overall) may have overwritten on-disk
3967     * refcount structures; in that case, the in-memory refcount information
3968     * will probably differ from the on-disk information which makes the BDS
3969     * unusable */
3970    if (ret < 0) {
3971        goto fail_broken_refcounts;
3972    }
3973
3974    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
3975    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
3976
3977    /* "Create" an empty reftable (one cluster) directly after the image
3978     * header and an empty L1 table three clusters after the image header;
3979     * the cluster between those two will be used as the first refblock */
3980    l1_ofs_rt_ofs_cls.l1_offset = cpu_to_be64(3 * s->cluster_size);
3981    l1_ofs_rt_ofs_cls.reftable_offset = cpu_to_be64(s->cluster_size);
3982    l1_ofs_rt_ofs_cls.reftable_clusters = cpu_to_be32(1);
3983    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset),
3984                           &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls));
3985    if (ret < 0) {
3986        goto fail_broken_refcounts;
3987    }
3988
3989    s->l1_table_offset = 3 * s->cluster_size;
3990
3991    new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t));
3992    if (!new_reftable) {
3993        ret = -ENOMEM;
3994        goto fail_broken_refcounts;
3995    }
3996
3997    s->refcount_table_offset = s->cluster_size;
3998    s->refcount_table_size   = s->cluster_size / sizeof(uint64_t);
3999    s->max_refcount_table_index = 0;
4000
4001    g_free(s->refcount_table);
4002    s->refcount_table = new_reftable;
4003    new_reftable = NULL;
4004
4005    /* Now the in-memory refcount information again corresponds to the on-disk
4006     * information (reftable is empty and no refblocks (the refblock cache is
4007     * empty)); however, this means some clusters (e.g. the image header) are
4008     * referenced, but not refcounted, but the normal qcow2 code assumes that
4009     * the in-memory information is always correct */
4010
4011    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
4012
4013    /* Enter the first refblock into the reftable */
4014    rt_entry = cpu_to_be64(2 * s->cluster_size);
4015    ret = bdrv_pwrite_sync(bs->file, s->cluster_size,
4016                           &rt_entry, sizeof(rt_entry));
4017    if (ret < 0) {
4018        goto fail_broken_refcounts;
4019    }
4020    s->refcount_table[0] = 2 * s->cluster_size;
4021
4022    s->free_cluster_index = 0;
4023    assert(3 + l1_clusters <= s->refcount_block_size);
4024    offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2);
4025    if (offset < 0) {
4026        ret = offset;
4027        goto fail_broken_refcounts;
4028    } else if (offset > 0) {
4029        error_report("First cluster in emptied image is in use");
4030        abort();
4031    }
4032
4033    /* Now finally the in-memory information corresponds to the on-disk
4034     * structures and is correct */
4035    ret = qcow2_mark_clean(bs);
4036    if (ret < 0) {
4037        goto fail;
4038    }
4039
4040    ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size,
4041                        PREALLOC_MODE_OFF, &local_err);
4042    if (ret < 0) {
4043        error_report_err(local_err);
4044        goto fail;
4045    }
4046
4047    return 0;
4048
4049fail_broken_refcounts:
4050    /* The BDS is unusable at this point. If we wanted to make it usable, we
4051     * would have to call qcow2_refcount_close(), qcow2_refcount_init(),
4052     * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init()
4053     * again. However, because the functions which could have caused this error
4054     * path to be taken are used by those functions as well, it's very likely
4055     * that that sequence will fail as well. Therefore, just eject the BDS. */
4056    bs->drv = NULL;
4057
4058fail:
4059    g_free(new_reftable);
4060    return ret;
4061}
4062
4063static int qcow2_make_empty(BlockDriverState *bs)
4064{
4065    BDRVQcow2State *s = bs->opaque;
4066    uint64_t offset, end_offset;
4067    int step = QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size);
4068    int l1_clusters, ret = 0;
4069
4070    l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
4071
4072    if (s->qcow_version >= 3 && !s->snapshots && !s->nb_bitmaps &&
4073        3 + l1_clusters <= s->refcount_block_size &&
4074        s->crypt_method_header != QCOW_CRYPT_LUKS) {
4075        /* The following function only works for qcow2 v3 images (it
4076         * requires the dirty flag) and only as long as there are no
4077         * features that reserve extra clusters (such as snapshots,
4078         * LUKS header, or persistent bitmaps), because it completely
4079         * empties the image.  Furthermore, the L1 table and three
4080         * additional clusters (image header, refcount table, one
4081         * refcount block) have to fit inside one refcount block. */
4082        return make_completely_empty(bs);
4083    }
4084
4085    /* This fallback code simply discards every active cluster; this is slow,
4086     * but works in all cases */
4087    end_offset = bs->total_sectors * BDRV_SECTOR_SIZE;
4088    for (offset = 0; offset < end_offset; offset += step) {
4089        /* As this function is generally used after committing an external
4090         * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
4091         * default action for this kind of discard is to pass the discard,
4092         * which will ideally result in an actually smaller image file, as
4093         * is probably desired. */
4094        ret = qcow2_cluster_discard(bs, offset, MIN(step, end_offset - offset),
4095                                    QCOW2_DISCARD_SNAPSHOT, true);
4096        if (ret < 0) {
4097            break;
4098        }
4099    }
4100
4101    return ret;
4102}
4103
4104static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
4105{
4106    BDRVQcow2State *s = bs->opaque;
4107    int ret;
4108
4109    qemu_co_mutex_lock(&s->lock);
4110    ret = qcow2_write_caches(bs);
4111    qemu_co_mutex_unlock(&s->lock);
4112
4113    return ret;
4114}
4115
4116static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs,
4117                                       Error **errp)
4118{
4119    Error *local_err = NULL;
4120    BlockMeasureInfo *info;
4121    uint64_t required = 0; /* bytes that contribute to required size */
4122    uint64_t virtual_size; /* disk size as seen by guest */
4123    uint64_t refcount_bits;
4124    uint64_t l2_tables;
4125    size_t cluster_size;
4126    int version;
4127    char *optstr;
4128    PreallocMode prealloc;
4129    bool has_backing_file;
4130
4131    /* Parse image creation options */
4132    cluster_size = qcow2_opt_get_cluster_size_del(opts, &local_err);
4133    if (local_err) {
4134        goto err;
4135    }
4136
4137    version = qcow2_opt_get_version_del(opts, &local_err);
4138    if (local_err) {
4139        goto err;
4140    }
4141
4142    refcount_bits = qcow2_opt_get_refcount_bits_del(opts, version, &local_err);
4143    if (local_err) {
4144        goto err;
4145    }
4146
4147    optstr = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
4148    prealloc = qapi_enum_parse(&PreallocMode_lookup, optstr,
4149                               PREALLOC_MODE_OFF, &local_err);
4150    g_free(optstr);
4151    if (local_err) {
4152        goto err;
4153    }
4154
4155    optstr = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
4156    has_backing_file = !!optstr;
4157    g_free(optstr);
4158
4159    virtual_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
4160    virtual_size = ROUND_UP(virtual_size, cluster_size);
4161
4162    /* Check that virtual disk size is valid */
4163    l2_tables = DIV_ROUND_UP(virtual_size / cluster_size,
4164                             cluster_size / sizeof(uint64_t));
4165    if (l2_tables * sizeof(uint64_t) > QCOW_MAX_L1_SIZE) {
4166        error_setg(&local_err, "The image size is too large "
4167                               "(try using a larger cluster size)");
4168        goto err;
4169    }
4170
4171    /* Account for input image */
4172    if (in_bs) {
4173        int64_t ssize = bdrv_getlength(in_bs);
4174        if (ssize < 0) {
4175            error_setg_errno(&local_err, -ssize,
4176                             "Unable to get image virtual_size");
4177            goto err;
4178        }
4179
4180        virtual_size = ROUND_UP(ssize, cluster_size);
4181
4182        if (has_backing_file) {
4183            /* We don't how much of the backing chain is shared by the input
4184             * image and the new image file.  In the worst case the new image's
4185             * backing file has nothing in common with the input image.  Be
4186             * conservative and assume all clusters need to be written.
4187             */
4188            required = virtual_size;
4189        } else {
4190            int64_t offset;
4191            int64_t pnum = 0;
4192
4193            for (offset = 0; offset < ssize; offset += pnum) {
4194                int ret;
4195
4196                ret = bdrv_block_status_above(in_bs, NULL, offset,
4197                                              ssize - offset, &pnum, NULL,
4198                                              NULL);
4199                if (ret < 0) {
4200                    error_setg_errno(&local_err, -ret,
4201                                     "Unable to get block status");
4202                    goto err;
4203                }
4204
4205                if (ret & BDRV_BLOCK_ZERO) {
4206                    /* Skip zero regions (safe with no backing file) */
4207                } else if ((ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) ==
4208                           (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) {
4209                    /* Extend pnum to end of cluster for next iteration */
4210                    pnum = ROUND_UP(offset + pnum, cluster_size) - offset;
4211
4212                    /* Count clusters we've seen */
4213                    required += offset % cluster_size + pnum;
4214                }
4215            }
4216        }
4217    }
4218
4219    /* Take into account preallocation.  Nothing special is needed for
4220     * PREALLOC_MODE_METADATA since metadata is always counted.
4221     */
4222    if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) {
4223        required = virtual_size;
4224    }
4225
4226    info = g_new(BlockMeasureInfo, 1);
4227    info->fully_allocated =
4228        qcow2_calc_prealloc_size(virtual_size, cluster_size,
4229                                 ctz32(refcount_bits));
4230
4231    /* Remove data clusters that are not required.  This overestimates the
4232     * required size because metadata needed for the fully allocated file is
4233     * still counted.
4234     */
4235    info->required = info->fully_allocated - virtual_size + required;
4236    return info;
4237
4238err:
4239    error_propagate(errp, local_err);
4240    return NULL;
4241}
4242
4243static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
4244{
4245    BDRVQcow2State *s = bs->opaque;
4246    bdi->unallocated_blocks_are_zero = true;
4247    bdi->cluster_size = s->cluster_size;
4248    bdi->vm_state_offset = qcow2_vm_state_offset(s);
4249    return 0;
4250}
4251
4252static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)
4253{
4254    BDRVQcow2State *s = bs->opaque;
4255    ImageInfoSpecific *spec_info;
4256    QCryptoBlockInfo *encrypt_info = NULL;
4257
4258    if (s->crypto != NULL) {
4259        encrypt_info = qcrypto_block_get_info(s->crypto, &error_abort);
4260    }
4261
4262    spec_info = g_new(ImageInfoSpecific, 1);
4263    *spec_info = (ImageInfoSpecific){
4264        .type  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
4265        .u.qcow2.data = g_new(ImageInfoSpecificQCow2, 1),
4266    };
4267    if (s->qcow_version == 2) {
4268        *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
4269            .compat             = g_strdup("0.10"),
4270            .refcount_bits      = s->refcount_bits,
4271        };
4272    } else if (s->qcow_version == 3) {
4273        *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
4274            .compat             = g_strdup("1.1"),
4275            .lazy_refcounts     = s->compatible_features &
4276                                  QCOW2_COMPAT_LAZY_REFCOUNTS,
4277            .has_lazy_refcounts = true,
4278            .corrupt            = s->incompatible_features &
4279                                  QCOW2_INCOMPAT_CORRUPT,
4280            .has_corrupt        = true,
4281            .refcount_bits      = s->refcount_bits,
4282        };
4283    } else {
4284        /* if this assertion fails, this probably means a new version was
4285         * added without having it covered here */
4286        assert(false);
4287    }
4288
4289    if (encrypt_info) {
4290        ImageInfoSpecificQCow2Encryption *qencrypt =
4291            g_new(ImageInfoSpecificQCow2Encryption, 1);
4292        switch (encrypt_info->format) {
4293        case Q_CRYPTO_BLOCK_FORMAT_QCOW:
4294            qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_AES;
4295            break;
4296        case Q_CRYPTO_BLOCK_FORMAT_LUKS:
4297            qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_LUKS;
4298            qencrypt->u.luks = encrypt_info->u.luks;
4299            break;
4300        default:
4301            abort();
4302        }
4303        /* Since we did shallow copy above, erase any pointers
4304         * in the original info */
4305        memset(&encrypt_info->u, 0, sizeof(encrypt_info->u));
4306        qapi_free_QCryptoBlockInfo(encrypt_info);
4307
4308        spec_info->u.qcow2.data->has_encrypt = true;
4309        spec_info->u.qcow2.data->encrypt = qencrypt;
4310    }
4311
4312    return spec_info;
4313}
4314
4315static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
4316                              int64_t pos)
4317{
4318    BDRVQcow2State *s = bs->opaque;
4319
4320    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
4321    return bs->drv->bdrv_co_pwritev(bs, qcow2_vm_state_offset(s) + pos,
4322                                    qiov->size, qiov, 0);
4323}
4324
4325static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
4326                              int64_t pos)
4327{
4328    BDRVQcow2State *s = bs->opaque;
4329
4330    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
4331    return bs->drv->bdrv_co_preadv(bs, qcow2_vm_state_offset(s) + pos,
4332                                   qiov->size, qiov, 0);
4333}
4334
4335/*
4336 * Downgrades an image's version. To achieve this, any incompatible features
4337 * have to be removed.
4338 */
4339static int qcow2_downgrade(BlockDriverState *bs, int target_version,
4340                           BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
4341                           Error **errp)
4342{
4343    BDRVQcow2State *s = bs->opaque;
4344    int current_version = s->qcow_version;
4345    int ret;
4346
4347    /* This is qcow2_downgrade(), not qcow2_upgrade() */
4348    assert(target_version < current_version);
4349
4350    /* There are no other versions (now) that you can downgrade to */
4351    assert(target_version == 2);
4352
4353    if (s->refcount_order != 4) {
4354        error_setg(errp, "compat=0.10 requires refcount_bits=16");
4355        return -ENOTSUP;
4356    }
4357
4358    /* clear incompatible features */
4359    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
4360        ret = qcow2_mark_clean(bs);
4361        if (ret < 0) {
4362            error_setg_errno(errp, -ret, "Failed to make the image clean");
4363            return ret;
4364        }
4365    }
4366
4367    /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in
4368     * the first place; if that happens nonetheless, returning -ENOTSUP is the
4369     * best thing to do anyway */
4370
4371    if (s->incompatible_features) {
4372        error_setg(errp, "Cannot downgrade an image with incompatible features "
4373                   "%#" PRIx64 " set", s->incompatible_features);
4374        return -ENOTSUP;
4375    }
4376
4377    /* since we can ignore compatible features, we can set them to 0 as well */
4378    s->compatible_features = 0;
4379    /* if lazy refcounts have been used, they have already been fixed through
4380     * clearing the dirty flag */
4381
4382    /* clearing autoclear features is trivial */
4383    s->autoclear_features = 0;
4384
4385    ret = qcow2_expand_zero_clusters(bs, status_cb, cb_opaque);
4386    if (ret < 0) {
4387        error_setg_errno(errp, -ret, "Failed to turn zero into data clusters");
4388        return ret;
4389    }
4390
4391    s->qcow_version = target_version;
4392    ret = qcow2_update_header(bs);
4393    if (ret < 0) {
4394        s->qcow_version = current_version;
4395        error_setg_errno(errp, -ret, "Failed to update the image header");
4396        return ret;
4397    }
4398    return 0;
4399}
4400
/*
 * Identifies the amend step that is currently reporting progress through
 * qcow2_amend_helper_cb().
 */
typedef enum Qcow2AmendOperation {
    /*
     * Zero value, i.e. what Qcow2AmendHelperCBInfo::last_operation holds
     * after static (zero) initialization.  It lets the helper callback tell
     * its very first invocation apart from a switch between two operations.
     */
    QCOW2_NO_OPERATION = 0,

    /* The refcount width of the image is being changed */
    QCOW2_CHANGING_REFCOUNT_ORDER,
    /* The image is being downgraded to an older format version */
    QCOW2_DOWNGRADING,
} Qcow2AmendOperation;
4410
4411typedef struct Qcow2AmendHelperCBInfo {
4412    /* The code coordinating the amend operations should only modify
4413     * these four fields; the rest will be managed by the CB */
4414    BlockDriverAmendStatusCB *original_status_cb;
4415    void *original_cb_opaque;
4416
4417    Qcow2AmendOperation current_operation;
4418
4419    /* Total number of operations to perform (only set once) */
4420    int total_operations;
4421
4422    /* The following fields are managed by the CB */
4423
4424    /* Number of operations completed */
4425    int operations_completed;
4426
4427    /* Cumulative offset of all completed operations */
4428    int64_t offset_completed;
4429
4430    Qcow2AmendOperation last_operation;
4431    int64_t last_work_size;
4432} Qcow2AmendHelperCBInfo;
4433
4434static void qcow2_amend_helper_cb(BlockDriverState *bs,
4435                                  int64_t operation_offset,
4436                                  int64_t operation_work_size, void *opaque)
4437{
4438    Qcow2AmendHelperCBInfo *info = opaque;
4439    int64_t current_work_size;
4440    int64_t projected_work_size;
4441
4442    if (info->current_operation != info->last_operation) {
4443        if (info->last_operation != QCOW2_NO_OPERATION) {
4444            info->offset_completed += info->last_work_size;
4445            info->operations_completed++;
4446        }
4447
4448        info->last_operation = info->current_operation;
4449    }
4450
4451    assert(info->total_operations > 0);
4452    assert(info->operations_completed < info->total_operations);
4453
4454    info->last_work_size = operation_work_size;
4455
4456    current_work_size = info->offset_completed + operation_work_size;
4457
4458    /* current_work_size is the total work size for (operations_completed + 1)
4459     * operations (which includes this one), so multiply it by the number of
4460     * operations not covered and divide it by the number of operations
4461     * covered to get a projection for the operations not covered */
4462    projected_work_size = current_work_size * (info->total_operations -
4463                                               info->operations_completed - 1)
4464                                            / (info->operations_completed + 1);
4465
4466    info->original_status_cb(bs, info->offset_completed + operation_offset,
4467                             current_work_size + projected_work_size,
4468                             info->original_cb_opaque);
4469}
4470
4471static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
4472                               BlockDriverAmendStatusCB *status_cb,
4473                               void *cb_opaque,
4474                               Error **errp)
4475{
4476    BDRVQcow2State *s = bs->opaque;
4477    int old_version = s->qcow_version, new_version = old_version;
4478    uint64_t new_size = 0;
4479    const char *backing_file = NULL, *backing_format = NULL;
4480    bool lazy_refcounts = s->use_lazy_refcounts;
4481    const char *compat = NULL;
4482    uint64_t cluster_size = s->cluster_size;
4483    bool encrypt;
4484    int encformat;
4485    int refcount_bits = s->refcount_bits;
4486    int ret;
4487    QemuOptDesc *desc = opts->list->desc;
4488    Qcow2AmendHelperCBInfo helper_cb_info;
4489
4490    while (desc && desc->name) {
4491        if (!qemu_opt_find(opts, desc->name)) {
4492            /* only change explicitly defined options */
4493            desc++;
4494            continue;
4495        }
4496
4497        if (!strcmp(desc->name, BLOCK_OPT_COMPAT_LEVEL)) {
4498            compat = qemu_opt_get(opts, BLOCK_OPT_COMPAT_LEVEL);
4499            if (!compat) {
4500                /* preserve default */
4501            } else if (!strcmp(compat, "0.10")) {
4502                new_version = 2;
4503            } else if (!strcmp(compat, "1.1")) {
4504                new_version = 3;
4505            } else {
4506                error_setg(errp, "Unknown compatibility level %s", compat);
4507                return -EINVAL;
4508            }
4509        } else if (!strcmp(desc->name, BLOCK_OPT_PREALLOC)) {
4510            error_setg(errp, "Cannot change preallocation mode");
4511            return -ENOTSUP;
4512        } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) {
4513            new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
4514        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FILE)) {
4515            backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
4516        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FMT)) {
4517            backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
4518        } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) {
4519            encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT,
4520                                        !!s->crypto);
4521
4522            if (encrypt != !!s->crypto) {
4523                error_setg(errp,
4524                           "Changing the encryption flag is not supported");
4525                return -ENOTSUP;
4526            }
4527        } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT_FORMAT)) {
4528            encformat = qcow2_crypt_method_from_format(
4529                qemu_opt_get(opts, BLOCK_OPT_ENCRYPT_FORMAT));
4530
4531            if (encformat != s->crypt_method_header) {
4532                error_setg(errp,
4533                           "Changing the encryption format is not supported");
4534                return -ENOTSUP;
4535            }
4536        } else if (g_str_has_prefix(desc->name, "encrypt.")) {
4537            error_setg(errp,
4538                       "Changing the encryption parameters is not supported");
4539            return -ENOTSUP;
4540        } else if (!strcmp(desc->name, BLOCK_OPT_CLUSTER_SIZE)) {
4541            cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE,
4542                                             cluster_size);
4543            if (cluster_size != s->cluster_size) {
4544                error_setg(errp, "Changing the cluster size is not supported");
4545                return -ENOTSUP;
4546            }
4547        } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
4548            lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS,
4549                                               lazy_refcounts);
4550        } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) {
4551            refcount_bits = qemu_opt_get_number(opts, BLOCK_OPT_REFCOUNT_BITS,
4552                                                refcount_bits);
4553
4554            if (refcount_bits <= 0 || refcount_bits > 64 ||
4555                !is_power_of_2(refcount_bits))
4556            {
4557                error_setg(errp, "Refcount width must be a power of two and "
4558                           "may not exceed 64 bits");
4559                return -EINVAL;
4560            }
4561        } else {
4562            /* if this point is reached, this probably means a new option was
4563             * added without having it covered here */
4564            abort();
4565        }
4566
4567        desc++;
4568    }
4569
4570    helper_cb_info = (Qcow2AmendHelperCBInfo){
4571        .original_status_cb = status_cb,
4572        .original_cb_opaque = cb_opaque,
4573        .total_operations = (new_version < old_version)
4574                          + (s->refcount_bits != refcount_bits)
4575    };
4576
4577    /* Upgrade first (some features may require compat=1.1) */
4578    if (new_version > old_version) {
4579        s->qcow_version = new_version;
4580        ret = qcow2_update_header(bs);
4581        if (ret < 0) {
4582            s->qcow_version = old_version;
4583            error_setg_errno(errp, -ret, "Failed to update the image header");
4584            return ret;
4585        }
4586    }
4587
4588    if (s->refcount_bits != refcount_bits) {
4589        int refcount_order = ctz32(refcount_bits);
4590
4591        if (new_version < 3 && refcount_bits != 16) {
4592            error_setg(errp, "Refcount widths other than 16 bits require "
4593                       "compatibility level 1.1 or above (use compat=1.1 or "
4594                       "greater)");
4595            return -EINVAL;
4596        }
4597
4598        helper_cb_info.current_operation = QCOW2_CHANGING_REFCOUNT_ORDER;
4599        ret = qcow2_change_refcount_order(bs, refcount_order,
4600                                          &qcow2_amend_helper_cb,
4601                                          &helper_cb_info, errp);
4602        if (ret < 0) {
4603            return ret;
4604        }
4605    }
4606
4607    if (backing_file || backing_format) {
4608        ret = qcow2_change_backing_file(bs,
4609                    backing_file ?: s->image_backing_file,
4610                    backing_format ?: s->image_backing_format);
4611        if (ret < 0) {
4612            error_setg_errno(errp, -ret, "Failed to change the backing file");
4613            return ret;
4614        }
4615    }
4616
4617    if (s->use_lazy_refcounts != lazy_refcounts) {
4618        if (lazy_refcounts) {
4619            if (new_version < 3) {
4620                error_setg(errp, "Lazy refcounts only supported with "
4621                           "compatibility level 1.1 and above (use compat=1.1 "
4622                           "or greater)");
4623                return -EINVAL;
4624            }
4625            s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
4626            ret = qcow2_update_header(bs);
4627            if (ret < 0) {
4628                s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
4629                error_setg_errno(errp, -ret, "Failed to update the image header");
4630                return ret;
4631            }
4632            s->use_lazy_refcounts = true;
4633        } else {
4634            /* make image clean first */
4635            ret = qcow2_mark_clean(bs);
4636            if (ret < 0) {
4637                error_setg_errno(errp, -ret, "Failed to make the image clean");
4638                return ret;
4639            }
4640            /* now disallow lazy refcounts */
4641            s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
4642            ret = qcow2_update_header(bs);
4643            if (ret < 0) {
4644                s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
4645                error_setg_errno(errp, -ret, "Failed to update the image header");
4646                return ret;
4647            }
4648            s->use_lazy_refcounts = false;
4649        }
4650    }
4651
4652    if (new_size) {
4653        BlockBackend *blk = blk_new(BLK_PERM_RESIZE, BLK_PERM_ALL);
4654        ret = blk_insert_bs(blk, bs, errp);
4655        if (ret < 0) {
4656            blk_unref(blk);
4657            return ret;
4658        }
4659
4660        ret = blk_truncate(blk, new_size, PREALLOC_MODE_OFF, errp);
4661        blk_unref(blk);
4662        if (ret < 0) {
4663            return ret;
4664        }
4665    }
4666
4667    /* Downgrade last (so unsupported features can be removed before) */
4668    if (new_version < old_version) {
4669        helper_cb_info.current_operation = QCOW2_DOWNGRADING;
4670        ret = qcow2_downgrade(bs, new_version, &qcow2_amend_helper_cb,
4671                              &helper_cb_info, errp);
4672        if (ret < 0) {
4673            return ret;
4674        }
4675    }
4676
4677    return 0;
4678}
4679
4680/*
4681 * If offset or size are negative, respectively, they will not be included in
4682 * the BLOCK_IMAGE_CORRUPTED event emitted.
4683 * fatal will be ignored for read-only BDS; corruptions found there will always
4684 * be considered non-fatal.
4685 */
4686void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
4687                             int64_t size, const char *message_format, ...)
4688{
4689    BDRVQcow2State *s = bs->opaque;
4690    const char *node_name;
4691    char *message;
4692    va_list ap;
4693
4694    fatal = fatal && bdrv_is_writable(bs);
4695
4696    if (s->signaled_corruption &&
4697        (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT)))
4698    {
4699        return;
4700    }
4701
4702    va_start(ap, message_format);
4703    message = g_strdup_vprintf(message_format, ap);
4704    va_end(ap);
4705
4706    if (fatal) {
4707        fprintf(stderr, "qcow2: Marking image as corrupt: %s; further "
4708                "corruption events will be suppressed\n", message);
4709    } else {
4710        fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal "
4711                "corruption events will be suppressed\n", message);
4712    }
4713
4714    node_name = bdrv_get_node_name(bs);
4715    qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs),
4716                                          *node_name != '\0', node_name,
4717                                          message, offset >= 0, offset,
4718                                          size >= 0, size,
4719                                          fatal);
4720    g_free(message);
4721
4722    if (fatal) {
4723        qcow2_mark_corrupt(bs);
4724        bs->drv = NULL; /* make BDS unusable */
4725    }
4726
4727    s->signaled_corruption = true;
4728}
4729
4730static QemuOptsList qcow2_create_opts = {
4731    .name = "qcow2-create-opts",
4732    .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head),
4733    .desc = {
4734        {
4735            .name = BLOCK_OPT_SIZE,
4736            .type = QEMU_OPT_SIZE,
4737            .help = "Virtual disk size"
4738        },
4739        {
4740            .name = BLOCK_OPT_COMPAT_LEVEL,
4741            .type = QEMU_OPT_STRING,
4742            .help = "Compatibility level (0.10 or 1.1)"
4743        },
4744        {
4745            .name = BLOCK_OPT_BACKING_FILE,
4746            .type = QEMU_OPT_STRING,
4747            .help = "File name of a base image"
4748        },
4749        {
4750            .name = BLOCK_OPT_BACKING_FMT,
4751            .type = QEMU_OPT_STRING,
4752            .help = "Image format of the base image"
4753        },
4754        {
4755            .name = BLOCK_OPT_ENCRYPT,
4756            .type = QEMU_OPT_BOOL,
4757            .help = "Encrypt the image with format 'aes'. (Deprecated "
4758                    "in favor of " BLOCK_OPT_ENCRYPT_FORMAT "=aes)",
4759        },
4760        {
4761            .name = BLOCK_OPT_ENCRYPT_FORMAT,
4762            .type = QEMU_OPT_STRING,
4763            .help = "Encrypt the image, format choices: 'aes', 'luks'",
4764        },
4765        BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",
4766            "ID of secret providing qcow AES key or LUKS passphrase"),
4767        BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG("encrypt."),
4768        BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE("encrypt."),
4769        BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG("encrypt."),
4770        BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG("encrypt."),
4771        BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG("encrypt."),
4772        BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME("encrypt."),
4773        {
4774            .name = BLOCK_OPT_CLUSTER_SIZE,
4775            .type = QEMU_OPT_SIZE,
4776            .help = "qcow2 cluster size",
4777            .def_value_str = stringify(DEFAULT_CLUSTER_SIZE)
4778        },
4779        {
4780            .name = BLOCK_OPT_PREALLOC,
4781            .type = QEMU_OPT_STRING,
4782            .help = "Preallocation mode (allowed values: off, metadata, "
4783                    "falloc, full)"
4784        },
4785        {
4786            .name = BLOCK_OPT_LAZY_REFCOUNTS,
4787            .type = QEMU_OPT_BOOL,
4788            .help = "Postpone refcount updates",
4789            .def_value_str = "off"
4790        },
4791        {
4792            .name = BLOCK_OPT_REFCOUNT_BITS,
4793            .type = QEMU_OPT_NUMBER,
4794            .help = "Width of a reference count entry in bits",
4795            .def_value_str = "16"
4796        },
4797        { /* end of list */ }
4798    }
4799};
4800
4801BlockDriver bdrv_qcow2 = {
4802    .format_name        = "qcow2",
4803    .instance_size      = sizeof(BDRVQcow2State),
4804    .bdrv_probe         = qcow2_probe,
4805    .bdrv_open          = qcow2_open,
4806    .bdrv_close         = qcow2_close,
4807    .bdrv_reopen_prepare  = qcow2_reopen_prepare,
4808    .bdrv_reopen_commit   = qcow2_reopen_commit,
4809    .bdrv_reopen_abort    = qcow2_reopen_abort,
4810    .bdrv_join_options    = qcow2_join_options,
4811    .bdrv_child_perm      = bdrv_format_default_perms,
4812    .bdrv_co_create_opts  = qcow2_co_create_opts,
4813    .bdrv_co_create       = qcow2_co_create,
4814    .bdrv_has_zero_init = bdrv_has_zero_init_1,
4815    .bdrv_co_block_status = qcow2_co_block_status,
4816
4817    .bdrv_co_preadv         = qcow2_co_preadv,
4818    .bdrv_co_pwritev        = qcow2_co_pwritev,
4819    .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
4820
4821    .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
4822    .bdrv_co_pdiscard       = qcow2_co_pdiscard,
4823    .bdrv_co_copy_range_from = qcow2_co_copy_range_from,
4824    .bdrv_co_copy_range_to  = qcow2_co_copy_range_to,
4825    .bdrv_co_truncate       = qcow2_co_truncate,
4826    .bdrv_co_pwritev_compressed = qcow2_co_pwritev_compressed,
4827    .bdrv_make_empty        = qcow2_make_empty,
4828
4829    .bdrv_snapshot_create   = qcow2_snapshot_create,
4830    .bdrv_snapshot_goto     = qcow2_snapshot_goto,
4831    .bdrv_snapshot_delete   = qcow2_snapshot_delete,
4832    .bdrv_snapshot_list     = qcow2_snapshot_list,
4833    .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
4834    .bdrv_measure           = qcow2_measure,
4835    .bdrv_get_info          = qcow2_get_info,
4836    .bdrv_get_specific_info = qcow2_get_specific_info,
4837
4838    .bdrv_save_vmstate    = qcow2_save_vmstate,
4839    .bdrv_load_vmstate    = qcow2_load_vmstate,
4840
4841    .supports_backing           = true,
4842    .bdrv_change_backing_file   = qcow2_change_backing_file,
4843
4844    .bdrv_refresh_limits        = qcow2_refresh_limits,
4845    .bdrv_co_invalidate_cache   = qcow2_co_invalidate_cache,
4846    .bdrv_inactivate            = qcow2_inactivate,
4847
4848    .create_opts         = &qcow2_create_opts,
4849    .bdrv_co_check       = qcow2_co_check,
4850    .bdrv_amend_options  = qcow2_amend_options,
4851
4852    .bdrv_detach_aio_context  = qcow2_detach_aio_context,
4853    .bdrv_attach_aio_context  = qcow2_attach_aio_context,
4854
4855    .bdrv_reopen_bitmaps_rw = qcow2_reopen_bitmaps_rw,
4856    .bdrv_can_store_new_dirty_bitmap = qcow2_can_store_new_dirty_bitmap,
4857    .bdrv_remove_persistent_dirty_bitmap = qcow2_remove_persistent_dirty_bitmap,
4858};
4859
4860static void bdrv_qcow2_init(void)
4861{
4862    bdrv_register(&bdrv_qcow2);
4863}
4864
4865block_init(bdrv_qcow2_init);
4866