qemu/block/qcow2.c
<<
>>
Prefs
   1/*
   2 * Block driver for the QCOW version 2 format
   3 *
   4 * Copyright (c) 2004-2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24#include "qemu-common.h"
  25#include "block/block_int.h"
  26#include "qemu/module.h"
  27#include <zlib.h>
  28#include "qemu/aes.h"
  29#include "block/qcow2.h"
  30#include "qemu/error-report.h"
  31#include "qapi/qmp/qerror.h"
  32#include "qapi/qmp/qbool.h"
  33#include "trace.h"
  34
  35/*
  36  Differences with QCOW:
  37
  38  - Support for multiple incremental snapshots.
  39  - Memory management by reference counts.
  40  - Clusters which have a reference count of one have the bit
  41    QCOW_OFLAG_COPIED to optimize write performance.
  42  - Size of compressed clusters is stored in sectors to reduce bit usage
  43    in the cluster offsets.
  44  - Support for storing additional data (such as the VM state) in the
  45    snapshots.
  46  - If a backing store is used, the cluster size is not constrained
  47    (could be backported to QCOW).
   48  - L2 tables always have a size of one cluster.
  49*/
  50
  51
  52typedef struct {
  53    uint32_t magic;
  54    uint32_t len;
  55} QEMU_PACKED QCowExtension;
  56
  57#define  QCOW2_EXT_MAGIC_END 0
  58#define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
  59#define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
  60
  61static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
  62{
  63    const QCowHeader *cow_header = (const void *)buf;
  64
  65    if (buf_size >= sizeof(QCowHeader) &&
  66        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
  67        be32_to_cpu(cow_header->version) >= 2)
  68        return 100;
  69    else
  70        return 0;
  71}
  72
  73
  74/* 
  75 * read qcow2 extension and fill bs
  76 * start reading from start_offset
  77 * finish reading upon magic of value 0 or when end_offset reached
  78 * unknown magic is skipped (future extension this version knows nothing about)
  79 * return 0 upon success, non-0 otherwise
  80 */
  81static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
  82                                 uint64_t end_offset, void **p_feature_table,
  83                                 Error **errp)
  84{
  85    BDRVQcowState *s = bs->opaque;
  86    QCowExtension ext;
  87    uint64_t offset;
  88    int ret;
  89
  90#ifdef DEBUG_EXT
  91    printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
  92#endif
  93    offset = start_offset;
  94    while (offset < end_offset) {
  95
  96#ifdef DEBUG_EXT
  97        /* Sanity check */
  98        if (offset > s->cluster_size)
  99            printf("qcow2_read_extension: suspicious offset %lu\n", offset);
 100
 101        printf("attempting to read extended header in offset %lu\n", offset);
 102#endif
 103
 104        ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext));
 105        if (ret < 0) {
 106            error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: "
 107                             "pread fail from offset %" PRIu64, offset);
 108            return 1;
 109        }
 110        be32_to_cpus(&ext.magic);
 111        be32_to_cpus(&ext.len);
 112        offset += sizeof(ext);
 113#ifdef DEBUG_EXT
 114        printf("ext.magic = 0x%x\n", ext.magic);
 115#endif
 116        if (ext.len > end_offset - offset) {
 117            error_setg(errp, "Header extension too large");
 118            return -EINVAL;
 119        }
 120
 121        switch (ext.magic) {
 122        case QCOW2_EXT_MAGIC_END:
 123            return 0;
 124
 125        case QCOW2_EXT_MAGIC_BACKING_FORMAT:
 126            if (ext.len >= sizeof(bs->backing_format)) {
 127                error_setg(errp, "ERROR: ext_backing_format: len=%u too large"
 128                           " (>=%zu)", ext.len, sizeof(bs->backing_format));
 129                return 2;
 130            }
 131            ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len);
 132            if (ret < 0) {
 133                error_setg_errno(errp, -ret, "ERROR: ext_backing_format: "
 134                                 "Could not read format name");
 135                return 3;
 136            }
 137            bs->backing_format[ext.len] = '\0';
 138#ifdef DEBUG_EXT
 139            printf("Qcow2: Got format extension %s\n", bs->backing_format);
 140#endif
 141            break;
 142
 143        case QCOW2_EXT_MAGIC_FEATURE_TABLE:
 144            if (p_feature_table != NULL) {
 145                void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
 146                ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
 147                if (ret < 0) {
 148                    error_setg_errno(errp, -ret, "ERROR: ext_feature_table: "
 149                                     "Could not read table");
 150                    return ret;
 151                }
 152
 153                *p_feature_table = feature_table;
 154            }
 155            break;
 156
 157        default:
 158            /* unknown magic - save it in case we need to rewrite the header */
 159            {
 160                Qcow2UnknownHeaderExtension *uext;
 161
 162                uext = g_malloc0(sizeof(*uext)  + ext.len);
 163                uext->magic = ext.magic;
 164                uext->len = ext.len;
 165                QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);
 166
 167                ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
 168                if (ret < 0) {
 169                    error_setg_errno(errp, -ret, "ERROR: unknown extension: "
 170                                     "Could not read data");
 171                    return ret;
 172                }
 173            }
 174            break;
 175        }
 176
 177        offset += ((ext.len + 7) & ~7);
 178    }
 179
 180    return 0;
 181}
 182
 183static void cleanup_unknown_header_ext(BlockDriverState *bs)
 184{
 185    BDRVQcowState *s = bs->opaque;
 186    Qcow2UnknownHeaderExtension *uext, *next;
 187
 188    QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) {
 189        QLIST_REMOVE(uext, next);
 190        g_free(uext);
 191    }
 192}
 193
 194static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs,
 195    Error **errp, const char *fmt, ...)
 196{
 197    char msg[64];
 198    va_list ap;
 199
 200    va_start(ap, fmt);
 201    vsnprintf(msg, sizeof(msg), fmt, ap);
 202    va_end(ap);
 203
 204    error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "qcow2",
 205              msg);
 206}
 207
 208static void report_unsupported_feature(BlockDriverState *bs,
 209    Error **errp, Qcow2Feature *table, uint64_t mask)
 210{
 211    while (table && table->name[0] != '\0') {
 212        if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) {
 213            if (mask & (1 << table->bit)) {
 214                report_unsupported(bs, errp, "%.46s", table->name);
 215                mask &= ~(1 << table->bit);
 216            }
 217        }
 218        table++;
 219    }
 220
 221    if (mask) {
 222        report_unsupported(bs, errp, "Unknown incompatible feature: %" PRIx64,
 223                           mask);
 224    }
 225}
 226
 227/*
 228 * Sets the dirty bit and flushes afterwards if necessary.
 229 *
 230 * The incompatible_features bit is only set if the image file header was
 231 * updated successfully.  Therefore it is not required to check the return
 232 * value of this function.
 233 */
 234int qcow2_mark_dirty(BlockDriverState *bs)
 235{
 236    BDRVQcowState *s = bs->opaque;
 237    uint64_t val;
 238    int ret;
 239
 240    assert(s->qcow_version >= 3);
 241
 242    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
 243        return 0; /* already dirty */
 244    }
 245
 246    val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
 247    ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
 248                      &val, sizeof(val));
 249    if (ret < 0) {
 250        return ret;
 251    }
 252    ret = bdrv_flush(bs->file);
 253    if (ret < 0) {
 254        return ret;
 255    }
 256
 257    /* Only treat image as dirty if the header was updated successfully */
 258    s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
 259    return 0;
 260}
 261
 262/*
 263 * Clears the dirty bit and flushes before if necessary.  Only call this
 264 * function when there are no pending requests, it does not guard against
 265 * concurrent requests dirtying the image.
 266 */
 267static int qcow2_mark_clean(BlockDriverState *bs)
 268{
 269    BDRVQcowState *s = bs->opaque;
 270
 271    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
 272        int ret = bdrv_flush(bs);
 273        if (ret < 0) {
 274            return ret;
 275        }
 276
 277        s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
 278        return qcow2_update_header(bs);
 279    }
 280    return 0;
 281}
 282
 283/*
 284 * Marks the image as corrupt.
 285 */
 286int qcow2_mark_corrupt(BlockDriverState *bs)
 287{
 288    BDRVQcowState *s = bs->opaque;
 289
 290    s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT;
 291    return qcow2_update_header(bs);
 292}
 293
 294/*
 295 * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes
 296 * before if necessary.
 297 */
 298int qcow2_mark_consistent(BlockDriverState *bs)
 299{
 300    BDRVQcowState *s = bs->opaque;
 301
 302    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
 303        int ret = bdrv_flush(bs);
 304        if (ret < 0) {
 305            return ret;
 306        }
 307
 308        s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT;
 309        return qcow2_update_header(bs);
 310    }
 311    return 0;
 312}
 313
 314static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
 315                       BdrvCheckMode fix)
 316{
 317    int ret = qcow2_check_refcounts(bs, result, fix);
 318    if (ret < 0) {
 319        return ret;
 320    }
 321
 322    if (fix && result->check_errors == 0 && result->corruptions == 0) {
 323        ret = qcow2_mark_clean(bs);
 324        if (ret < 0) {
 325            return ret;
 326        }
 327        return qcow2_mark_consistent(bs);
 328    }
 329    return ret;
 330}
 331
 332static QemuOptsList qcow2_runtime_opts = {
 333    .name = "qcow2",
 334    .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
 335    .desc = {
 336        {
 337            .name = QCOW2_OPT_LAZY_REFCOUNTS,
 338            .type = QEMU_OPT_BOOL,
 339            .help = "Postpone refcount updates",
 340        },
 341        {
 342            .name = QCOW2_OPT_DISCARD_REQUEST,
 343            .type = QEMU_OPT_BOOL,
 344            .help = "Pass guest discard requests to the layer below",
 345        },
 346        {
 347            .name = QCOW2_OPT_DISCARD_SNAPSHOT,
 348            .type = QEMU_OPT_BOOL,
 349            .help = "Generate discard requests when snapshot related space "
 350                    "is freed",
 351        },
 352        {
 353            .name = QCOW2_OPT_DISCARD_OTHER,
 354            .type = QEMU_OPT_BOOL,
 355            .help = "Generate discard requests when other clusters are freed",
 356        },
 357        {
 358            .name = QCOW2_OPT_OVERLAP,
 359            .type = QEMU_OPT_STRING,
 360            .help = "Selects which overlap checks to perform from a range of "
 361                    "templates (none, constant, cached, all)",
 362        },
 363        {
 364            .name = QCOW2_OPT_OVERLAP_MAIN_HEADER,
 365            .type = QEMU_OPT_BOOL,
 366            .help = "Check for unintended writes into the main qcow2 header",
 367        },
 368        {
 369            .name = QCOW2_OPT_OVERLAP_ACTIVE_L1,
 370            .type = QEMU_OPT_BOOL,
 371            .help = "Check for unintended writes into the active L1 table",
 372        },
 373        {
 374            .name = QCOW2_OPT_OVERLAP_ACTIVE_L2,
 375            .type = QEMU_OPT_BOOL,
 376            .help = "Check for unintended writes into an active L2 table",
 377        },
 378        {
 379            .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
 380            .type = QEMU_OPT_BOOL,
 381            .help = "Check for unintended writes into the refcount table",
 382        },
 383        {
 384            .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
 385            .type = QEMU_OPT_BOOL,
 386            .help = "Check for unintended writes into a refcount block",
 387        },
 388        {
 389            .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
 390            .type = QEMU_OPT_BOOL,
 391            .help = "Check for unintended writes into the snapshot table",
 392        },
 393        {
 394            .name = QCOW2_OPT_OVERLAP_INACTIVE_L1,
 395            .type = QEMU_OPT_BOOL,
 396            .help = "Check for unintended writes into an inactive L1 table",
 397        },
 398        {
 399            .name = QCOW2_OPT_OVERLAP_INACTIVE_L2,
 400            .type = QEMU_OPT_BOOL,
 401            .help = "Check for unintended writes into an inactive L2 table",
 402        },
 403        { /* end of list */ }
 404    },
 405};
 406
 407static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
 408    [QCOW2_OL_MAIN_HEADER_BITNR]    = QCOW2_OPT_OVERLAP_MAIN_HEADER,
 409    [QCOW2_OL_ACTIVE_L1_BITNR]      = QCOW2_OPT_OVERLAP_ACTIVE_L1,
 410    [QCOW2_OL_ACTIVE_L2_BITNR]      = QCOW2_OPT_OVERLAP_ACTIVE_L2,
 411    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
 412    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
 413    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
 414    [QCOW2_OL_INACTIVE_L1_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L1,
 415    [QCOW2_OL_INACTIVE_L2_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L2,
 416};
 417
 418static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
 419                      Error **errp)
 420{
 421    BDRVQcowState *s = bs->opaque;
 422    int len, i, ret = 0;
 423    QCowHeader header;
 424    QemuOpts *opts;
 425    Error *local_err = NULL;
 426    uint64_t ext_end;
 427    uint64_t l1_vm_state_index;
 428    const char *opt_overlap_check;
 429    int overlap_check_template = 0;
 430
 431    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
 432    if (ret < 0) {
 433        error_setg_errno(errp, -ret, "Could not read qcow2 header");
 434        goto fail;
 435    }
 436    be32_to_cpus(&header.magic);
 437    be32_to_cpus(&header.version);
 438    be64_to_cpus(&header.backing_file_offset);
 439    be32_to_cpus(&header.backing_file_size);
 440    be64_to_cpus(&header.size);
 441    be32_to_cpus(&header.cluster_bits);
 442    be32_to_cpus(&header.crypt_method);
 443    be64_to_cpus(&header.l1_table_offset);
 444    be32_to_cpus(&header.l1_size);
 445    be64_to_cpus(&header.refcount_table_offset);
 446    be32_to_cpus(&header.refcount_table_clusters);
 447    be64_to_cpus(&header.snapshots_offset);
 448    be32_to_cpus(&header.nb_snapshots);
 449
 450    if (header.magic != QCOW_MAGIC) {
 451        error_setg(errp, "Image is not in qcow2 format");
 452        ret = -EMEDIUMTYPE;
 453        goto fail;
 454    }
 455    if (header.version < 2 || header.version > 3) {
 456        report_unsupported(bs, errp, "QCOW version %d", header.version);
 457        ret = -ENOTSUP;
 458        goto fail;
 459    }
 460
 461    s->qcow_version = header.version;
 462
 463    /* Initialise version 3 header fields */
 464    if (header.version == 2) {
 465        header.incompatible_features    = 0;
 466        header.compatible_features      = 0;
 467        header.autoclear_features       = 0;
 468        header.refcount_order           = 4;
 469        header.header_length            = 72;
 470    } else {
 471        be64_to_cpus(&header.incompatible_features);
 472        be64_to_cpus(&header.compatible_features);
 473        be64_to_cpus(&header.autoclear_features);
 474        be32_to_cpus(&header.refcount_order);
 475        be32_to_cpus(&header.header_length);
 476    }
 477
 478    if (header.header_length > sizeof(header)) {
 479        s->unknown_header_fields_size = header.header_length - sizeof(header);
 480        s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
 481        ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
 482                         s->unknown_header_fields_size);
 483        if (ret < 0) {
 484            error_setg_errno(errp, -ret, "Could not read unknown qcow2 header "
 485                             "fields");
 486            goto fail;
 487        }
 488    }
 489
 490    if (header.backing_file_offset) {
 491        ext_end = header.backing_file_offset;
 492    } else {
 493        ext_end = 1 << header.cluster_bits;
 494    }
 495
 496    /* Handle feature bits */
 497    s->incompatible_features    = header.incompatible_features;
 498    s->compatible_features      = header.compatible_features;
 499    s->autoclear_features       = header.autoclear_features;
 500
 501    if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
 502        void *feature_table = NULL;
 503        qcow2_read_extensions(bs, header.header_length, ext_end,
 504                              &feature_table, NULL);
 505        report_unsupported_feature(bs, errp, feature_table,
 506                                   s->incompatible_features &
 507                                   ~QCOW2_INCOMPAT_MASK);
 508        ret = -ENOTSUP;
 509        goto fail;
 510    }
 511
 512    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
 513        /* Corrupt images may not be written to unless they are being repaired
 514         */
 515        if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
 516            error_setg(errp, "qcow2: Image is corrupt; cannot be opened "
 517                       "read/write");
 518            ret = -EACCES;
 519            goto fail;
 520        }
 521    }
 522
 523    /* Check support for various header values */
 524    if (header.refcount_order != 4) {
 525        report_unsupported(bs, errp, "%d bit reference counts",
 526                           1 << header.refcount_order);
 527        ret = -ENOTSUP;
 528        goto fail;
 529    }
 530    s->refcount_order = header.refcount_order;
 531
 532    if (header.cluster_bits < MIN_CLUSTER_BITS ||
 533        header.cluster_bits > MAX_CLUSTER_BITS) {
 534        error_setg(errp, "Unsupported cluster size: 2^%i", header.cluster_bits);
 535        ret = -EINVAL;
 536        goto fail;
 537    }
 538    if (header.crypt_method > QCOW_CRYPT_AES) {
 539        error_setg(errp, "Unsupported encryption method: %i",
 540                   header.crypt_method);
 541        ret = -EINVAL;
 542        goto fail;
 543    }
 544    s->crypt_method_header = header.crypt_method;
 545    if (s->crypt_method_header) {
 546        bs->encrypted = 1;
 547    }
 548    s->cluster_bits = header.cluster_bits;
 549    s->cluster_size = 1 << s->cluster_bits;
 550    s->cluster_sectors = 1 << (s->cluster_bits - 9);
 551    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
 552    s->l2_size = 1 << s->l2_bits;
 553    bs->total_sectors = header.size / 512;
 554    s->csize_shift = (62 - (s->cluster_bits - 8));
 555    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
 556    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
 557    s->refcount_table_offset = header.refcount_table_offset;
 558    s->refcount_table_size =
 559        header.refcount_table_clusters << (s->cluster_bits - 3);
 560
 561    s->snapshots_offset = header.snapshots_offset;
 562    s->nb_snapshots = header.nb_snapshots;
 563
 564    /* read the level 1 table */
 565    s->l1_size = header.l1_size;
 566
 567    l1_vm_state_index = size_to_l1(s, header.size);
 568    if (l1_vm_state_index > INT_MAX) {
 569        error_setg(errp, "Image is too big");
 570        ret = -EFBIG;
 571        goto fail;
 572    }
 573    s->l1_vm_state_index = l1_vm_state_index;
 574
 575    /* the L1 table must contain at least enough entries to put
 576       header.size bytes */
 577    if (s->l1_size < s->l1_vm_state_index) {
 578        error_setg(errp, "L1 table is too small");
 579        ret = -EINVAL;
 580        goto fail;
 581    }
 582    s->l1_table_offset = header.l1_table_offset;
 583    if (s->l1_size > 0) {
 584        s->l1_table = g_malloc0(
 585            align_offset(s->l1_size * sizeof(uint64_t), 512));
 586        ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
 587                         s->l1_size * sizeof(uint64_t));
 588        if (ret < 0) {
 589            error_setg_errno(errp, -ret, "Could not read L1 table");
 590            goto fail;
 591        }
 592        for(i = 0;i < s->l1_size; i++) {
 593            be64_to_cpus(&s->l1_table[i]);
 594        }
 595    }
 596
 597    /* alloc L2 table/refcount block cache */
 598    s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE);
 599    s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE);
 600
 601    s->cluster_cache = g_malloc(s->cluster_size);
 602    /* one more sector for decompressed data alignment */
 603    s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
 604                                  + 512);
 605    s->cluster_cache_offset = -1;
 606    s->flags = flags;
 607
 608    ret = qcow2_refcount_init(bs);
 609    if (ret != 0) {
 610        error_setg_errno(errp, -ret, "Could not initialize refcount handling");
 611        goto fail;
 612    }
 613
 614    QLIST_INIT(&s->cluster_allocs);
 615    QTAILQ_INIT(&s->discards);
 616
 617    /* read qcow2 extensions */
 618    if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL,
 619        &local_err)) {
 620        error_propagate(errp, local_err);
 621        ret = -EINVAL;
 622        goto fail;
 623    }
 624
 625    /* read the backing file name */
 626    if (header.backing_file_offset != 0) {
 627        len = header.backing_file_size;
 628        if (len > 1023) {
 629            len = 1023;
 630        }
 631        ret = bdrv_pread(bs->file, header.backing_file_offset,
 632                         bs->backing_file, len);
 633        if (ret < 0) {
 634            error_setg_errno(errp, -ret, "Could not read backing file name");
 635            goto fail;
 636        }
 637        bs->backing_file[len] = '\0';
 638    }
 639
 640    ret = qcow2_read_snapshots(bs);
 641    if (ret < 0) {
 642        error_setg_errno(errp, -ret, "Could not read snapshots");
 643        goto fail;
 644    }
 645
 646    /* Clear unknown autoclear feature bits */
 647    if (!bs->read_only && s->autoclear_features != 0) {
 648        s->autoclear_features = 0;
 649        ret = qcow2_update_header(bs);
 650        if (ret < 0) {
 651            error_setg_errno(errp, -ret, "Could not update qcow2 header");
 652            goto fail;
 653        }
 654    }
 655
 656    /* Initialise locks */
 657    qemu_co_mutex_init(&s->lock);
 658
 659    /* Repair image if dirty */
 660    if (!(flags & BDRV_O_CHECK) && !bs->read_only &&
 661        (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
 662        BdrvCheckResult result = {0};
 663
 664        ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS);
 665        if (ret < 0) {
 666            error_setg_errno(errp, -ret, "Could not repair dirty image");
 667            goto fail;
 668        }
 669    }
 670
 671    /* Enable lazy_refcounts according to image and command line options */
 672    opts = qemu_opts_create_nofail(&qcow2_runtime_opts);
 673    qemu_opts_absorb_qdict(opts, options, &local_err);
 674    if (error_is_set(&local_err)) {
 675        error_propagate(errp, local_err);
 676        ret = -EINVAL;
 677        goto fail;
 678    }
 679
 680    s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
 681        (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
 682
 683    s->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
 684    s->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
 685    s->discard_passthrough[QCOW2_DISCARD_REQUEST] =
 686        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
 687                          flags & BDRV_O_UNMAP);
 688    s->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
 689        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
 690    s->discard_passthrough[QCOW2_DISCARD_OTHER] =
 691        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);
 692
 693    opt_overlap_check = qemu_opt_get(opts, "overlap-check") ?: "cached";
 694    if (!strcmp(opt_overlap_check, "none")) {
 695        overlap_check_template = 0;
 696    } else if (!strcmp(opt_overlap_check, "constant")) {
 697        overlap_check_template = QCOW2_OL_CONSTANT;
 698    } else if (!strcmp(opt_overlap_check, "cached")) {
 699        overlap_check_template = QCOW2_OL_CACHED;
 700    } else if (!strcmp(opt_overlap_check, "all")) {
 701        overlap_check_template = QCOW2_OL_ALL;
 702    } else {
 703        error_setg(errp, "Unsupported value '%s' for qcow2 option "
 704                   "'overlap-check'. Allowed are either of the following: "
 705                   "none, constant, cached, all", opt_overlap_check);
 706        qemu_opts_del(opts);
 707        ret = -EINVAL;
 708        goto fail;
 709    }
 710
 711    s->overlap_check = 0;
 712    for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) {
 713        /* overlap-check defines a template bitmask, but every flag may be
 714         * overwritten through the associated boolean option */
 715        s->overlap_check |=
 716            qemu_opt_get_bool(opts, overlap_bool_option_names[i],
 717                              overlap_check_template & (1 << i)) << i;
 718    }
 719
 720    qemu_opts_del(opts);
 721
 722    if (s->use_lazy_refcounts && s->qcow_version < 3) {
 723        error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
 724                   "qemu 1.1 compatibility level");
 725        ret = -EINVAL;
 726        goto fail;
 727    }
 728
 729#ifdef DEBUG_ALLOC
 730    {
 731        BdrvCheckResult result = {0};
 732        qcow2_check_refcounts(bs, &result, 0);
 733    }
 734#endif
 735    return ret;
 736
 737 fail:
 738    g_free(s->unknown_header_fields);
 739    cleanup_unknown_header_ext(bs);
 740    qcow2_free_snapshots(bs);
 741    qcow2_refcount_close(bs);
 742    g_free(s->l1_table);
 743    /* else pre-write overlap checks in cache_destroy may crash */
 744    s->l1_table = NULL;
 745    if (s->l2_table_cache) {
 746        qcow2_cache_destroy(bs, s->l2_table_cache);
 747    }
 748    g_free(s->cluster_cache);
 749    qemu_vfree(s->cluster_data);
 750    return ret;
 751}
 752
 753static int qcow2_set_key(BlockDriverState *bs, const char *key)
 754{
 755    BDRVQcowState *s = bs->opaque;
 756    uint8_t keybuf[16];
 757    int len, i;
 758
 759    memset(keybuf, 0, 16);
 760    len = strlen(key);
 761    if (len > 16)
 762        len = 16;
 763    /* XXX: we could compress the chars to 7 bits to increase
 764       entropy */
 765    for(i = 0;i < len;i++) {
 766        keybuf[i] = key[i];
 767    }
 768    s->crypt_method = s->crypt_method_header;
 769
 770    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
 771        return -1;
 772    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
 773        return -1;
 774#if 0
 775    /* test */
 776    {
 777        uint8_t in[16];
 778        uint8_t out[16];
 779        uint8_t tmp[16];
 780        for(i=0;i<16;i++)
 781            in[i] = i;
 782        AES_encrypt(in, tmp, &s->aes_encrypt_key);
 783        AES_decrypt(tmp, out, &s->aes_decrypt_key);
 784        for(i = 0; i < 16; i++)
 785            printf(" %02x", tmp[i]);
 786        printf("\n");
 787        for(i = 0; i < 16; i++)
 788            printf(" %02x", out[i]);
 789        printf("\n");
 790    }
 791#endif
 792    return 0;
 793}
 794
 795/* We have nothing to do for QCOW2 reopen, stubs just return
 796 * success */
 797static int qcow2_reopen_prepare(BDRVReopenState *state,
 798                                BlockReopenQueue *queue, Error **errp)
 799{
 800    return 0;
 801}
 802
 803static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
 804        int64_t sector_num, int nb_sectors, int *pnum)
 805{
 806    BDRVQcowState *s = bs->opaque;
 807    uint64_t cluster_offset;
 808    int index_in_cluster, ret;
 809    int64_t status = 0;
 810
 811    *pnum = nb_sectors;
 812    qemu_co_mutex_lock(&s->lock);
 813    ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
 814    qemu_co_mutex_unlock(&s->lock);
 815    if (ret < 0) {
 816        return ret;
 817    }
 818
 819    if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED &&
 820        !s->crypt_method) {
 821        index_in_cluster = sector_num & (s->cluster_sectors - 1);
 822        cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS);
 823        status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset;
 824    }
 825    if (ret == QCOW2_CLUSTER_ZERO) {
 826        status |= BDRV_BLOCK_ZERO;
 827    } else if (ret != QCOW2_CLUSTER_UNALLOCATED) {
 828        status |= BDRV_BLOCK_DATA;
 829    }
 830    return status;
 831}
 832
 833/* handle reading after the end of the backing file */
 834int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
 835                  int64_t sector_num, int nb_sectors)
 836{
 837    int n1;
 838    if ((sector_num + nb_sectors) <= bs->total_sectors)
 839        return nb_sectors;
 840    if (sector_num >= bs->total_sectors)
 841        n1 = 0;
 842    else
 843        n1 = bs->total_sectors - sector_num;
 844
 845    qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1));
 846
 847    return n1;
 848}
 849
 850static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
 851                          int remaining_sectors, QEMUIOVector *qiov)
 852{
 853    BDRVQcowState *s = bs->opaque;
 854    int index_in_cluster, n1;
 855    int ret;
 856    int cur_nr_sectors; /* number of sectors in current iteration */
 857    uint64_t cluster_offset = 0;
 858    uint64_t bytes_done = 0;
 859    QEMUIOVector hd_qiov;
 860    uint8_t *cluster_data = NULL;
 861
 862    qemu_iovec_init(&hd_qiov, qiov->niov);
 863
 864    qemu_co_mutex_lock(&s->lock);
 865
 866    while (remaining_sectors != 0) {
 867
 868        /* prepare next request */
 869        cur_nr_sectors = remaining_sectors;
 870        if (s->crypt_method) {
 871            cur_nr_sectors = MIN(cur_nr_sectors,
 872                QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
 873        }
 874
 875        ret = qcow2_get_cluster_offset(bs, sector_num << 9,
 876            &cur_nr_sectors, &cluster_offset);
 877        if (ret < 0) {
 878            goto fail;
 879        }
 880
 881        index_in_cluster = sector_num & (s->cluster_sectors - 1);
 882
 883        qemu_iovec_reset(&hd_qiov);
 884        qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
 885            cur_nr_sectors * 512);
 886
 887        switch (ret) {
 888        case QCOW2_CLUSTER_UNALLOCATED:
 889
 890            if (bs->backing_hd) {
 891                /* read from the base image */
 892                n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov,
 893                    sector_num, cur_nr_sectors);
 894                if (n1 > 0) {
 895                    BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
 896                    qemu_co_mutex_unlock(&s->lock);
 897                    ret = bdrv_co_readv(bs->backing_hd, sector_num,
 898                                        n1, &hd_qiov);
 899                    qemu_co_mutex_lock(&s->lock);
 900                    if (ret < 0) {
 901                        goto fail;
 902                    }
 903                }
 904            } else {
 905                /* Note: in this case, no need to wait */
 906                qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
 907            }
 908            break;
 909
 910        case QCOW2_CLUSTER_ZERO:
 911            qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
 912            break;
 913
 914        case QCOW2_CLUSTER_COMPRESSED:
 915            /* add AIO support for compressed blocks ? */
 916            ret = qcow2_decompress_cluster(bs, cluster_offset);
 917            if (ret < 0) {
 918                goto fail;
 919            }
 920
 921            qemu_iovec_from_buf(&hd_qiov, 0,
 922                s->cluster_cache + index_in_cluster * 512,
 923                512 * cur_nr_sectors);
 924            break;
 925
 926        case QCOW2_CLUSTER_NORMAL:
 927            if ((cluster_offset & 511) != 0) {
 928                ret = -EIO;
 929                goto fail;
 930            }
 931
 932            if (s->crypt_method) {
 933                /*
 934                 * For encrypted images, read everything into a temporary
 935                 * contiguous buffer on which the AES functions can work.
 936                 */
 937                if (!cluster_data) {
 938                    cluster_data =
 939                        qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
 940                }
 941
 942                assert(cur_nr_sectors <=
 943                    QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
 944                qemu_iovec_reset(&hd_qiov);
 945                qemu_iovec_add(&hd_qiov, cluster_data,
 946                    512 * cur_nr_sectors);
 947            }
 948
 949            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
 950            qemu_co_mutex_unlock(&s->lock);
 951            ret = bdrv_co_readv(bs->file,
 952                                (cluster_offset >> 9) + index_in_cluster,
 953                                cur_nr_sectors, &hd_qiov);
 954            qemu_co_mutex_lock(&s->lock);
 955            if (ret < 0) {
 956                goto fail;
 957            }
 958            if (s->crypt_method) {
 959                qcow2_encrypt_sectors(s, sector_num,  cluster_data,
 960                    cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key);
 961                qemu_iovec_from_buf(qiov, bytes_done,
 962                    cluster_data, 512 * cur_nr_sectors);
 963            }
 964            break;
 965
 966        default:
 967            g_assert_not_reached();
 968            ret = -EIO;
 969            goto fail;
 970        }
 971
 972        remaining_sectors -= cur_nr_sectors;
 973        sector_num += cur_nr_sectors;
 974        bytes_done += cur_nr_sectors * 512;
 975    }
 976    ret = 0;
 977
 978fail:
 979    qemu_co_mutex_unlock(&s->lock);
 980
 981    qemu_iovec_destroy(&hd_qiov);
 982    qemu_vfree(cluster_data);
 983
 984    return ret;
 985}
 986
 987static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
 988                           int64_t sector_num,
 989                           int remaining_sectors,
 990                           QEMUIOVector *qiov)
 991{
 992    BDRVQcowState *s = bs->opaque;
 993    int index_in_cluster;
 994    int n_end;
 995    int ret;
 996    int cur_nr_sectors; /* number of sectors in current iteration */
 997    uint64_t cluster_offset;
 998    QEMUIOVector hd_qiov;
 999    uint64_t bytes_done = 0;
1000    uint8_t *cluster_data = NULL;
1001    QCowL2Meta *l2meta = NULL;
1002
1003    trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
1004                                 remaining_sectors);
1005
1006    qemu_iovec_init(&hd_qiov, qiov->niov);
1007
1008    s->cluster_cache_offset = -1; /* disable compressed cache */
1009
1010    qemu_co_mutex_lock(&s->lock);
1011
1012    while (remaining_sectors != 0) {
1013
1014        l2meta = NULL;
1015
1016        trace_qcow2_writev_start_part(qemu_coroutine_self());
1017        index_in_cluster = sector_num & (s->cluster_sectors - 1);
1018        n_end = index_in_cluster + remaining_sectors;
1019        if (s->crypt_method &&
1020            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) {
1021            n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
1022        }
1023
1024        ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
1025            index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta);
1026        if (ret < 0) {
1027            goto fail;
1028        }
1029
1030        assert((cluster_offset & 511) == 0);
1031
1032        qemu_iovec_reset(&hd_qiov);
1033        qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
1034            cur_nr_sectors * 512);
1035
1036        if (s->crypt_method) {
1037            if (!cluster_data) {
1038                cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS *
1039                                                 s->cluster_size);
1040            }
1041
1042            assert(hd_qiov.size <=
1043                   QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
1044            qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);
1045
1046            qcow2_encrypt_sectors(s, sector_num, cluster_data,
1047                cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key);
1048
1049            qemu_iovec_reset(&hd_qiov);
1050            qemu_iovec_add(&hd_qiov, cluster_data,
1051                cur_nr_sectors * 512);
1052        }
1053
1054        ret = qcow2_pre_write_overlap_check(bs, 0,
1055                cluster_offset + index_in_cluster * BDRV_SECTOR_SIZE,
1056                cur_nr_sectors * BDRV_SECTOR_SIZE);
1057        if (ret < 0) {
1058            goto fail;
1059        }
1060
1061        qemu_co_mutex_unlock(&s->lock);
1062        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
1063        trace_qcow2_writev_data(qemu_coroutine_self(),
1064                                (cluster_offset >> 9) + index_in_cluster);
1065        ret = bdrv_co_writev(bs->file,
1066                             (cluster_offset >> 9) + index_in_cluster,
1067                             cur_nr_sectors, &hd_qiov);
1068        qemu_co_mutex_lock(&s->lock);
1069        if (ret < 0) {
1070            goto fail;
1071        }
1072
1073        while (l2meta != NULL) {
1074            QCowL2Meta *next;
1075
1076            ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
1077            if (ret < 0) {
1078                goto fail;
1079            }
1080
1081            /* Take the request off the list of running requests */
1082            if (l2meta->nb_clusters != 0) {
1083                QLIST_REMOVE(l2meta, next_in_flight);
1084            }
1085
1086            qemu_co_queue_restart_all(&l2meta->dependent_requests);
1087
1088            next = l2meta->next;
1089            g_free(l2meta);
1090            l2meta = next;
1091        }
1092
1093        remaining_sectors -= cur_nr_sectors;
1094        sector_num += cur_nr_sectors;
1095        bytes_done += cur_nr_sectors * 512;
1096        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors);
1097    }
1098    ret = 0;
1099
1100fail:
1101    qemu_co_mutex_unlock(&s->lock);
1102
1103    while (l2meta != NULL) {
1104        QCowL2Meta *next;
1105
1106        if (l2meta->nb_clusters != 0) {
1107            QLIST_REMOVE(l2meta, next_in_flight);
1108        }
1109        qemu_co_queue_restart_all(&l2meta->dependent_requests);
1110
1111        next = l2meta->next;
1112        g_free(l2meta);
1113        l2meta = next;
1114    }
1115
1116    qemu_iovec_destroy(&hd_qiov);
1117    qemu_vfree(cluster_data);
1118    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
1119
1120    return ret;
1121}
1122
1123static void qcow2_close(BlockDriverState *bs)
1124{
1125    BDRVQcowState *s = bs->opaque;
1126    g_free(s->l1_table);
1127    /* else pre-write overlap checks in cache_destroy may crash */
1128    s->l1_table = NULL;
1129
1130    qcow2_cache_flush(bs, s->l2_table_cache);
1131    qcow2_cache_flush(bs, s->refcount_block_cache);
1132
1133    qcow2_mark_clean(bs);
1134
1135    qcow2_cache_destroy(bs, s->l2_table_cache);
1136    qcow2_cache_destroy(bs, s->refcount_block_cache);
1137
1138    g_free(s->unknown_header_fields);
1139    cleanup_unknown_header_ext(bs);
1140
1141    g_free(s->cluster_cache);
1142    qemu_vfree(s->cluster_data);
1143    qcow2_refcount_close(bs);
1144    qcow2_free_snapshots(bs);
1145}
1146
1147static void qcow2_invalidate_cache(BlockDriverState *bs)
1148{
1149    BDRVQcowState *s = bs->opaque;
1150    int flags = s->flags;
1151    AES_KEY aes_encrypt_key;
1152    AES_KEY aes_decrypt_key;
1153    uint32_t crypt_method = 0;
1154    QDict *options;
1155
1156    /*
1157     * Backing files are read-only which makes all of their metadata immutable,
1158     * that means we don't have to worry about reopening them here.
1159     */
1160
1161    if (s->crypt_method) {
1162        crypt_method = s->crypt_method;
1163        memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key));
1164        memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key));
1165    }
1166
1167    qcow2_close(bs);
1168
1169    options = qdict_new();
1170    qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS,
1171              qbool_from_int(s->use_lazy_refcounts));
1172
1173    memset(s, 0, sizeof(BDRVQcowState));
1174    qcow2_open(bs, options, flags, NULL);
1175
1176    QDECREF(options);
1177
1178    if (crypt_method) {
1179        s->crypt_method = crypt_method;
1180        memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key));
1181        memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key));
1182    }
1183}
1184
1185static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
1186    size_t len, size_t buflen)
1187{
1188    QCowExtension *ext_backing_fmt = (QCowExtension*) buf;
1189    size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);
1190
1191    if (buflen < ext_len) {
1192        return -ENOSPC;
1193    }
1194
1195    *ext_backing_fmt = (QCowExtension) {
1196        .magic  = cpu_to_be32(magic),
1197        .len    = cpu_to_be32(len),
1198    };
1199    memcpy(buf + sizeof(QCowExtension), s, len);
1200
1201    return ext_len;
1202}
1203
1204/*
1205 * Updates the qcow2 header, including the variable length parts of it, i.e.
1206 * the backing file name and all extensions. qcow2 was not designed to allow
1207 * such changes, so if we run out of space (we can only use the first cluster)
1208 * this function may fail.
1209 *
1210 * Returns 0 on success, -errno in error cases.
1211 */
1212int qcow2_update_header(BlockDriverState *bs)
1213{
1214    BDRVQcowState *s = bs->opaque;
1215    QCowHeader *header;
1216    char *buf;
1217    size_t buflen = s->cluster_size;
1218    int ret;
1219    uint64_t total_size;
1220    uint32_t refcount_table_clusters;
1221    size_t header_length;
1222    Qcow2UnknownHeaderExtension *uext;
1223
1224    buf = qemu_blockalign(bs, buflen);
1225
1226    /* Header structure */
1227    header = (QCowHeader*) buf;
1228
1229    if (buflen < sizeof(*header)) {
1230        ret = -ENOSPC;
1231        goto fail;
1232    }
1233
1234    header_length = sizeof(*header) + s->unknown_header_fields_size;
1235    total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
1236    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
1237
1238    *header = (QCowHeader) {
1239        /* Version 2 fields */
1240        .magic                  = cpu_to_be32(QCOW_MAGIC),
1241        .version                = cpu_to_be32(s->qcow_version),
1242        .backing_file_offset    = 0,
1243        .backing_file_size      = 0,
1244        .cluster_bits           = cpu_to_be32(s->cluster_bits),
1245        .size                   = cpu_to_be64(total_size),
1246        .crypt_method           = cpu_to_be32(s->crypt_method_header),
1247        .l1_size                = cpu_to_be32(s->l1_size),
1248        .l1_table_offset        = cpu_to_be64(s->l1_table_offset),
1249        .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset),
1250        .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
1251        .nb_snapshots           = cpu_to_be32(s->nb_snapshots),
1252        .snapshots_offset       = cpu_to_be64(s->snapshots_offset),
1253
1254        /* Version 3 fields */
1255        .incompatible_features  = cpu_to_be64(s->incompatible_features),
1256        .compatible_features    = cpu_to_be64(s->compatible_features),
1257        .autoclear_features     = cpu_to_be64(s->autoclear_features),
1258        .refcount_order         = cpu_to_be32(s->refcount_order),
1259        .header_length          = cpu_to_be32(header_length),
1260    };
1261
1262    /* For older versions, write a shorter header */
1263    switch (s->qcow_version) {
1264    case 2:
1265        ret = offsetof(QCowHeader, incompatible_features);
1266        break;
1267    case 3:
1268        ret = sizeof(*header);
1269        break;
1270    default:
1271        ret = -EINVAL;
1272        goto fail;
1273    }
1274
1275    buf += ret;
1276    buflen -= ret;
1277    memset(buf, 0, buflen);
1278
1279    /* Preserve any unknown field in the header */
1280    if (s->unknown_header_fields_size) {
1281        if (buflen < s->unknown_header_fields_size) {
1282            ret = -ENOSPC;
1283            goto fail;
1284        }
1285
1286        memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
1287        buf += s->unknown_header_fields_size;
1288        buflen -= s->unknown_header_fields_size;
1289    }
1290
1291    /* Backing file format header extension */
1292    if (*bs->backing_format) {
1293        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
1294                             bs->backing_format, strlen(bs->backing_format),
1295                             buflen);
1296        if (ret < 0) {
1297            goto fail;
1298        }
1299
1300        buf += ret;
1301        buflen -= ret;
1302    }
1303
1304    /* Feature table */
1305    Qcow2Feature features[] = {
1306        {
1307            .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
1308            .bit  = QCOW2_INCOMPAT_DIRTY_BITNR,
1309            .name = "dirty bit",
1310        },
1311        {
1312            .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
1313            .bit  = QCOW2_INCOMPAT_CORRUPT_BITNR,
1314            .name = "corrupt bit",
1315        },
1316        {
1317            .type = QCOW2_FEAT_TYPE_COMPATIBLE,
1318            .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
1319            .name = "lazy refcounts",
1320        },
1321    };
1322
1323    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
1324                         features, sizeof(features), buflen);
1325    if (ret < 0) {
1326        goto fail;
1327    }
1328    buf += ret;
1329    buflen -= ret;
1330
1331    /* Keep unknown header extensions */
1332    QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
1333        ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
1334        if (ret < 0) {
1335            goto fail;
1336        }
1337
1338        buf += ret;
1339        buflen -= ret;
1340    }
1341
1342    /* End of header extensions */
1343    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
1344    if (ret < 0) {
1345        goto fail;
1346    }
1347
1348    buf += ret;
1349    buflen -= ret;
1350
1351    /* Backing file name */
1352    if (*bs->backing_file) {
1353        size_t backing_file_len = strlen(bs->backing_file);
1354
1355        if (buflen < backing_file_len) {
1356            ret = -ENOSPC;
1357            goto fail;
1358        }
1359
1360        /* Using strncpy is ok here, since buf is not NUL-terminated. */
1361        strncpy(buf, bs->backing_file, buflen);
1362
1363        header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
1364        header->backing_file_size   = cpu_to_be32(backing_file_len);
1365    }
1366
1367    /* Write the new header */
1368    ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
1369    if (ret < 0) {
1370        goto fail;
1371    }
1372
1373    ret = 0;
1374fail:
1375    qemu_vfree(header);
1376    return ret;
1377}
1378
1379static int qcow2_change_backing_file(BlockDriverState *bs,
1380    const char *backing_file, const char *backing_fmt)
1381{
1382    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1383    pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1384
1385    return qcow2_update_header(bs);
1386}
1387
1388static int preallocate(BlockDriverState *bs)
1389{
1390    uint64_t nb_sectors;
1391    uint64_t offset;
1392    uint64_t host_offset = 0;
1393    int num;
1394    int ret;
1395    QCowL2Meta *meta;
1396
1397    nb_sectors = bdrv_getlength(bs) >> 9;
1398    offset = 0;
1399
1400    while (nb_sectors) {
1401        num = MIN(nb_sectors, INT_MAX >> 9);
1402        ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num,
1403                                         &host_offset, &meta);
1404        if (ret < 0) {
1405            return ret;
1406        }
1407
1408        ret = qcow2_alloc_cluster_link_l2(bs, meta);
1409        if (ret < 0) {
1410            qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters,
1411                                    QCOW2_DISCARD_NEVER);
1412            return ret;
1413        }
1414
1415        /* There are no dependent requests, but we need to remove our request
1416         * from the list of in-flight requests */
1417        if (meta != NULL) {
1418            QLIST_REMOVE(meta, next_in_flight);
1419        }
1420
1421        /* TODO Preallocate data if requested */
1422
1423        nb_sectors -= num;
1424        offset += num << 9;
1425    }
1426
1427    /*
1428     * It is expected that the image file is large enough to actually contain
1429     * all of the allocated clusters (otherwise we get failing reads after
1430     * EOF). Extend the image to the last allocated sector.
1431     */
1432    if (host_offset != 0) {
1433        uint8_t buf[512];
1434        memset(buf, 0, 512);
1435        ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1);
1436        if (ret < 0) {
1437            return ret;
1438        }
1439    }
1440
1441    return 0;
1442}
1443
1444static int qcow2_create2(const char *filename, int64_t total_size,
1445                         const char *backing_file, const char *backing_format,
1446                         int flags, size_t cluster_size, int prealloc,
1447                         QEMUOptionParameter *options, int version,
1448                         Error **errp)
1449{
1450    /* Calculate cluster_bits */
1451    int cluster_bits;
1452    cluster_bits = ffs(cluster_size) - 1;
1453    if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
1454        (1 << cluster_bits) != cluster_size)
1455    {
1456        error_setg(errp, "Cluster size must be a power of two between %d and "
1457                   "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
1458        return -EINVAL;
1459    }
1460
1461    /*
1462     * Open the image file and write a minimal qcow2 header.
1463     *
1464     * We keep things simple and start with a zero-sized image. We also
1465     * do without refcount blocks or a L1 table for now. We'll fix the
1466     * inconsistency later.
1467     *
1468     * We do need a refcount table because growing the refcount table means
1469     * allocating two new refcount blocks - the seconds of which would be at
1470     * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
1471     * size for any qcow2 image.
1472     */
1473    BlockDriverState* bs;
1474    QCowHeader header;
1475    uint8_t* refcount_table;
1476    Error *local_err = NULL;
1477    int ret;
1478
1479    ret = bdrv_create_file(filename, options, &local_err);
1480    if (ret < 0) {
1481        error_propagate(errp, local_err);
1482        return ret;
1483    }
1484
1485    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err);
1486    if (ret < 0) {
1487        error_propagate(errp, local_err);
1488        return ret;
1489    }
1490
1491    /* Write the header */
1492    memset(&header, 0, sizeof(header));
1493    header.magic = cpu_to_be32(QCOW_MAGIC);
1494    header.version = cpu_to_be32(version);
1495    header.cluster_bits = cpu_to_be32(cluster_bits);
1496    header.size = cpu_to_be64(0);
1497    header.l1_table_offset = cpu_to_be64(0);
1498    header.l1_size = cpu_to_be32(0);
1499    header.refcount_table_offset = cpu_to_be64(cluster_size);
1500    header.refcount_table_clusters = cpu_to_be32(1);
1501    header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT);
1502    header.header_length = cpu_to_be32(sizeof(header));
1503
1504    if (flags & BLOCK_FLAG_ENCRYPT) {
1505        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
1506    } else {
1507        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1508    }
1509
1510    if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) {
1511        header.compatible_features |=
1512            cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
1513    }
1514
1515    ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
1516    if (ret < 0) {
1517        error_setg_errno(errp, -ret, "Could not write qcow2 header");
1518        goto out;
1519    }
1520
1521    /* Write an empty refcount table */
1522    refcount_table = g_malloc0(cluster_size);
1523    ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size);
1524    g_free(refcount_table);
1525
1526    if (ret < 0) {
1527        error_setg_errno(errp, -ret, "Could not write refcount table");
1528        goto out;
1529    }
1530
1531    bdrv_close(bs);
1532
1533    /*
1534     * And now open the image and make it consistent first (i.e. increase the
1535     * refcount of the cluster that is occupied by the header and the refcount
1536     * table)
1537     */
1538    BlockDriver* drv = bdrv_find_format("qcow2");
1539    assert(drv != NULL);
1540    ret = bdrv_open(bs, filename, NULL,
1541        BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv, &local_err);
1542    if (ret < 0) {
1543        error_propagate(errp, local_err);
1544        goto out;
1545    }
1546
1547    ret = qcow2_alloc_clusters(bs, 2 * cluster_size);
1548    if (ret < 0) {
1549        error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 "
1550                         "header and refcount table");
1551        goto out;
1552
1553    } else if (ret != 0) {
1554        error_report("Huh, first cluster in empty image is already in use?");
1555        abort();
1556    }
1557
1558    /* Okay, now that we have a valid image, let's give it the right size */
1559    ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE);
1560    if (ret < 0) {
1561        error_setg_errno(errp, -ret, "Could not resize image");
1562        goto out;
1563    }
1564
1565    /* Want a backing file? There you go.*/
1566    if (backing_file) {
1567        ret = bdrv_change_backing_file(bs, backing_file, backing_format);
1568        if (ret < 0) {
1569            error_setg_errno(errp, -ret, "Could not assign backing file '%s' "
1570                             "with format '%s'", backing_file, backing_format);
1571            goto out;
1572        }
1573    }
1574
1575    /* And if we're supposed to preallocate metadata, do that now */
1576    if (prealloc) {
1577        BDRVQcowState *s = bs->opaque;
1578        qemu_co_mutex_lock(&s->lock);
1579        ret = preallocate(bs);
1580        qemu_co_mutex_unlock(&s->lock);
1581        if (ret < 0) {
1582            error_setg_errno(errp, -ret, "Could not preallocate metadata");
1583            goto out;
1584        }
1585    }
1586
1587    bdrv_close(bs);
1588
1589    /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning */
1590    ret = bdrv_open(bs, filename, NULL,
1591                    BDRV_O_RDWR | BDRV_O_CACHE_WB, drv, &local_err);
1592    if (error_is_set(&local_err)) {
1593        error_propagate(errp, local_err);
1594        goto out;
1595    }
1596
1597    ret = 0;
1598out:
1599    bdrv_unref(bs);
1600    return ret;
1601}
1602
1603static int qcow2_create(const char *filename, QEMUOptionParameter *options,
1604                        Error **errp)
1605{
1606    const char *backing_file = NULL;
1607    const char *backing_fmt = NULL;
1608    uint64_t sectors = 0;
1609    int flags = 0;
1610    size_t cluster_size = DEFAULT_CLUSTER_SIZE;
1611    int prealloc = 0;
1612    int version = 3;
1613    Error *local_err = NULL;
1614    int ret;
1615
1616    /* Read out options */
1617    while (options && options->name) {
1618        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1619            sectors = options->value.n / 512;
1620        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1621            backing_file = options->value.s;
1622        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
1623            backing_fmt = options->value.s;
1624        } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
1625            flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
1626        } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
1627            if (options->value.n) {
1628                cluster_size = options->value.n;
1629            }
1630        } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
1631            if (!options->value.s || !strcmp(options->value.s, "off")) {
1632                prealloc = 0;
1633            } else if (!strcmp(options->value.s, "metadata")) {
1634                prealloc = 1;
1635            } else {
1636                error_setg(errp, "Invalid preallocation mode: '%s'",
1637                           options->value.s);
1638                return -EINVAL;
1639            }
1640        } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) {
1641            if (!options->value.s) {
1642                /* keep the default */
1643            } else if (!strcmp(options->value.s, "0.10")) {
1644                version = 2;
1645            } else if (!strcmp(options->value.s, "1.1")) {
1646                version = 3;
1647            } else {
1648                error_setg(errp, "Invalid compatibility level: '%s'",
1649                           options->value.s);
1650                return -EINVAL;
1651            }
1652        } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
1653            flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0;
1654        }
1655        options++;
1656    }
1657
1658    if (backing_file && prealloc) {
1659        error_setg(errp, "Backing file and preallocation cannot be used at "
1660                   "the same time");
1661        return -EINVAL;
1662    }
1663
1664    if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) {
1665        error_setg(errp, "Lazy refcounts only supported with compatibility "
1666                   "level 1.1 and above (use compat=1.1 or greater)");
1667        return -EINVAL;
1668    }
1669
1670    ret = qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
1671                        cluster_size, prealloc, options, version, &local_err);
1672    if (error_is_set(&local_err)) {
1673        error_propagate(errp, local_err);
1674    }
1675    return ret;
1676}
1677
1678static int qcow2_make_empty(BlockDriverState *bs)
1679{
1680#if 0
1681    /* XXX: not correct */
1682    BDRVQcowState *s = bs->opaque;
1683    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
1684    int ret;
1685
1686    memset(s->l1_table, 0, l1_length);
1687    if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0)
1688        return -1;
1689    ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
1690    if (ret < 0)
1691        return ret;
1692
1693    l2_cache_reset(bs);
1694#endif
1695    return 0;
1696}
1697
1698static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
1699    int64_t sector_num, int nb_sectors)
1700{
1701    int ret;
1702    BDRVQcowState *s = bs->opaque;
1703
1704    /* Emulate misaligned zero writes */
1705    if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) {
1706        return -ENOTSUP;
1707    }
1708
1709    /* Whatever is left can use real zero clusters */
1710    qemu_co_mutex_lock(&s->lock);
1711    ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS,
1712        nb_sectors);
1713    qemu_co_mutex_unlock(&s->lock);
1714
1715    return ret;
1716}
1717
1718static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
1719    int64_t sector_num, int nb_sectors)
1720{
1721    int ret;
1722    BDRVQcowState *s = bs->opaque;
1723
1724    qemu_co_mutex_lock(&s->lock);
1725    ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
1726        nb_sectors, QCOW2_DISCARD_REQUEST);
1727    qemu_co_mutex_unlock(&s->lock);
1728    return ret;
1729}
1730
1731static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
1732{
1733    BDRVQcowState *s = bs->opaque;
1734    int64_t new_l1_size;
1735    int ret;
1736
1737    if (offset & 511) {
1738        error_report("The new size must be a multiple of 512");
1739        return -EINVAL;
1740    }
1741
1742    /* cannot proceed if image has snapshots */
1743    if (s->nb_snapshots) {
1744        error_report("Can't resize an image which has snapshots");
1745        return -ENOTSUP;
1746    }
1747
1748    /* shrinking is currently not supported */
1749    if (offset < bs->total_sectors * 512) {
1750        error_report("qcow2 doesn't support shrinking images yet");
1751        return -ENOTSUP;
1752    }
1753
1754    new_l1_size = size_to_l1(s, offset);
1755    ret = qcow2_grow_l1_table(bs, new_l1_size, true);
1756    if (ret < 0) {
1757        return ret;
1758    }
1759
1760    /* write updated header.size */
1761    offset = cpu_to_be64(offset);
1762    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
1763                           &offset, sizeof(uint64_t));
1764    if (ret < 0) {
1765        return ret;
1766    }
1767
1768    s->l1_vm_state_index = new_l1_size;
1769    return 0;
1770}
1771
1772/* XXX: put compressed sectors first, then all the cluster aligned
1773   tables to avoid losing bytes in alignment */
1774static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
1775                                  const uint8_t *buf, int nb_sectors)
1776{
1777    BDRVQcowState *s = bs->opaque;
1778    z_stream strm;
1779    int ret, out_len;
1780    uint8_t *out_buf;
1781    uint64_t cluster_offset;
1782
1783    if (nb_sectors == 0) {
1784        /* align end of file to a sector boundary to ease reading with
1785           sector based I/Os */
1786        cluster_offset = bdrv_getlength(bs->file);
1787        cluster_offset = (cluster_offset + 511) & ~511;
1788        bdrv_truncate(bs->file, cluster_offset);
1789        return 0;
1790    }
1791
1792    if (nb_sectors != s->cluster_sectors) {
1793        ret = -EINVAL;
1794
1795        /* Zero-pad last write if image size is not cluster aligned */
1796        if (sector_num + nb_sectors == bs->total_sectors &&
1797            nb_sectors < s->cluster_sectors) {
1798            uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
1799            memset(pad_buf, 0, s->cluster_size);
1800            memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
1801            ret = qcow2_write_compressed(bs, sector_num,
1802                                         pad_buf, s->cluster_sectors);
1803            qemu_vfree(pad_buf);
1804        }
1805        return ret;
1806    }
1807
1808    out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
1809
1810    /* best compression, small window, no zlib header */
1811    memset(&strm, 0, sizeof(strm));
1812    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
1813                       Z_DEFLATED, -12,
1814                       9, Z_DEFAULT_STRATEGY);
1815    if (ret != 0) {
1816        ret = -EINVAL;
1817        goto fail;
1818    }
1819
1820    strm.avail_in = s->cluster_size;
1821    strm.next_in = (uint8_t *)buf;
1822    strm.avail_out = s->cluster_size;
1823    strm.next_out = out_buf;
1824
1825    ret = deflate(&strm, Z_FINISH);
1826    if (ret != Z_STREAM_END && ret != Z_OK) {
1827        deflateEnd(&strm);
1828        ret = -EINVAL;
1829        goto fail;
1830    }
1831    out_len = strm.next_out - out_buf;
1832
1833    deflateEnd(&strm);
1834
1835    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
1836        /* could not compress: write normal cluster */
1837        ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
1838        if (ret < 0) {
1839            goto fail;
1840        }
1841    } else {
1842        cluster_offset = qcow2_alloc_compressed_cluster_offset(bs,
1843            sector_num << 9, out_len);
1844        if (!cluster_offset) {
1845            ret = -EIO;
1846            goto fail;
1847        }
1848        cluster_offset &= s->cluster_offset_mask;
1849
1850        ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len);
1851        if (ret < 0) {
1852            goto fail;
1853        }
1854
1855        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
1856        ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
1857        if (ret < 0) {
1858            goto fail;
1859        }
1860    }
1861
1862    ret = 0;
1863fail:
1864    g_free(out_buf);
1865    return ret;
1866}
1867
1868static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
1869{
1870    BDRVQcowState *s = bs->opaque;
1871    int ret;
1872
1873    qemu_co_mutex_lock(&s->lock);
1874    ret = qcow2_cache_flush(bs, s->l2_table_cache);
1875    if (ret < 0) {
1876        qemu_co_mutex_unlock(&s->lock);
1877        return ret;
1878    }
1879
1880    if (qcow2_need_accurate_refcounts(s)) {
1881        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
1882        if (ret < 0) {
1883            qemu_co_mutex_unlock(&s->lock);
1884            return ret;
1885        }
1886    }
1887    qemu_co_mutex_unlock(&s->lock);
1888
1889    return 0;
1890}
1891
1892static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1893{
1894    BDRVQcowState *s = bs->opaque;
1895    bdi->cluster_size = s->cluster_size;
1896    bdi->vm_state_offset = qcow2_vm_state_offset(s);
1897    return 0;
1898}
1899
1900static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)
1901{
1902    BDRVQcowState *s = bs->opaque;
1903    ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1);
1904
1905    *spec_info = (ImageInfoSpecific){
1906        .kind  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
1907        {
1908            .qcow2 = g_new(ImageInfoSpecificQCow2, 1),
1909        },
1910    };
1911    if (s->qcow_version == 2) {
1912        *spec_info->qcow2 = (ImageInfoSpecificQCow2){
1913            .compat = g_strdup("0.10"),
1914        };
1915    } else if (s->qcow_version == 3) {
1916        *spec_info->qcow2 = (ImageInfoSpecificQCow2){
1917            .compat             = g_strdup("1.1"),
1918            .lazy_refcounts     = s->compatible_features &
1919                                  QCOW2_COMPAT_LAZY_REFCOUNTS,
1920            .has_lazy_refcounts = true,
1921        };
1922    }
1923
1924    return spec_info;
1925}
1926
#if 0
/*
 * Debugging aid (compiled out): walk all clusters of the underlying file and
 * print one line per run of consecutive clusters sharing a refcount.
 *
 * NOTE(review): the first number printed is the index just past the run,
 * not the index where the run starts - confirm whether that is intended.
 */
static void dump_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int64_t file_size, cluster_count, cluster, run_start;
    int refcount;

    file_size = bdrv_getlength(bs->file);
    cluster_count = size_to_clusters(s, file_size);

    cluster = 0;
    while (cluster < cluster_count) {
        run_start = cluster;
        refcount = get_refcount(bs, cluster);

        /* extend the run while the refcount stays the same */
        for (cluster++;
             cluster < cluster_count && get_refcount(bs, cluster) == refcount;
             cluster++) {
        }

        printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", cluster, refcount,
               cluster - run_start);
    }
}
#endif
1947
1948static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
1949                              int64_t pos)
1950{
1951    BDRVQcowState *s = bs->opaque;
1952    int64_t total_sectors = bs->total_sectors;
1953    int growable = bs->growable;
1954    bool zero_beyond_eof = bs->zero_beyond_eof;
1955    int ret;
1956
1957    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
1958    bs->growable = 1;
1959    bs->zero_beyond_eof = false;
1960    ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
1961    bs->growable = growable;
1962    bs->zero_beyond_eof = zero_beyond_eof;
1963
1964    /* bdrv_co_do_writev will have increased the total_sectors value to include
1965     * the VM state - the VM state is however not an actual part of the block
1966     * device, therefore, we need to restore the old value. */
1967    bs->total_sectors = total_sectors;
1968
1969    return ret;
1970}
1971
1972static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1973                              int64_t pos, int size)
1974{
1975    BDRVQcowState *s = bs->opaque;
1976    int growable = bs->growable;
1977    bool zero_beyond_eof = bs->zero_beyond_eof;
1978    int ret;
1979
1980    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
1981    bs->growable = 1;
1982    bs->zero_beyond_eof = false;
1983    ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size);
1984    bs->growable = growable;
1985    bs->zero_beyond_eof = zero_beyond_eof;
1986
1987    return ret;
1988}
1989
1990/*
1991 * Downgrades an image's version. To achieve this, any incompatible features
1992 * have to be removed.
1993 */
1994static int qcow2_downgrade(BlockDriverState *bs, int target_version)
1995{
1996    BDRVQcowState *s = bs->opaque;
1997    int current_version = s->qcow_version;
1998    int ret;
1999
2000    if (target_version == current_version) {
2001        return 0;
2002    } else if (target_version > current_version) {
2003        return -EINVAL;
2004    } else if (target_version != 2) {
2005        return -EINVAL;
2006    }
2007
2008    if (s->refcount_order != 4) {
2009        /* we would have to convert the image to a refcount_order == 4 image
2010         * here; however, since qemu (at the time of writing this) does not
2011         * support anything different than 4 anyway, there is no point in doing
2012         * so right now; however, we should error out (if qemu supports this in
2013         * the future and this code has not been adapted) */
2014        error_report("qcow2_downgrade: Image refcount orders other than 4 are "
2015                     "currently not supported.");
2016        return -ENOTSUP;
2017    }
2018
2019    /* clear incompatible features */
2020    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
2021        ret = qcow2_mark_clean(bs);
2022        if (ret < 0) {
2023            return ret;
2024        }
2025    }
2026
2027    /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in
2028     * the first place; if that happens nonetheless, returning -ENOTSUP is the
2029     * best thing to do anyway */
2030
2031    if (s->incompatible_features) {
2032        return -ENOTSUP;
2033    }
2034
2035    /* since we can ignore compatible features, we can set them to 0 as well */
2036    s->compatible_features = 0;
2037    /* if lazy refcounts have been used, they have already been fixed through
2038     * clearing the dirty flag */
2039
2040    /* clearing autoclear features is trivial */
2041    s->autoclear_features = 0;
2042
2043    ret = qcow2_expand_zero_clusters(bs);
2044    if (ret < 0) {
2045        return ret;
2046    }
2047
2048    s->qcow_version = target_version;
2049    ret = qcow2_update_header(bs);
2050    if (ret < 0) {
2051        s->qcow_version = current_version;
2052        return ret;
2053    }
2054    return 0;
2055}
2056
2057static int qcow2_amend_options(BlockDriverState *bs,
2058                               QEMUOptionParameter *options)
2059{
2060    BDRVQcowState *s = bs->opaque;
2061    int old_version = s->qcow_version, new_version = old_version;
2062    uint64_t new_size = 0;
2063    const char *backing_file = NULL, *backing_format = NULL;
2064    bool lazy_refcounts = s->use_lazy_refcounts;
2065    int ret;
2066    int i;
2067
2068    for (i = 0; options[i].name; i++)
2069    {
2070        if (!options[i].assigned) {
2071            /* only change explicitly defined options */
2072            continue;
2073        }
2074
2075        if (!strcmp(options[i].name, "compat")) {
2076            if (!options[i].value.s) {
2077                /* preserve default */
2078            } else if (!strcmp(options[i].value.s, "0.10")) {
2079                new_version = 2;
2080            } else if (!strcmp(options[i].value.s, "1.1")) {
2081                new_version = 3;
2082            } else {
2083                fprintf(stderr, "Unknown compatibility level %s.\n",
2084                        options[i].value.s);
2085                return -EINVAL;
2086            }
2087        } else if (!strcmp(options[i].name, "preallocation")) {
2088            fprintf(stderr, "Cannot change preallocation mode.\n");
2089            return -ENOTSUP;
2090        } else if (!strcmp(options[i].name, "size")) {
2091            new_size = options[i].value.n;
2092        } else if (!strcmp(options[i].name, "backing_file")) {
2093            backing_file = options[i].value.s;
2094        } else if (!strcmp(options[i].name, "backing_fmt")) {
2095            backing_format = options[i].value.s;
2096        } else if (!strcmp(options[i].name, "encryption")) {
2097            if ((options[i].value.n != !!s->crypt_method)) {
2098                fprintf(stderr, "Changing the encryption flag is not "
2099                        "supported.\n");
2100                return -ENOTSUP;
2101            }
2102        } else if (!strcmp(options[i].name, "cluster_size")) {
2103            if (options[i].value.n != s->cluster_size) {
2104                fprintf(stderr, "Changing the cluster size is not "
2105                        "supported.\n");
2106                return -ENOTSUP;
2107            }
2108        } else if (!strcmp(options[i].name, "lazy_refcounts")) {
2109            lazy_refcounts = options[i].value.n;
2110        } else {
2111            /* if this assertion fails, this probably means a new option was
2112             * added without having it covered here */
2113            assert(false);
2114        }
2115    }
2116
2117    if (new_version != old_version) {
2118        if (new_version > old_version) {
2119            /* Upgrade */
2120            s->qcow_version = new_version;
2121            ret = qcow2_update_header(bs);
2122            if (ret < 0) {
2123                s->qcow_version = old_version;
2124                return ret;
2125            }
2126        } else {
2127            ret = qcow2_downgrade(bs, new_version);
2128            if (ret < 0) {
2129                return ret;
2130            }
2131        }
2132    }
2133
2134    if (backing_file || backing_format) {
2135        ret = qcow2_change_backing_file(bs, backing_file ?: bs->backing_file,
2136                                        backing_format ?: bs->backing_format);
2137        if (ret < 0) {
2138            return ret;
2139        }
2140    }
2141
2142    if (s->use_lazy_refcounts != lazy_refcounts) {
2143        if (lazy_refcounts) {
2144            if (s->qcow_version < 3) {
2145                fprintf(stderr, "Lazy refcounts only supported with compatibility "
2146                        "level 1.1 and above (use compat=1.1 or greater)\n");
2147                return -EINVAL;
2148            }
2149            s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
2150            ret = qcow2_update_header(bs);
2151            if (ret < 0) {
2152                s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
2153                return ret;
2154            }
2155            s->use_lazy_refcounts = true;
2156        } else {
2157            /* make image clean first */
2158            ret = qcow2_mark_clean(bs);
2159            if (ret < 0) {
2160                return ret;
2161            }
2162            /* now disallow lazy refcounts */
2163            s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
2164            ret = qcow2_update_header(bs);
2165            if (ret < 0) {
2166                s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
2167                return ret;
2168            }
2169            s->use_lazy_refcounts = false;
2170        }
2171    }
2172
2173    if (new_size) {
2174        ret = bdrv_truncate(bs, new_size);
2175        if (ret < 0) {
2176            return ret;
2177        }
2178    }
2179
2180    return 0;
2181}
2182
2183static QEMUOptionParameter qcow2_create_options[] = {
2184    {
2185        .name = BLOCK_OPT_SIZE,
2186        .type = OPT_SIZE,
2187        .help = "Virtual disk size"
2188    },
2189    {
2190        .name = BLOCK_OPT_COMPAT_LEVEL,
2191        .type = OPT_STRING,
2192        .help = "Compatibility level (0.10 or 1.1)"
2193    },
2194    {
2195        .name = BLOCK_OPT_BACKING_FILE,
2196        .type = OPT_STRING,
2197        .help = "File name of a base image"
2198    },
2199    {
2200        .name = BLOCK_OPT_BACKING_FMT,
2201        .type = OPT_STRING,
2202        .help = "Image format of the base image"
2203    },
2204    {
2205        .name = BLOCK_OPT_ENCRYPT,
2206        .type = OPT_FLAG,
2207        .help = "Encrypt the image"
2208    },
2209    {
2210        .name = BLOCK_OPT_CLUSTER_SIZE,
2211        .type = OPT_SIZE,
2212        .help = "qcow2 cluster size",
2213        .value = { .n = DEFAULT_CLUSTER_SIZE },
2214    },
2215    {
2216        .name = BLOCK_OPT_PREALLOC,
2217        .type = OPT_STRING,
2218        .help = "Preallocation mode (allowed values: off, metadata)"
2219    },
2220    {
2221        .name = BLOCK_OPT_LAZY_REFCOUNTS,
2222        .type = OPT_FLAG,
2223        .help = "Postpone refcount updates",
2224    },
2225    { NULL }
2226};
2227
2228static BlockDriver bdrv_qcow2 = {
2229    .format_name        = "qcow2",
2230    .instance_size      = sizeof(BDRVQcowState),
2231    .bdrv_probe         = qcow2_probe,
2232    .bdrv_open          = qcow2_open,
2233    .bdrv_close         = qcow2_close,
2234    .bdrv_reopen_prepare  = qcow2_reopen_prepare,
2235    .bdrv_create        = qcow2_create,
2236    .bdrv_has_zero_init = bdrv_has_zero_init_1,
2237    .bdrv_co_get_block_status = qcow2_co_get_block_status,
2238    .bdrv_set_key       = qcow2_set_key,
2239    .bdrv_make_empty    = qcow2_make_empty,
2240
2241    .bdrv_co_readv          = qcow2_co_readv,
2242    .bdrv_co_writev         = qcow2_co_writev,
2243    .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
2244
2245    .bdrv_co_write_zeroes   = qcow2_co_write_zeroes,
2246    .bdrv_co_discard        = qcow2_co_discard,
2247    .bdrv_truncate          = qcow2_truncate,
2248    .bdrv_write_compressed  = qcow2_write_compressed,
2249
2250    .bdrv_snapshot_create   = qcow2_snapshot_create,
2251    .bdrv_snapshot_goto     = qcow2_snapshot_goto,
2252    .bdrv_snapshot_delete   = qcow2_snapshot_delete,
2253    .bdrv_snapshot_list     = qcow2_snapshot_list,
2254    .bdrv_snapshot_load_tmp     = qcow2_snapshot_load_tmp,
2255    .bdrv_get_info      = qcow2_get_info,
2256    .bdrv_get_specific_info = qcow2_get_specific_info,
2257
2258    .bdrv_save_vmstate    = qcow2_save_vmstate,
2259    .bdrv_load_vmstate    = qcow2_load_vmstate,
2260
2261    .bdrv_change_backing_file   = qcow2_change_backing_file,
2262
2263    .bdrv_invalidate_cache      = qcow2_invalidate_cache,
2264
2265    .create_options = qcow2_create_options,
2266    .bdrv_check = qcow2_check,
2267    .bdrv_amend_options = qcow2_amend_options,
2268};
2269
2270static void bdrv_qcow2_init(void)
2271{
2272    bdrv_register(&bdrv_qcow2);
2273}
2274
2275block_init(bdrv_qcow2_init);
2276