qemu/block/qcow2-refcount.c
<<
>>
Prefs
   1/*
   2 * Block driver for the QCOW version 2 format
   3 *
   4 * Copyright (c) 2004-2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qapi/error.h"
  27#include "qemu-common.h"
  28#include "block/block_int.h"
  29#include "block/qcow2.h"
  30#include "qemu/range.h"
  31#include "qemu/bswap.h"
  32
  33static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
  34static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
  35                            int64_t offset, int64_t length, uint64_t addend,
  36                            bool decrease, enum qcow2_discard_type type);
  37
  38static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
  39static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
  40static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
  41static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
  42static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
  43static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
  44static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
  45
  46static void set_refcount_ro0(void *refcount_array, uint64_t index,
  47                             uint64_t value);
  48static void set_refcount_ro1(void *refcount_array, uint64_t index,
  49                             uint64_t value);
  50static void set_refcount_ro2(void *refcount_array, uint64_t index,
  51                             uint64_t value);
  52static void set_refcount_ro3(void *refcount_array, uint64_t index,
  53                             uint64_t value);
  54static void set_refcount_ro4(void *refcount_array, uint64_t index,
  55                             uint64_t value);
  56static void set_refcount_ro5(void *refcount_array, uint64_t index,
  57                             uint64_t value);
  58static void set_refcount_ro6(void *refcount_array, uint64_t index,
  59                             uint64_t value);
  60
  61
  62static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
  63    &get_refcount_ro0,
  64    &get_refcount_ro1,
  65    &get_refcount_ro2,
  66    &get_refcount_ro3,
  67    &get_refcount_ro4,
  68    &get_refcount_ro5,
  69    &get_refcount_ro6
  70};
  71
  72static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
  73    &set_refcount_ro0,
  74    &set_refcount_ro1,
  75    &set_refcount_ro2,
  76    &set_refcount_ro3,
  77    &set_refcount_ro4,
  78    &set_refcount_ro5,
  79    &set_refcount_ro6
  80};
  81
  82
  83/*********************************************************/
  84/* refcount handling */
  85
  86int qcow2_refcount_init(BlockDriverState *bs)
  87{
  88    BDRVQcow2State *s = bs->opaque;
  89    unsigned int refcount_table_size2, i;
  90    int ret;
  91
  92    assert(s->refcount_order >= 0 && s->refcount_order <= 6);
  93
  94    s->get_refcount = get_refcount_funcs[s->refcount_order];
  95    s->set_refcount = set_refcount_funcs[s->refcount_order];
  96
  97    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
  98    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
  99    s->refcount_table = g_try_malloc(refcount_table_size2);
 100
 101    if (s->refcount_table_size > 0) {
 102        if (s->refcount_table == NULL) {
 103            ret = -ENOMEM;
 104            goto fail;
 105        }
 106        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
 107        ret = bdrv_pread(bs->file->bs, s->refcount_table_offset,
 108                         s->refcount_table, refcount_table_size2);
 109        if (ret < 0) {
 110            goto fail;
 111        }
 112        for(i = 0; i < s->refcount_table_size; i++)
 113            be64_to_cpus(&s->refcount_table[i]);
 114    }
 115    return 0;
 116 fail:
 117    return ret;
 118}
 119
 120void qcow2_refcount_close(BlockDriverState *bs)
 121{
 122    BDRVQcow2State *s = bs->opaque;
 123    g_free(s->refcount_table);
 124}
 125
 126
 127static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
 128{
 129    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
 130}
 131
 132static void set_refcount_ro0(void *refcount_array, uint64_t index,
 133                             uint64_t value)
 134{
 135    assert(!(value >> 1));
 136    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
 137    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
 138}
 139
 140static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
 141{
 142    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
 143           & 0x3;
 144}
 145
 146static void set_refcount_ro1(void *refcount_array, uint64_t index,
 147                             uint64_t value)
 148{
 149    assert(!(value >> 2));
 150    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
 151    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
 152}
 153
 154static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
 155{
 156    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
 157           & 0xf;
 158}
 159
 160static void set_refcount_ro2(void *refcount_array, uint64_t index,
 161                             uint64_t value)
 162{
 163    assert(!(value >> 4));
 164    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
 165    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
 166}
 167
 168static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
 169{
 170    return ((const uint8_t *)refcount_array)[index];
 171}
 172
 173static void set_refcount_ro3(void *refcount_array, uint64_t index,
 174                             uint64_t value)
 175{
 176    assert(!(value >> 8));
 177    ((uint8_t *)refcount_array)[index] = value;
 178}
 179
 180static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
 181{
 182    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
 183}
 184
 185static void set_refcount_ro4(void *refcount_array, uint64_t index,
 186                             uint64_t value)
 187{
 188    assert(!(value >> 16));
 189    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
 190}
 191
 192static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
 193{
 194    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
 195}
 196
 197static void set_refcount_ro5(void *refcount_array, uint64_t index,
 198                             uint64_t value)
 199{
 200    assert(!(value >> 32));
 201    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
 202}
 203
 204static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
 205{
 206    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
 207}
 208
 209static void set_refcount_ro6(void *refcount_array, uint64_t index,
 210                             uint64_t value)
 211{
 212    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
 213}
 214
 215
 216static int load_refcount_block(BlockDriverState *bs,
 217                               int64_t refcount_block_offset,
 218                               void **refcount_block)
 219{
 220    BDRVQcow2State *s = bs->opaque;
 221    int ret;
 222
 223    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
 224    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
 225        refcount_block);
 226
 227    return ret;
 228}
 229
 230/*
 231 * Retrieves the refcount of the cluster given by its index and stores it in
 232 * *refcount. Returns 0 on success and -errno on failure.
 233 */
 234int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
 235                       uint64_t *refcount)
 236{
 237    BDRVQcow2State *s = bs->opaque;
 238    uint64_t refcount_table_index, block_index;
 239    int64_t refcount_block_offset;
 240    int ret;
 241    void *refcount_block;
 242
 243    refcount_table_index = cluster_index >> s->refcount_block_bits;
 244    if (refcount_table_index >= s->refcount_table_size) {
 245        *refcount = 0;
 246        return 0;
 247    }
 248    refcount_block_offset =
 249        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
 250    if (!refcount_block_offset) {
 251        *refcount = 0;
 252        return 0;
 253    }
 254
 255    if (offset_into_cluster(s, refcount_block_offset)) {
 256        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
 257                                " unaligned (reftable index: %#" PRIx64 ")",
 258                                refcount_block_offset, refcount_table_index);
 259        return -EIO;
 260    }
 261
 262    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
 263                          &refcount_block);
 264    if (ret < 0) {
 265        return ret;
 266    }
 267
 268    block_index = cluster_index & (s->refcount_block_size - 1);
 269    *refcount = s->get_refcount(refcount_block, block_index);
 270
 271    qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
 272
 273    return 0;
 274}
 275
 276/*
 277 * Rounds the refcount table size up to avoid growing the table for each single
 278 * refcount block that is allocated.
 279 */
 280static unsigned int next_refcount_table_size(BDRVQcow2State *s,
 281    unsigned int min_size)
 282{
 283    unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
 284    unsigned int refcount_table_clusters =
 285        MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
 286
 287    while (min_clusters > refcount_table_clusters) {
 288        refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
 289    }
 290
 291    return refcount_table_clusters << (s->cluster_bits - 3);
 292}
 293
 294
 295/* Checks if two offsets are described by the same refcount block */
 296static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
 297    uint64_t offset_b)
 298{
 299    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
 300    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
 301
 302    return (block_a == block_b);
 303}
 304
 305/*
 306 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 307 * (including growing the refcount table if needed).
 308 *
 309 * Returns 0 on success or -errno in error case
 310 */
 311static int alloc_refcount_block(BlockDriverState *bs,
 312                                int64_t cluster_index, void **refcount_block)
 313{
 314    BDRVQcow2State *s = bs->opaque;
 315    unsigned int refcount_table_index;
 316    int ret;
 317
 318    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
 319
 320    /* Find the refcount block for the given cluster */
 321    refcount_table_index = cluster_index >> s->refcount_block_bits;
 322
 323    if (refcount_table_index < s->refcount_table_size) {
 324
 325        uint64_t refcount_block_offset =
 326            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
 327
 328        /* If it's already there, we're done */
 329        if (refcount_block_offset) {
 330            if (offset_into_cluster(s, refcount_block_offset)) {
 331                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
 332                                        PRIx64 " unaligned (reftable index: "
 333                                        "%#x)", refcount_block_offset,
 334                                        refcount_table_index);
 335                return -EIO;
 336            }
 337
 338             return load_refcount_block(bs, refcount_block_offset,
 339                                        refcount_block);
 340        }
 341    }
 342
 343    /*
 344     * If we came here, we need to allocate something. Something is at least
 345     * a cluster for the new refcount block. It may also include a new refcount
 346     * table if the old refcount table is too small.
 347     *
 348     * Note that allocating clusters here needs some special care:
 349     *
 350     * - We can't use the normal qcow2_alloc_clusters(), it would try to
 351     *   increase the refcount and very likely we would end up with an endless
 352     *   recursion. Instead we must place the refcount blocks in a way that
 353     *   they can describe them themselves.
 354     *
 355     * - We need to consider that at this point we are inside update_refcounts
 356     *   and potentially doing an initial refcount increase. This means that
 357     *   some clusters have already been allocated by the caller, but their
 358     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
 359     *   need to return -EAGAIN to signal the caller that it needs to restart
 360     *   the search for free clusters.
 361     *
 362     * - alloc_clusters_noref and qcow2_free_clusters may load a different
 363     *   refcount block into the cache
 364     */
 365
 366    *refcount_block = NULL;
 367
 368    /* We write to the refcount table, so we might depend on L2 tables */
 369    ret = qcow2_cache_flush(bs, s->l2_table_cache);
 370    if (ret < 0) {
 371        return ret;
 372    }
 373
 374    /* Allocate the refcount block itself and mark it as used */
 375    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
 376    if (new_block < 0) {
 377        return new_block;
 378    }
 379
 380#ifdef DEBUG_ALLOC2
 381    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
 382        " at %" PRIx64 "\n",
 383        refcount_table_index, cluster_index << s->cluster_bits, new_block);
 384#endif
 385
 386    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
 387        /* Zero the new refcount block before updating it */
 388        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
 389                                    refcount_block);
 390        if (ret < 0) {
 391            goto fail_block;
 392        }
 393
 394        memset(*refcount_block, 0, s->cluster_size);
 395
 396        /* The block describes itself, need to update the cache */
 397        int block_index = (new_block >> s->cluster_bits) &
 398            (s->refcount_block_size - 1);
 399        s->set_refcount(*refcount_block, block_index, 1);
 400    } else {
 401        /* Described somewhere else. This can recurse at most twice before we
 402         * arrive at a block that describes itself. */
 403        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
 404                              QCOW2_DISCARD_NEVER);
 405        if (ret < 0) {
 406            goto fail_block;
 407        }
 408
 409        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 410        if (ret < 0) {
 411            goto fail_block;
 412        }
 413
 414        /* Initialize the new refcount block only after updating its refcount,
 415         * update_refcount uses the refcount cache itself */
 416        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
 417                                    refcount_block);
 418        if (ret < 0) {
 419            goto fail_block;
 420        }
 421
 422        memset(*refcount_block, 0, s->cluster_size);
 423    }
 424
 425    /* Now the new refcount block needs to be written to disk */
 426    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
 427    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block);
 428    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 429    if (ret < 0) {
 430        goto fail_block;
 431    }
 432
 433    /* If the refcount table is big enough, just hook the block up there */
 434    if (refcount_table_index < s->refcount_table_size) {
 435        uint64_t data64 = cpu_to_be64(new_block);
 436        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
 437        ret = bdrv_pwrite_sync(bs->file->bs,
 438            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
 439            &data64, sizeof(data64));
 440        if (ret < 0) {
 441            goto fail_block;
 442        }
 443
 444        s->refcount_table[refcount_table_index] = new_block;
 445
 446        /* The new refcount block may be where the caller intended to put its
 447         * data, so let it restart the search. */
 448        return -EAGAIN;
 449    }
 450
 451    qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
 452
 453    /*
 454     * If we come here, we need to grow the refcount table. Again, a new
 455     * refcount table needs some space and we can't simply allocate to avoid
 456     * endless recursion.
 457     *
 458     * Therefore let's grab new refcount blocks at the end of the image, which
 459     * will describe themselves and the new refcount table. This way we can
 460     * reference them only in the new table and do the switch to the new
 461     * refcount table at once without producing an inconsistent state in
 462     * between.
 463     */
 464    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
 465
 466    /* Calculate the number of refcount blocks needed so far; this will be the
 467     * basis for calculating the index of the first cluster used for the
 468     * self-describing refcount structures which we are about to create.
 469     *
 470     * Because we reached this point, there cannot be any refcount entries for
 471     * cluster_index or higher indices yet. However, because new_block has been
 472     * allocated to describe that cluster (and it will assume this role later
 473     * on), we cannot use that index; also, new_block may actually have a higher
 474     * cluster index than cluster_index, so it needs to be taken into account
 475     * here (and 1 needs to be added to its value because that cluster is used).
 476     */
 477    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
 478                                            (new_block >> s->cluster_bits) + 1),
 479                                        s->refcount_block_size);
 480
 481    if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
 482        return -EFBIG;
 483    }
 484
 485    /* And now we need at least one block more for the new metadata */
 486    uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
 487    uint64_t last_table_size;
 488    uint64_t blocks_clusters;
 489    do {
 490        uint64_t table_clusters =
 491            size_to_clusters(s, table_size * sizeof(uint64_t));
 492        blocks_clusters = 1 +
 493            ((table_clusters + s->refcount_block_size - 1)
 494            / s->refcount_block_size);
 495        uint64_t meta_clusters = table_clusters + blocks_clusters;
 496
 497        last_table_size = table_size;
 498        table_size = next_refcount_table_size(s, blocks_used +
 499            ((meta_clusters + s->refcount_block_size - 1)
 500            / s->refcount_block_size));
 501
 502    } while (last_table_size != table_size);
 503
 504#ifdef DEBUG_ALLOC2
 505    fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
 506        s->refcount_table_size, table_size);
 507#endif
 508
 509    /* Create the new refcount table and blocks */
 510    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
 511        s->cluster_size;
 512    uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
 513    uint64_t *new_table = g_try_new0(uint64_t, table_size);
 514    void *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size);
 515
 516    assert(table_size > 0 && blocks_clusters > 0);
 517    if (new_table == NULL || new_blocks == NULL) {
 518        ret = -ENOMEM;
 519        goto fail_table;
 520    }
 521
 522    /* Fill the new refcount table */
 523    memcpy(new_table, s->refcount_table,
 524        s->refcount_table_size * sizeof(uint64_t));
 525    new_table[refcount_table_index] = new_block;
 526
 527    int i;
 528    for (i = 0; i < blocks_clusters; i++) {
 529        new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
 530    }
 531
 532    /* Fill the refcount blocks */
 533    uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
 534    int block = 0;
 535    for (i = 0; i < table_clusters + blocks_clusters; i++) {
 536        s->set_refcount(new_blocks, block++, 1);
 537    }
 538
 539    /* Write refcount blocks to disk */
 540    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
 541    ret = bdrv_pwrite_sync(bs->file->bs, meta_offset, new_blocks,
 542        blocks_clusters * s->cluster_size);
 543    g_free(new_blocks);
 544    new_blocks = NULL;
 545    if (ret < 0) {
 546        goto fail_table;
 547    }
 548
 549    /* Write refcount table to disk */
 550    for(i = 0; i < table_size; i++) {
 551        cpu_to_be64s(&new_table[i]);
 552    }
 553
 554    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
 555    ret = bdrv_pwrite_sync(bs->file->bs, table_offset, new_table,
 556        table_size * sizeof(uint64_t));
 557    if (ret < 0) {
 558        goto fail_table;
 559    }
 560
 561    for(i = 0; i < table_size; i++) {
 562        be64_to_cpus(&new_table[i]);
 563    }
 564
 565    /* Hook up the new refcount table in the qcow2 header */
 566    struct QEMU_PACKED {
 567        uint64_t d64;
 568        uint32_t d32;
 569    } data;
 570    cpu_to_be64w(&data.d64, table_offset);
 571    cpu_to_be32w(&data.d32, table_clusters);
 572    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
 573    ret = bdrv_pwrite_sync(bs->file->bs,
 574                           offsetof(QCowHeader, refcount_table_offset),
 575                           &data, sizeof(data));
 576    if (ret < 0) {
 577        goto fail_table;
 578    }
 579
 580    /* And switch it in memory */
 581    uint64_t old_table_offset = s->refcount_table_offset;
 582    uint64_t old_table_size = s->refcount_table_size;
 583
 584    g_free(s->refcount_table);
 585    s->refcount_table = new_table;
 586    s->refcount_table_size = table_size;
 587    s->refcount_table_offset = table_offset;
 588
 589    /* Free old table. */
 590    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
 591                        QCOW2_DISCARD_OTHER);
 592
 593    ret = load_refcount_block(bs, new_block, refcount_block);
 594    if (ret < 0) {
 595        return ret;
 596    }
 597
 598    /* If we were trying to do the initial refcount update for some cluster
 599     * allocation, we might have used the same clusters to store newly
 600     * allocated metadata. Make the caller search some new space. */
 601    return -EAGAIN;
 602
 603fail_table:
 604    g_free(new_blocks);
 605    g_free(new_table);
 606fail_block:
 607    if (*refcount_block != NULL) {
 608        qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
 609    }
 610    return ret;
 611}
 612
 613void qcow2_process_discards(BlockDriverState *bs, int ret)
 614{
 615    BDRVQcow2State *s = bs->opaque;
 616    Qcow2DiscardRegion *d, *next;
 617
 618    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
 619        QTAILQ_REMOVE(&s->discards, d, next);
 620
 621        /* Discard is optional, ignore the return value */
 622        if (ret >= 0) {
 623            bdrv_discard(bs->file->bs,
 624                         d->offset >> BDRV_SECTOR_BITS,
 625                         d->bytes >> BDRV_SECTOR_BITS);
 626        }
 627
 628        g_free(d);
 629    }
 630}
 631
 632static void update_refcount_discard(BlockDriverState *bs,
 633                                    uint64_t offset, uint64_t length)
 634{
 635    BDRVQcow2State *s = bs->opaque;
 636    Qcow2DiscardRegion *d, *p, *next;
 637
 638    QTAILQ_FOREACH(d, &s->discards, next) {
 639        uint64_t new_start = MIN(offset, d->offset);
 640        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
 641
 642        if (new_end - new_start <= length + d->bytes) {
 643            /* There can't be any overlap, areas ending up here have no
 644             * references any more and therefore shouldn't get freed another
 645             * time. */
 646            assert(d->bytes + length == new_end - new_start);
 647            d->offset = new_start;
 648            d->bytes = new_end - new_start;
 649            goto found;
 650        }
 651    }
 652
 653    d = g_malloc(sizeof(*d));
 654    *d = (Qcow2DiscardRegion) {
 655        .bs     = bs,
 656        .offset = offset,
 657        .bytes  = length,
 658    };
 659    QTAILQ_INSERT_TAIL(&s->discards, d, next);
 660
 661found:
 662    /* Merge discard requests if they are adjacent now */
 663    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
 664        if (p == d
 665            || p->offset > d->offset + d->bytes
 666            || d->offset > p->offset + p->bytes)
 667        {
 668            continue;
 669        }
 670
 671        /* Still no overlap possible */
 672        assert(p->offset == d->offset + d->bytes
 673            || d->offset == p->offset + p->bytes);
 674
 675        QTAILQ_REMOVE(&s->discards, p, next);
 676        d->offset = MIN(d->offset, p->offset);
 677        d->bytes += p->bytes;
 678        g_free(p);
 679    }
 680}
 681
 682/* XXX: cache several refcount block clusters ? */
 683/* @addend is the absolute value of the addend; if @decrease is set, @addend
 684 * will be subtracted from the current refcount, otherwise it will be added */
 685static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
 686                                                   int64_t offset,
 687                                                   int64_t length,
 688                                                   uint64_t addend,
 689                                                   bool decrease,
 690                                                   enum qcow2_discard_type type)
 691{
 692    BDRVQcow2State *s = bs->opaque;
 693    int64_t start, last, cluster_offset;
 694    void *refcount_block = NULL;
 695    int64_t old_table_index = -1;
 696    int ret;
 697
 698#ifdef DEBUG_ALLOC2
 699    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
 700            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
 701            addend);
 702#endif
 703    if (length < 0) {
 704        return -EINVAL;
 705    } else if (length == 0) {
 706        return 0;
 707    }
 708
 709    if (decrease) {
 710        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
 711            s->l2_table_cache);
 712    }
 713
 714    start = start_of_cluster(s, offset);
 715    last = start_of_cluster(s, offset + length - 1);
 716    for(cluster_offset = start; cluster_offset <= last;
 717        cluster_offset += s->cluster_size)
 718    {
 719        int block_index;
 720        uint64_t refcount;
 721        int64_t cluster_index = cluster_offset >> s->cluster_bits;
 722        int64_t table_index = cluster_index >> s->refcount_block_bits;
 723
 724        /* Load the refcount block and allocate it if needed */
 725        if (table_index != old_table_index) {
 726            if (refcount_block) {
 727                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
 728            }
 729            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
 730            if (ret < 0) {
 731                goto fail;
 732            }
 733        }
 734        old_table_index = table_index;
 735
 736        qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
 737                                     refcount_block);
 738
 739        /* we can update the count and save it */
 740        block_index = cluster_index & (s->refcount_block_size - 1);
 741
 742        refcount = s->get_refcount(refcount_block, block_index);
 743        if (decrease ? (refcount - addend > refcount)
 744                     : (refcount + addend < refcount ||
 745                        refcount + addend > s->refcount_max))
 746        {
 747            ret = -EINVAL;
 748            goto fail;
 749        }
 750        if (decrease) {
 751            refcount -= addend;
 752        } else {
 753            refcount += addend;
 754        }
 755        if (refcount == 0 && cluster_index < s->free_cluster_index) {
 756            s->free_cluster_index = cluster_index;
 757        }
 758        s->set_refcount(refcount_block, block_index, refcount);
 759
 760        if (refcount == 0 && s->discard_passthrough[type]) {
 761            update_refcount_discard(bs, cluster_offset, s->cluster_size);
 762        }
 763    }
 764
 765    ret = 0;
 766fail:
 767    if (!s->cache_discards) {
 768        qcow2_process_discards(bs, ret);
 769    }
 770
 771    /* Write last changed block to disk */
 772    if (refcount_block) {
 773        qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
 774    }
 775
 776    /*
 777     * Try do undo any updates if an error is returned (This may succeed in
 778     * some cases like ENOSPC for allocating a new refcount block)
 779     */
 780    if (ret < 0) {
 781        int dummy;
 782        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
 783                                !decrease, QCOW2_DISCARD_NEVER);
 784        (void)dummy;
 785    }
 786
 787    return ret;
 788}
 789
 790/*
 791 * Increases or decreases the refcount of a given cluster.
 792 *
 793 * @addend is the absolute value of the addend; if @decrease is set, @addend
 794 * will be subtracted from the current refcount, otherwise it will be added.
 795 *
 796 * On success 0 is returned; on failure -errno is returned.
 797 */
 798int qcow2_update_cluster_refcount(BlockDriverState *bs,
 799                                  int64_t cluster_index,
 800                                  uint64_t addend, bool decrease,
 801                                  enum qcow2_discard_type type)
 802{
 803    BDRVQcow2State *s = bs->opaque;
 804    int ret;
 805
 806    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
 807                          decrease, type);
 808    if (ret < 0) {
 809        return ret;
 810    }
 811
 812    return 0;
 813}
 814
 815
 816
 817/*********************************************************/
 818/* cluster allocation functions */
 819
 820
 821
 822/* return < 0 if error */
 823static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
 824{
 825    BDRVQcow2State *s = bs->opaque;
 826    uint64_t i, nb_clusters, refcount;
 827    int ret;
 828
 829    /* We can't allocate clusters if they may still be queued for discard. */
 830    if (s->cache_discards) {
 831        qcow2_process_discards(bs, 0);
 832    }
 833
 834    nb_clusters = size_to_clusters(s, size);
 835retry:
 836    for(i = 0; i < nb_clusters; i++) {
 837        uint64_t next_cluster_index = s->free_cluster_index++;
 838        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
 839
 840        if (ret < 0) {
 841            return ret;
 842        } else if (refcount != 0) {
 843            goto retry;
 844        }
 845    }
 846
 847    /* Make sure that all offsets in the "allocated" range are representable
 848     * in an int64_t */
 849    if (s->free_cluster_index > 0 &&
 850        s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
 851    {
 852        return -EFBIG;
 853    }
 854
 855#ifdef DEBUG_ALLOC2
 856    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
 857            size,
 858            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
 859#endif
 860    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
 861}
 862
 863int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
 864{
 865    int64_t offset;
 866    int ret;
 867
 868    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
 869    do {
 870        offset = alloc_clusters_noref(bs, size);
 871        if (offset < 0) {
 872            return offset;
 873        }
 874
 875        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
 876    } while (ret == -EAGAIN);
 877
 878    if (ret < 0) {
 879        return ret;
 880    }
 881
 882    return offset;
 883}
 884
 885int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
 886                                int64_t nb_clusters)
 887{
 888    BDRVQcow2State *s = bs->opaque;
 889    uint64_t cluster_index, refcount;
 890    uint64_t i;
 891    int ret;
 892
 893    assert(nb_clusters >= 0);
 894    if (nb_clusters == 0) {
 895        return 0;
 896    }
 897
 898    do {
 899        /* Check how many clusters there are free */
 900        cluster_index = offset >> s->cluster_bits;
 901        for(i = 0; i < nb_clusters; i++) {
 902            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
 903            if (ret < 0) {
 904                return ret;
 905            } else if (refcount != 0) {
 906                break;
 907            }
 908        }
 909
 910        /* And then allocate them */
 911        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
 912                              QCOW2_DISCARD_NEVER);
 913    } while (ret == -EAGAIN);
 914
 915    if (ret < 0) {
 916        return ret;
 917    }
 918
 919    return i;
 920}
 921
 922/* only used to allocate compressed sectors. We try to allocate
 923   contiguous sectors. size must be <= cluster_size */
 924int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
 925{
 926    BDRVQcow2State *s = bs->opaque;
 927    int64_t offset;
 928    size_t free_in_cluster;
 929    int ret;
 930
 931    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
 932    assert(size > 0 && size <= s->cluster_size);
 933    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));
 934
 935    offset = s->free_byte_offset;
 936
 937    if (offset) {
 938        uint64_t refcount;
 939        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
 940        if (ret < 0) {
 941            return ret;
 942        }
 943
 944        if (refcount == s->refcount_max) {
 945            offset = 0;
 946        }
 947    }
 948
 949    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
 950    do {
 951        if (!offset || free_in_cluster < size) {
 952            int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
 953            if (new_cluster < 0) {
 954                return new_cluster;
 955            }
 956
 957            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
 958                offset = new_cluster;
 959                free_in_cluster = s->cluster_size;
 960            } else {
 961                free_in_cluster += s->cluster_size;
 962            }
 963        }
 964
 965        assert(offset);
 966        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
 967        if (ret < 0) {
 968            offset = 0;
 969        }
 970    } while (ret == -EAGAIN);
 971    if (ret < 0) {
 972        return ret;
 973    }
 974
 975    /* The cluster refcount was incremented; refcount blocks must be flushed
 976     * before the caller's L2 table updates. */
 977    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
 978
 979    s->free_byte_offset = offset + size;
 980    if (!offset_into_cluster(s, s->free_byte_offset)) {
 981        s->free_byte_offset = 0;
 982    }
 983
 984    return offset;
 985}
 986
 987void qcow2_free_clusters(BlockDriverState *bs,
 988                          int64_t offset, int64_t size,
 989                          enum qcow2_discard_type type)
 990{
 991    int ret;
 992
 993    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
 994    ret = update_refcount(bs, offset, size, 1, true, type);
 995    if (ret < 0) {
 996        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
 997        /* TODO Remember the clusters to free them later and avoid leaking */
 998    }
 999}
1000
1001/*
1002 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1003 * normal cluster, compressed cluster, etc.)
1004 */
1005void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
1006                             int nb_clusters, enum qcow2_discard_type type)
1007{
1008    BDRVQcow2State *s = bs->opaque;
1009
1010    switch (qcow2_get_cluster_type(l2_entry)) {
1011    case QCOW2_CLUSTER_COMPRESSED:
1012        {
1013            int nb_csectors;
1014            nb_csectors = ((l2_entry >> s->csize_shift) &
1015                           s->csize_mask) + 1;
1016            qcow2_free_clusters(bs,
1017                (l2_entry & s->cluster_offset_mask) & ~511,
1018                nb_csectors * 512, type);
1019        }
1020        break;
1021    case QCOW2_CLUSTER_NORMAL:
1022    case QCOW2_CLUSTER_ZERO:
1023        if (l2_entry & L2E_OFFSET_MASK) {
1024            if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1025                qcow2_signal_corruption(bs, false, -1, -1,
1026                                        "Cannot free unaligned cluster %#llx",
1027                                        l2_entry & L2E_OFFSET_MASK);
1028            } else {
1029                qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1030                                    nb_clusters << s->cluster_bits, type);
1031            }
1032        }
1033        break;
1034    case QCOW2_CLUSTER_UNALLOCATED:
1035        break;
1036    default:
1037        abort();
1038    }
1039}
1040
1041
1042
1043/*********************************************************/
1044/* snapshots and image creation */
1045
1046
1047
1048/* update the refcounts of snapshots and the copied flag */
1049int qcow2_update_snapshot_refcount(BlockDriverState *bs,
1050    int64_t l1_table_offset, int l1_size, int addend)
1051{
1052    BDRVQcow2State *s = bs->opaque;
1053    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount;
1054    bool l1_allocated = false;
1055    int64_t old_offset, old_l2_offset;
1056    int i, j, l1_modified = 0, nb_csectors;
1057    int ret;
1058
1059    assert(addend >= -1 && addend <= 1);
1060
1061    l2_table = NULL;
1062    l1_table = NULL;
1063    l1_size2 = l1_size * sizeof(uint64_t);
1064
1065    s->cache_discards = true;
1066
1067    /* WARNING: qcow2_snapshot_goto relies on this function not using the
1068     * l1_table_offset when it is the current s->l1_table_offset! Be careful
1069     * when changing this! */
1070    if (l1_table_offset != s->l1_table_offset) {
1071        l1_table = g_try_malloc0(align_offset(l1_size2, 512));
1072        if (l1_size2 && l1_table == NULL) {
1073            ret = -ENOMEM;
1074            goto fail;
1075        }
1076        l1_allocated = true;
1077
1078        ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2);
1079        if (ret < 0) {
1080            goto fail;
1081        }
1082
1083        for(i = 0;i < l1_size; i++)
1084            be64_to_cpus(&l1_table[i]);
1085    } else {
1086        assert(l1_size == s->l1_size);
1087        l1_table = s->l1_table;
1088        l1_allocated = false;
1089    }
1090
1091    for(i = 0; i < l1_size; i++) {
1092        l2_offset = l1_table[i];
1093        if (l2_offset) {
1094            old_l2_offset = l2_offset;
1095            l2_offset &= L1E_OFFSET_MASK;
1096
1097            if (offset_into_cluster(s, l2_offset)) {
1098                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
1099                                        PRIx64 " unaligned (L1 index: %#x)",
1100                                        l2_offset, i);
1101                ret = -EIO;
1102                goto fail;
1103            }
1104
1105            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
1106                (void**) &l2_table);
1107            if (ret < 0) {
1108                goto fail;
1109            }
1110
1111            for(j = 0; j < s->l2_size; j++) {
1112                uint64_t cluster_index;
1113
1114                offset = be64_to_cpu(l2_table[j]);
1115                old_offset = offset;
1116                offset &= ~QCOW_OFLAG_COPIED;
1117
1118                switch (qcow2_get_cluster_type(offset)) {
1119                    case QCOW2_CLUSTER_COMPRESSED:
1120                        nb_csectors = ((offset >> s->csize_shift) &
1121                                       s->csize_mask) + 1;
1122                        if (addend != 0) {
1123                            ret = update_refcount(bs,
1124                                (offset & s->cluster_offset_mask) & ~511,
1125                                nb_csectors * 512, abs(addend), addend < 0,
1126                                QCOW2_DISCARD_SNAPSHOT);
1127                            if (ret < 0) {
1128                                goto fail;
1129                            }
1130                        }
1131                        /* compressed clusters are never modified */
1132                        refcount = 2;
1133                        break;
1134
1135                    case QCOW2_CLUSTER_NORMAL:
1136                    case QCOW2_CLUSTER_ZERO:
1137                        if (offset_into_cluster(s, offset & L2E_OFFSET_MASK)) {
1138                            qcow2_signal_corruption(bs, true, -1, -1, "Data "
1139                                                    "cluster offset %#llx "
1140                                                    "unaligned (L2 offset: %#"
1141                                                    PRIx64 ", L2 index: %#x)",
1142                                                    offset & L2E_OFFSET_MASK,
1143                                                    l2_offset, j);
1144                            ret = -EIO;
1145                            goto fail;
1146                        }
1147
1148                        cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
1149                        if (!cluster_index) {
1150                            /* unallocated */
1151                            refcount = 0;
1152                            break;
1153                        }
1154                        if (addend != 0) {
1155                            ret = qcow2_update_cluster_refcount(bs,
1156                                    cluster_index, abs(addend), addend < 0,
1157                                    QCOW2_DISCARD_SNAPSHOT);
1158                            if (ret < 0) {
1159                                goto fail;
1160                            }
1161                        }
1162
1163                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
1164                        if (ret < 0) {
1165                            goto fail;
1166                        }
1167                        break;
1168
1169                    case QCOW2_CLUSTER_UNALLOCATED:
1170                        refcount = 0;
1171                        break;
1172
1173                    default:
1174                        abort();
1175                }
1176
1177                if (refcount == 1) {
1178                    offset |= QCOW_OFLAG_COPIED;
1179                }
1180                if (offset != old_offset) {
1181                    if (addend > 0) {
1182                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
1183                            s->refcount_block_cache);
1184                    }
1185                    l2_table[j] = cpu_to_be64(offset);
1186                    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
1187                                                 l2_table);
1188                }
1189            }
1190
1191            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1192
1193            if (addend != 0) {
1194                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
1195                                                        s->cluster_bits,
1196                                                    abs(addend), addend < 0,
1197                                                    QCOW2_DISCARD_SNAPSHOT);
1198                if (ret < 0) {
1199                    goto fail;
1200                }
1201            }
1202            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1203                                     &refcount);
1204            if (ret < 0) {
1205                goto fail;
1206            } else if (refcount == 1) {
1207                l2_offset |= QCOW_OFLAG_COPIED;
1208            }
1209            if (l2_offset != old_l2_offset) {
1210                l1_table[i] = l2_offset;
1211                l1_modified = 1;
1212            }
1213        }
1214    }
1215
1216    ret = bdrv_flush(bs);
1217fail:
1218    if (l2_table) {
1219        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
1220    }
1221
1222    s->cache_discards = false;
1223    qcow2_process_discards(bs, ret);
1224
1225    /* Update L1 only if it isn't deleted anyway (addend = -1) */
1226    if (ret == 0 && addend >= 0 && l1_modified) {
1227        for (i = 0; i < l1_size; i++) {
1228            cpu_to_be64s(&l1_table[i]);
1229        }
1230
1231        ret = bdrv_pwrite_sync(bs->file->bs, l1_table_offset,
1232                               l1_table, l1_size2);
1233
1234        for (i = 0; i < l1_size; i++) {
1235            be64_to_cpus(&l1_table[i]);
1236        }
1237    }
1238    if (l1_allocated)
1239        g_free(l1_table);
1240    return ret;
1241}
1242
1243
1244
1245
1246/*********************************************************/
1247/* refcount checking functions */
1248
1249
1250static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1251{
1252    /* This assertion holds because there is no way we can address more than
1253     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1254     * offsets have to be representable in bytes); due to every cluster
1255     * corresponding to one refcount entry, we are well below that limit */
1256    assert(entries < (UINT64_C(1) << (64 - 9)));
1257
1258    /* Thanks to the assertion this will not overflow, because
1259     * s->refcount_order < 7.
1260     * (note: x << s->refcount_order == x * s->refcount_bits) */
1261    return DIV_ROUND_UP(entries << s->refcount_order, 8);
1262}
1263
1264/**
1265 * Reallocates *array so that it can hold new_size entries. *size must contain
1266 * the current number of entries in *array. If the reallocation fails, *array
1267 * and *size will not be modified and -errno will be returned. If the
1268 * reallocation is successful, *array will be set to the new buffer, *size
1269 * will be set to new_size and 0 will be returned. The size of the reallocated
1270 * refcount array buffer will be aligned to a cluster boundary, and the newly
1271 * allocated area will be zeroed.
1272 */
1273static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1274                                  int64_t *size, int64_t new_size)
1275{
1276    int64_t old_byte_size, new_byte_size;
1277    void *new_ptr;
1278
1279    /* Round to clusters so the array can be directly written to disk */
1280    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1281                    * s->cluster_size;
1282    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1283                    * s->cluster_size;
1284
1285    if (new_byte_size == old_byte_size) {
1286        *size = new_size;
1287        return 0;
1288    }
1289
1290    assert(new_byte_size > 0);
1291
1292    if (new_byte_size > SIZE_MAX) {
1293        return -ENOMEM;
1294    }
1295
1296    new_ptr = g_try_realloc(*array, new_byte_size);
1297    if (!new_ptr) {
1298        return -ENOMEM;
1299    }
1300
1301    if (new_byte_size > old_byte_size) {
1302        memset((char *)new_ptr + old_byte_size, 0,
1303               new_byte_size - old_byte_size);
1304    }
1305
1306    *array = new_ptr;
1307    *size  = new_size;
1308
1309    return 0;
1310}
1311
1312/*
1313 * Increases the refcount for a range of clusters in a given refcount table.
1314 * This is used to construct a temporary refcount table out of L1 and L2 tables
1315 * which can be compared to the refcount table saved in the image.
1316 *
1317 * Modifies the number of errors in res.
1318 */
1319static int inc_refcounts(BlockDriverState *bs,
1320                         BdrvCheckResult *res,
1321                         void **refcount_table,
1322                         int64_t *refcount_table_size,
1323                         int64_t offset, int64_t size)
1324{
1325    BDRVQcow2State *s = bs->opaque;
1326    uint64_t start, last, cluster_offset, k, refcount;
1327    int ret;
1328
1329    if (size <= 0) {
1330        return 0;
1331    }
1332
1333    start = start_of_cluster(s, offset);
1334    last = start_of_cluster(s, offset + size - 1);
1335    for(cluster_offset = start; cluster_offset <= last;
1336        cluster_offset += s->cluster_size) {
1337        k = cluster_offset >> s->cluster_bits;
1338        if (k >= *refcount_table_size) {
1339            ret = realloc_refcount_array(s, refcount_table,
1340                                         refcount_table_size, k + 1);
1341            if (ret < 0) {
1342                res->check_errors++;
1343                return ret;
1344            }
1345        }
1346
1347        refcount = s->get_refcount(*refcount_table, k);
1348        if (refcount == s->refcount_max) {
1349            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1350                    "\n", cluster_offset);
1351            fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1352                    "width or qemu-img convert to create a clean copy if the "
1353                    "image cannot be opened for writing\n");
1354            res->corruptions++;
1355            continue;
1356        }
1357        s->set_refcount(*refcount_table, k, refcount + 1);
1358    }
1359
1360    return 0;
1361}
1362
1363/* Flags for check_refcounts_l1() and check_refcounts_l2() */
1364enum {
1365    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1366};
1367
1368/*
1369 * Increases the refcount in the given refcount table for the all clusters
1370 * referenced in the L2 table. While doing so, performs some checks on L2
1371 * entries.
1372 *
1373 * Returns the number of errors found by the checks or -errno if an internal
1374 * error occurred.
1375 */
1376static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1377                              void **refcount_table,
1378                              int64_t *refcount_table_size, int64_t l2_offset,
1379                              int flags)
1380{
1381    BDRVQcow2State *s = bs->opaque;
1382    uint64_t *l2_table, l2_entry;
1383    uint64_t next_contiguous_offset = 0;
1384    int i, l2_size, nb_csectors, ret;
1385
1386    /* Read L2 table from disk */
1387    l2_size = s->l2_size * sizeof(uint64_t);
1388    l2_table = g_malloc(l2_size);
1389
1390    ret = bdrv_pread(bs->file->bs, l2_offset, l2_table, l2_size);
1391    if (ret < 0) {
1392        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1393        res->check_errors++;
1394        goto fail;
1395    }
1396
1397    /* Do the actual checks */
1398    for(i = 0; i < s->l2_size; i++) {
1399        l2_entry = be64_to_cpu(l2_table[i]);
1400
1401        switch (qcow2_get_cluster_type(l2_entry)) {
1402        case QCOW2_CLUSTER_COMPRESSED:
1403            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1404            if (l2_entry & QCOW_OFLAG_COPIED) {
1405                fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1406                    "copied flag must never be set for compressed "
1407                    "clusters\n", l2_entry >> s->cluster_bits);
1408                l2_entry &= ~QCOW_OFLAG_COPIED;
1409                res->corruptions++;
1410            }
1411
1412            /* Mark cluster as used */
1413            nb_csectors = ((l2_entry >> s->csize_shift) &
1414                           s->csize_mask) + 1;
1415            l2_entry &= s->cluster_offset_mask;
1416            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1417                                l2_entry & ~511, nb_csectors * 512);
1418            if (ret < 0) {
1419                goto fail;
1420            }
1421
1422            if (flags & CHECK_FRAG_INFO) {
1423                res->bfi.allocated_clusters++;
1424                res->bfi.compressed_clusters++;
1425
1426                /* Compressed clusters are fragmented by nature.  Since they
1427                 * take up sub-sector space but we only have sector granularity
1428                 * I/O we need to re-read the same sectors even for adjacent
1429                 * compressed clusters.
1430                 */
1431                res->bfi.fragmented_clusters++;
1432            }
1433            break;
1434
1435        case QCOW2_CLUSTER_ZERO:
1436            if ((l2_entry & L2E_OFFSET_MASK) == 0) {
1437                break;
1438            }
1439            /* fall through */
1440
1441        case QCOW2_CLUSTER_NORMAL:
1442        {
1443            uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1444
1445            if (flags & CHECK_FRAG_INFO) {
1446                res->bfi.allocated_clusters++;
1447                if (next_contiguous_offset &&
1448                    offset != next_contiguous_offset) {
1449                    res->bfi.fragmented_clusters++;
1450                }
1451                next_contiguous_offset = offset + s->cluster_size;
1452            }
1453
1454            /* Mark cluster as used */
1455            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1456                                offset, s->cluster_size);
1457            if (ret < 0) {
1458                goto fail;
1459            }
1460
1461            /* Correct offsets are cluster aligned */
1462            if (offset_into_cluster(s, offset)) {
1463                fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
1464                    "properly aligned; L2 entry corrupted.\n", offset);
1465                res->corruptions++;
1466            }
1467            break;
1468        }
1469
1470        case QCOW2_CLUSTER_UNALLOCATED:
1471            break;
1472
1473        default:
1474            abort();
1475        }
1476    }
1477
1478    g_free(l2_table);
1479    return 0;
1480
1481fail:
1482    g_free(l2_table);
1483    return ret;
1484}
1485
1486/*
1487 * Increases the refcount for the L1 table, its L2 tables and all referenced
1488 * clusters in the given refcount table. While doing so, performs some checks
1489 * on L1 and L2 entries.
1490 *
1491 * Returns the number of errors found by the checks or -errno if an internal
1492 * error occurred.
1493 */
1494static int check_refcounts_l1(BlockDriverState *bs,
1495                              BdrvCheckResult *res,
1496                              void **refcount_table,
1497                              int64_t *refcount_table_size,
1498                              int64_t l1_table_offset, int l1_size,
1499                              int flags)
1500{
1501    BDRVQcow2State *s = bs->opaque;
1502    uint64_t *l1_table = NULL, l2_offset, l1_size2;
1503    int i, ret;
1504
1505    l1_size2 = l1_size * sizeof(uint64_t);
1506
1507    /* Mark L1 table as used */
1508    ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1509                        l1_table_offset, l1_size2);
1510    if (ret < 0) {
1511        goto fail;
1512    }
1513
1514    /* Read L1 table entries from disk */
1515    if (l1_size2 > 0) {
1516        l1_table = g_try_malloc(l1_size2);
1517        if (l1_table == NULL) {
1518            ret = -ENOMEM;
1519            res->check_errors++;
1520            goto fail;
1521        }
1522        ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2);
1523        if (ret < 0) {
1524            fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1525            res->check_errors++;
1526            goto fail;
1527        }
1528        for(i = 0;i < l1_size; i++)
1529            be64_to_cpus(&l1_table[i]);
1530    }
1531
1532    /* Do the actual checks */
1533    for(i = 0; i < l1_size; i++) {
1534        l2_offset = l1_table[i];
1535        if (l2_offset) {
1536            /* Mark L2 table as used */
1537            l2_offset &= L1E_OFFSET_MASK;
1538            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1539                                l2_offset, s->cluster_size);
1540            if (ret < 0) {
1541                goto fail;
1542            }
1543
1544            /* L2 tables are cluster aligned */
1545            if (offset_into_cluster(s, l2_offset)) {
1546                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1547                    "cluster aligned; L1 entry corrupted\n", l2_offset);
1548                res->corruptions++;
1549            }
1550
1551            /* Process and check L2 entries */
1552            ret = check_refcounts_l2(bs, res, refcount_table,
1553                                     refcount_table_size, l2_offset, flags);
1554            if (ret < 0) {
1555                goto fail;
1556            }
1557        }
1558    }
1559    g_free(l1_table);
1560    return 0;
1561
1562fail:
1563    g_free(l1_table);
1564    return ret;
1565}
1566
1567/*
1568 * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1569 *
1570 * This function does not print an error message nor does it increment
1571 * check_errors if qcow2_get_refcount fails (this is because such an error will
1572 * have been already detected and sufficiently signaled by the calling function
1573 * (qcow2_check_refcounts) by the time this function is called).
1574 */
1575static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1576                              BdrvCheckMode fix)
1577{
1578    BDRVQcow2State *s = bs->opaque;
1579    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1580    int ret;
1581    uint64_t refcount;
1582    int i, j;
1583
1584    for (i = 0; i < s->l1_size; i++) {
1585        uint64_t l1_entry = s->l1_table[i];
1586        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1587        bool l2_dirty = false;
1588
1589        if (!l2_offset) {
1590            continue;
1591        }
1592
1593        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1594                                 &refcount);
1595        if (ret < 0) {
1596            /* don't print message nor increment check_errors */
1597            continue;
1598        }
1599        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1600            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1601                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1602                    fix & BDRV_FIX_ERRORS ? "Repairing" :
1603                                            "ERROR",
1604                    i, l1_entry, refcount);
1605            if (fix & BDRV_FIX_ERRORS) {
1606                s->l1_table[i] = refcount == 1
1607                               ? l1_entry |  QCOW_OFLAG_COPIED
1608                               : l1_entry & ~QCOW_OFLAG_COPIED;
1609                ret = qcow2_write_l1_entry(bs, i);
1610                if (ret < 0) {
1611                    res->check_errors++;
1612                    goto fail;
1613                }
1614                res->corruptions_fixed++;
1615            } else {
1616                res->corruptions++;
1617            }
1618        }
1619
1620        ret = bdrv_pread(bs->file->bs, l2_offset, l2_table,
1621                         s->l2_size * sizeof(uint64_t));
1622        if (ret < 0) {
1623            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1624                    strerror(-ret));
1625            res->check_errors++;
1626            goto fail;
1627        }
1628
1629        for (j = 0; j < s->l2_size; j++) {
1630            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1631            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1632            int cluster_type = qcow2_get_cluster_type(l2_entry);
1633
1634            if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
1635                ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
1636                ret = qcow2_get_refcount(bs,
1637                                         data_offset >> s->cluster_bits,
1638                                         &refcount);
1639                if (ret < 0) {
1640                    /* don't print message nor increment check_errors */
1641                    continue;
1642                }
1643                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1644                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1645                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1646                            fix & BDRV_FIX_ERRORS ? "Repairing" :
1647                                                    "ERROR",
1648                            l2_entry, refcount);
1649                    if (fix & BDRV_FIX_ERRORS) {
1650                        l2_table[j] = cpu_to_be64(refcount == 1
1651                                    ? l2_entry |  QCOW_OFLAG_COPIED
1652                                    : l2_entry & ~QCOW_OFLAG_COPIED);
1653                        l2_dirty = true;
1654                        res->corruptions_fixed++;
1655                    } else {
1656                        res->corruptions++;
1657                    }
1658                }
1659            }
1660        }
1661
1662        if (l2_dirty) {
1663            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1664                                                l2_offset, s->cluster_size);
1665            if (ret < 0) {
1666                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1667                        "overlap check failed: %s\n", strerror(-ret));
1668                res->check_errors++;
1669                goto fail;
1670            }
1671
1672            ret = bdrv_pwrite(bs->file->bs, l2_offset, l2_table,
1673                              s->cluster_size);
1674            if (ret < 0) {
1675                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1676                        strerror(-ret));
1677                res->check_errors++;
1678                goto fail;
1679            }
1680        }
1681    }
1682
1683    ret = 0;
1684
1685fail:
1686    qemu_vfree(l2_table);
1687    return ret;
1688}
1689
1690/*
1691 * Checks consistency of refblocks and accounts for each refblock in
1692 * *refcount_table.
1693 */
1694static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
1695                           BdrvCheckMode fix, bool *rebuild,
1696                           void **refcount_table, int64_t *nb_clusters)
1697{
1698    BDRVQcow2State *s = bs->opaque;
1699    int64_t i, size;
1700    int ret;
1701
1702    for(i = 0; i < s->refcount_table_size; i++) {
1703        uint64_t offset, cluster;
1704        offset = s->refcount_table[i];
1705        cluster = offset >> s->cluster_bits;
1706
1707        /* Refcount blocks are cluster aligned */
1708        if (offset_into_cluster(s, offset)) {
1709            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1710                "cluster aligned; refcount table entry corrupted\n", i);
1711            res->corruptions++;
1712            *rebuild = true;
1713            continue;
1714        }
1715
1716        if (cluster >= *nb_clusters) {
1717            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
1718                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
1719
1720            if (fix & BDRV_FIX_ERRORS) {
1721                int64_t new_nb_clusters;
1722
1723                if (offset > INT64_MAX - s->cluster_size) {
1724                    ret = -EINVAL;
1725                    goto resize_fail;
1726                }
1727
1728                ret = bdrv_truncate(bs->file->bs, offset + s->cluster_size);
1729                if (ret < 0) {
1730                    goto resize_fail;
1731                }
1732                size = bdrv_getlength(bs->file->bs);
1733                if (size < 0) {
1734                    ret = size;
1735                    goto resize_fail;
1736                }
1737
1738                new_nb_clusters = size_to_clusters(s, size);
1739                assert(new_nb_clusters >= *nb_clusters);
1740
1741                ret = realloc_refcount_array(s, refcount_table,
1742                                             nb_clusters, new_nb_clusters);
1743                if (ret < 0) {
1744                    res->check_errors++;
1745                    return ret;
1746                }
1747
1748                if (cluster >= *nb_clusters) {
1749                    ret = -EINVAL;
1750                    goto resize_fail;
1751                }
1752
1753                res->corruptions_fixed++;
1754                ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1755                                    offset, s->cluster_size);
1756                if (ret < 0) {
1757                    return ret;
1758                }
1759                /* No need to check whether the refcount is now greater than 1:
1760                 * This area was just allocated and zeroed, so it can only be
1761                 * exactly 1 after inc_refcounts() */
1762                continue;
1763
1764resize_fail:
1765                res->corruptions++;
1766                *rebuild = true;
1767                fprintf(stderr, "ERROR could not resize image: %s\n",
1768                        strerror(-ret));
1769            } else {
1770                res->corruptions++;
1771            }
1772            continue;
1773        }
1774
1775        if (offset != 0) {
1776            ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1777                                offset, s->cluster_size);
1778            if (ret < 0) {
1779                return ret;
1780            }
1781            if (s->get_refcount(*refcount_table, cluster) != 1) {
1782                fprintf(stderr, "ERROR refcount block %" PRId64
1783                        " refcount=%" PRIu64 "\n", i,
1784                        s->get_refcount(*refcount_table, cluster));
1785                res->corruptions++;
1786                *rebuild = true;
1787            }
1788        }
1789    }
1790
1791    return 0;
1792}
1793
1794/*
1795 * Calculates an in-memory refcount table.
1796 */
1797static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1798                               BdrvCheckMode fix, bool *rebuild,
1799                               void **refcount_table, int64_t *nb_clusters)
1800{
1801    BDRVQcow2State *s = bs->opaque;
1802    int64_t i;
1803    QCowSnapshot *sn;
1804    int ret;
1805
1806    if (!*refcount_table) {
1807        int64_t old_size = 0;
1808        ret = realloc_refcount_array(s, refcount_table,
1809                                     &old_size, *nb_clusters);
1810        if (ret < 0) {
1811            res->check_errors++;
1812            return ret;
1813        }
1814    }
1815
1816    /* header */
1817    ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1818                        0, s->cluster_size);
1819    if (ret < 0) {
1820        return ret;
1821    }
1822
1823    /* current L1 table */
1824    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1825                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1826    if (ret < 0) {
1827        return ret;
1828    }
1829
1830    /* snapshots */
1831    for (i = 0; i < s->nb_snapshots; i++) {
1832        sn = s->snapshots + i;
1833        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1834                                 sn->l1_table_offset, sn->l1_size, 0);
1835        if (ret < 0) {
1836            return ret;
1837        }
1838    }
1839    ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1840                        s->snapshots_offset, s->snapshots_size);
1841    if (ret < 0) {
1842        return ret;
1843    }
1844
1845    /* refcount data */
1846    ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1847                        s->refcount_table_offset,
1848                        s->refcount_table_size * sizeof(uint64_t));
1849    if (ret < 0) {
1850        return ret;
1851    }
1852
1853    return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
1854}
1855
1856/*
1857 * Compares the actual reference count for each cluster in the image against the
1858 * refcount as reported by the refcount structures on-disk.
1859 */
1860static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1861                              BdrvCheckMode fix, bool *rebuild,
1862                              int64_t *highest_cluster,
1863                              void *refcount_table, int64_t nb_clusters)
1864{
1865    BDRVQcow2State *s = bs->opaque;
1866    int64_t i;
1867    uint64_t refcount1, refcount2;
1868    int ret;
1869
1870    for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
1871        ret = qcow2_get_refcount(bs, i, &refcount1);
1872        if (ret < 0) {
1873            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
1874                    i, strerror(-ret));
1875            res->check_errors++;
1876            continue;
1877        }
1878
1879        refcount2 = s->get_refcount(refcount_table, i);
1880
1881        if (refcount1 > 0 || refcount2 > 0) {
1882            *highest_cluster = i;
1883        }
1884
1885        if (refcount1 != refcount2) {
1886            /* Check if we're allowed to fix the mismatch */
1887            int *num_fixed = NULL;
1888            if (refcount1 == 0) {
1889                *rebuild = true;
1890            } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
1891                num_fixed = &res->leaks_fixed;
1892            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
1893                num_fixed = &res->corruptions_fixed;
1894            }
1895
1896            fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
1897                    " reference=%" PRIu64 "\n",
1898                   num_fixed != NULL     ? "Repairing" :
1899                   refcount1 < refcount2 ? "ERROR" :
1900                                           "Leaked",
1901                   i, refcount1, refcount2);
1902
1903            if (num_fixed) {
1904                ret = update_refcount(bs, i << s->cluster_bits, 1,
1905                                      refcount_diff(refcount1, refcount2),
1906                                      refcount1 > refcount2,
1907                                      QCOW2_DISCARD_ALWAYS);
1908                if (ret >= 0) {
1909                    (*num_fixed)++;
1910                    continue;
1911                }
1912            }
1913
1914            /* And if we couldn't, print an error */
1915            if (refcount1 < refcount2) {
1916                res->corruptions++;
1917            } else {
1918                res->leaks++;
1919            }
1920        }
1921    }
1922}
1923
1924/*
1925 * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
1926 * the on-disk refcount structures.
1927 *
1928 * On input, *first_free_cluster tells where to start looking, and need not
1929 * actually be a free cluster; the returned offset will not be before that
1930 * cluster.  On output, *first_free_cluster points to the first gap found, even
1931 * if that gap was too small to be used as the returned offset.
1932 *
1933 * Note that *first_free_cluster is a cluster index whereas the return value is
1934 * an offset.
1935 */
1936static int64_t alloc_clusters_imrt(BlockDriverState *bs,
1937                                   int cluster_count,
1938                                   void **refcount_table,
1939                                   int64_t *imrt_nb_clusters,
1940                                   int64_t *first_free_cluster)
1941{
1942    BDRVQcow2State *s = bs->opaque;
1943    int64_t cluster = *first_free_cluster, i;
1944    bool first_gap = true;
1945    int contiguous_free_clusters;
1946    int ret;
1947
1948    /* Starting at *first_free_cluster, find a range of at least cluster_count
1949     * continuously free clusters */
1950    for (contiguous_free_clusters = 0;
1951         cluster < *imrt_nb_clusters &&
1952         contiguous_free_clusters < cluster_count;
1953         cluster++)
1954    {
1955        if (!s->get_refcount(*refcount_table, cluster)) {
1956            contiguous_free_clusters++;
1957            if (first_gap) {
1958                /* If this is the first free cluster found, update
1959                 * *first_free_cluster accordingly */
1960                *first_free_cluster = cluster;
1961                first_gap = false;
1962            }
1963        } else if (contiguous_free_clusters) {
1964            contiguous_free_clusters = 0;
1965        }
1966    }
1967
1968    /* If contiguous_free_clusters is greater than zero, it contains the number
1969     * of continuously free clusters until the current cluster; the first free
1970     * cluster in the current "gap" is therefore
1971     * cluster - contiguous_free_clusters */
1972
1973    /* If no such range could be found, grow the in-memory refcount table
1974     * accordingly to append free clusters at the end of the image */
1975    if (contiguous_free_clusters < cluster_count) {
1976        /* contiguous_free_clusters clusters are already empty at the image end;
1977         * we need cluster_count clusters; therefore, we have to allocate
1978         * cluster_count - contiguous_free_clusters new clusters at the end of
1979         * the image (which is the current value of cluster; note that cluster
1980         * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
1981         * the image end) */
1982        ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
1983                                     cluster + cluster_count
1984                                     - contiguous_free_clusters);
1985        if (ret < 0) {
1986            return ret;
1987        }
1988    }
1989
1990    /* Go back to the first free cluster */
1991    cluster -= contiguous_free_clusters;
1992    for (i = 0; i < cluster_count; i++) {
1993        s->set_refcount(*refcount_table, cluster + i, 1);
1994    }
1995
1996    return cluster << s->cluster_bits;
1997}
1998
1999/*
2000 * Creates a new refcount structure based solely on the in-memory information
2001 * given through *refcount_table. All necessary allocations will be reflected
2002 * in that array.
2003 *
2004 * On success, the old refcount structure is leaked (it will be covered by the
2005 * new refcount structure).
2006 */
2007static int rebuild_refcount_structure(BlockDriverState *bs,
2008                                      BdrvCheckResult *res,
2009                                      void **refcount_table,
2010                                      int64_t *nb_clusters)
2011{
2012    BDRVQcow2State *s = bs->opaque;
2013    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
2014    int64_t refblock_offset, refblock_start, refblock_index;
2015    uint32_t reftable_size = 0;
2016    uint64_t *on_disk_reftable = NULL;
2017    void *on_disk_refblock;
2018    int ret = 0;
2019    struct {
2020        uint64_t reftable_offset;
2021        uint32_t reftable_clusters;
2022    } QEMU_PACKED reftable_offset_and_clusters;
2023
2024    qcow2_cache_empty(bs, s->refcount_block_cache);
2025
2026write_refblocks:
2027    for (; cluster < *nb_clusters; cluster++) {
2028        if (!s->get_refcount(*refcount_table, cluster)) {
2029            continue;
2030        }
2031
2032        refblock_index = cluster >> s->refcount_block_bits;
2033        refblock_start = refblock_index << s->refcount_block_bits;
2034
2035        /* Don't allocate a cluster in a refblock already written to disk */
2036        if (first_free_cluster < refblock_start) {
2037            first_free_cluster = refblock_start;
2038        }
2039        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2040                                              nb_clusters, &first_free_cluster);
2041        if (refblock_offset < 0) {
2042            fprintf(stderr, "ERROR allocating refblock: %s\n",
2043                    strerror(-refblock_offset));
2044            res->check_errors++;
2045            ret = refblock_offset;
2046            goto fail;
2047        }
2048
2049        if (reftable_size <= refblock_index) {
2050            uint32_t old_reftable_size = reftable_size;
2051            uint64_t *new_on_disk_reftable;
2052
2053            reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
2054                                     s->cluster_size) / sizeof(uint64_t);
2055            new_on_disk_reftable = g_try_realloc(on_disk_reftable,
2056                                                 reftable_size *
2057                                                 sizeof(uint64_t));
2058            if (!new_on_disk_reftable) {
2059                res->check_errors++;
2060                ret = -ENOMEM;
2061                goto fail;
2062            }
2063            on_disk_reftable = new_on_disk_reftable;
2064
2065            memset(on_disk_reftable + old_reftable_size, 0,
2066                   (reftable_size - old_reftable_size) * sizeof(uint64_t));
2067
2068            /* The offset we have for the reftable is now no longer valid;
2069             * this will leak that range, but we can easily fix that by running
2070             * a leak-fixing check after this rebuild operation */
2071            reftable_offset = -1;
2072        }
2073        on_disk_reftable[refblock_index] = refblock_offset;
2074
2075        /* If this is apparently the last refblock (for now), try to squeeze the
2076         * reftable in */
2077        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
2078            reftable_offset < 0)
2079        {
2080            uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
2081                                                          sizeof(uint64_t));
2082            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2083                                                  refcount_table, nb_clusters,
2084                                                  &first_free_cluster);
2085            if (reftable_offset < 0) {
2086                fprintf(stderr, "ERROR allocating reftable: %s\n",
2087                        strerror(-reftable_offset));
2088                res->check_errors++;
2089                ret = reftable_offset;
2090                goto fail;
2091            }
2092        }
2093
2094        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2095                                            s->cluster_size);
2096        if (ret < 0) {
2097            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2098            goto fail;
2099        }
2100
2101        /* The size of *refcount_table is always cluster-aligned, therefore the
2102         * write operation will not overflow */
2103        on_disk_refblock = (void *)((char *) *refcount_table +
2104                                    refblock_index * s->cluster_size);
2105
2106        ret = bdrv_write(bs->file->bs, refblock_offset / BDRV_SECTOR_SIZE,
2107                         on_disk_refblock, s->cluster_sectors);
2108        if (ret < 0) {
2109            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2110            goto fail;
2111        }
2112
2113        /* Go to the end of this refblock */
2114        cluster = refblock_start + s->refcount_block_size - 1;
2115    }
2116
2117    if (reftable_offset < 0) {
2118        uint64_t post_refblock_start, reftable_clusters;
2119
2120        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
2121        reftable_clusters = size_to_clusters(s,
2122                                             reftable_size * sizeof(uint64_t));
2123        /* Not pretty but simple */
2124        if (first_free_cluster < post_refblock_start) {
2125            first_free_cluster = post_refblock_start;
2126        }
2127        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2128                                              refcount_table, nb_clusters,
2129                                              &first_free_cluster);
2130        if (reftable_offset < 0) {
2131            fprintf(stderr, "ERROR allocating reftable: %s\n",
2132                    strerror(-reftable_offset));
2133            res->check_errors++;
2134            ret = reftable_offset;
2135            goto fail;
2136        }
2137
2138        goto write_refblocks;
2139    }
2140
2141    assert(on_disk_reftable);
2142
2143    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2144        cpu_to_be64s(&on_disk_reftable[refblock_index]);
2145    }
2146
2147    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
2148                                        reftable_size * sizeof(uint64_t));
2149    if (ret < 0) {
2150        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2151        goto fail;
2152    }
2153
2154    assert(reftable_size < INT_MAX / sizeof(uint64_t));
2155    ret = bdrv_pwrite(bs->file->bs, reftable_offset, on_disk_reftable,
2156                      reftable_size * sizeof(uint64_t));
2157    if (ret < 0) {
2158        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2159        goto fail;
2160    }
2161
2162    /* Enter new reftable into the image header */
2163    cpu_to_be64w(&reftable_offset_and_clusters.reftable_offset,
2164                 reftable_offset);
2165    cpu_to_be32w(&reftable_offset_and_clusters.reftable_clusters,
2166                 size_to_clusters(s, reftable_size * sizeof(uint64_t)));
2167    ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader,
2168                                                  refcount_table_offset),
2169                           &reftable_offset_and_clusters,
2170                           sizeof(reftable_offset_and_clusters));
2171    if (ret < 0) {
2172        fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
2173        goto fail;
2174    }
2175
2176    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2177        be64_to_cpus(&on_disk_reftable[refblock_index]);
2178    }
2179    s->refcount_table = on_disk_reftable;
2180    s->refcount_table_offset = reftable_offset;
2181    s->refcount_table_size = reftable_size;
2182
2183    return 0;
2184
2185fail:
2186    g_free(on_disk_reftable);
2187    return ret;
2188}
2189
2190/*
2191 * Checks an image for refcount consistency.
2192 *
2193 * Returns 0 if no errors are found, the number of errors in case the image is
2194 * detected as corrupted, and -errno when an internal error occurred.
2195 */
2196int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2197                          BdrvCheckMode fix)
2198{
2199    BDRVQcow2State *s = bs->opaque;
2200    BdrvCheckResult pre_compare_res;
2201    int64_t size, highest_cluster, nb_clusters;
2202    void *refcount_table = NULL;
2203    bool rebuild = false;
2204    int ret;
2205
2206    size = bdrv_getlength(bs->file->bs);
2207    if (size < 0) {
2208        res->check_errors++;
2209        return size;
2210    }
2211
2212    nb_clusters = size_to_clusters(s, size);
2213    if (nb_clusters > INT_MAX) {
2214        res->check_errors++;
2215        return -EFBIG;
2216    }
2217
2218    res->bfi.total_clusters =
2219        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2220
2221    ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2222                              &nb_clusters);
2223    if (ret < 0) {
2224        goto fail;
2225    }
2226
2227    /* In case we don't need to rebuild the refcount structure (but want to fix
2228     * something), this function is immediately called again, in which case the
2229     * result should be ignored */
2230    pre_compare_res = *res;
2231    compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2232                      nb_clusters);
2233
2234    if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2235        BdrvCheckResult old_res = *res;
2236        int fresh_leaks = 0;
2237
2238        fprintf(stderr, "Rebuilding refcount structure\n");
2239        ret = rebuild_refcount_structure(bs, res, &refcount_table,
2240                                         &nb_clusters);
2241        if (ret < 0) {
2242            goto fail;
2243        }
2244
2245        res->corruptions = 0;
2246        res->leaks = 0;
2247
2248        /* Because the old reftable has been exchanged for a new one the
2249         * references have to be recalculated */
2250        rebuild = false;
2251        memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2252        ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2253                                  &nb_clusters);
2254        if (ret < 0) {
2255            goto fail;
2256        }
2257
2258        if (fix & BDRV_FIX_LEAKS) {
2259            /* The old refcount structures are now leaked, fix it; the result
2260             * can be ignored, aside from leaks which were introduced by
2261             * rebuild_refcount_structure() that could not be fixed */
2262            BdrvCheckResult saved_res = *res;
2263            *res = (BdrvCheckResult){ 0 };
2264
2265            compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2266                              &highest_cluster, refcount_table, nb_clusters);
2267            if (rebuild) {
2268                fprintf(stderr, "ERROR rebuilt refcount structure is still "
2269                        "broken\n");
2270            }
2271
2272            /* Any leaks accounted for here were introduced by
2273             * rebuild_refcount_structure() because that function has created a
2274             * new refcount structure from scratch */
2275            fresh_leaks = res->leaks;
2276            *res = saved_res;
2277        }
2278
2279        if (res->corruptions < old_res.corruptions) {
2280            res->corruptions_fixed += old_res.corruptions - res->corruptions;
2281        }
2282        if (res->leaks < old_res.leaks) {
2283            res->leaks_fixed += old_res.leaks - res->leaks;
2284        }
2285        res->leaks += fresh_leaks;
2286    } else if (fix) {
2287        if (rebuild) {
2288            fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2289            res->check_errors++;
2290            ret = -EIO;
2291            goto fail;
2292        }
2293
2294        if (res->leaks || res->corruptions) {
2295            *res = pre_compare_res;
2296            compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2297                              refcount_table, nb_clusters);
2298        }
2299    }
2300
2301    /* check OFLAG_COPIED */
2302    ret = check_oflag_copied(bs, res, fix);
2303    if (ret < 0) {
2304        goto fail;
2305    }
2306
2307    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2308    ret = 0;
2309
2310fail:
2311    g_free(refcount_table);
2312
2313    return ret;
2314}
2315
2316#define overlaps_with(ofs, sz) \
2317    ranges_overlap(offset, size, ofs, sz)
2318
2319/*
2320 * Checks if the given offset into the image file is actually free to use by
2321 * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2322 * i.e. a sanity check without relying on the refcount tables.
2323 *
2324 * The ign parameter specifies what checks not to perform (being a bitmask of
2325 * QCow2MetadataOverlap values), i.e., what sections to ignore.
2326 *
2327 * Returns:
2328 * - 0 if writing to this offset will not affect the mentioned metadata
2329 * - a positive QCow2MetadataOverlap value indicating one overlapping section
2330 * - a negative value (-errno) indicating an error while performing a check,
2331 *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
2332 */
2333int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2334                                 int64_t size)
2335{
2336    BDRVQcow2State *s = bs->opaque;
2337    int chk = s->overlap_check & ~ign;
2338    int i, j;
2339
2340    if (!size) {
2341        return 0;
2342    }
2343
2344    if (chk & QCOW2_OL_MAIN_HEADER) {
2345        if (offset < s->cluster_size) {
2346            return QCOW2_OL_MAIN_HEADER;
2347        }
2348    }
2349
2350    /* align range to test to cluster boundaries */
2351    size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
2352    offset = start_of_cluster(s, offset);
2353
2354    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2355        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
2356            return QCOW2_OL_ACTIVE_L1;
2357        }
2358    }
2359
2360    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2361        if (overlaps_with(s->refcount_table_offset,
2362            s->refcount_table_size * sizeof(uint64_t))) {
2363            return QCOW2_OL_REFCOUNT_TABLE;
2364        }
2365    }
2366
2367    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2368        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2369            return QCOW2_OL_SNAPSHOT_TABLE;
2370        }
2371    }
2372
2373    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2374        for (i = 0; i < s->nb_snapshots; i++) {
2375            if (s->snapshots[i].l1_size &&
2376                overlaps_with(s->snapshots[i].l1_table_offset,
2377                s->snapshots[i].l1_size * sizeof(uint64_t))) {
2378                return QCOW2_OL_INACTIVE_L1;
2379            }
2380        }
2381    }
2382
2383    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2384        for (i = 0; i < s->l1_size; i++) {
2385            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2386                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2387                s->cluster_size)) {
2388                return QCOW2_OL_ACTIVE_L2;
2389            }
2390        }
2391    }
2392
2393    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2394        for (i = 0; i < s->refcount_table_size; i++) {
2395            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2396                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2397                s->cluster_size)) {
2398                return QCOW2_OL_REFCOUNT_BLOCK;
2399            }
2400        }
2401    }
2402
2403    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2404        for (i = 0; i < s->nb_snapshots; i++) {
2405            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2406            uint32_t l1_sz  = s->snapshots[i].l1_size;
2407            uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
2408            uint64_t *l1 = g_try_malloc(l1_sz2);
2409            int ret;
2410
2411            if (l1_sz2 && l1 == NULL) {
2412                return -ENOMEM;
2413            }
2414
2415            ret = bdrv_pread(bs->file->bs, l1_ofs, l1, l1_sz2);
2416            if (ret < 0) {
2417                g_free(l1);
2418                return ret;
2419            }
2420
2421            for (j = 0; j < l1_sz; j++) {
2422                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2423                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2424                    g_free(l1);
2425                    return QCOW2_OL_INACTIVE_L2;
2426                }
2427            }
2428
2429            g_free(l1);
2430        }
2431    }
2432
2433    return 0;
2434}
2435
2436static const char *metadata_ol_names[] = {
2437    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
2438    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
2439    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
2440    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
2441    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
2442    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
2443    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
2444    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
2445};
2446
2447/*
2448 * First performs a check for metadata overlaps (through
2449 * qcow2_check_metadata_overlap); if that fails with a negative value (error
2450 * while performing a check), that value is returned. If an impending overlap
2451 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2452 * and -EIO returned.
2453 *
2454 * Returns 0 if there were neither overlaps nor errors while checking for
2455 * overlaps; or a negative value (-errno) on error.
2456 */
2457int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2458                                  int64_t size)
2459{
2460    int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2461
2462    if (ret < 0) {
2463        return ret;
2464    } else if (ret > 0) {
2465        int metadata_ol_bitnr = ctz32(ret);
2466        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2467
2468        qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2469                                "write on metadata (overlaps with %s)",
2470                                metadata_ol_names[metadata_ol_bitnr]);
2471        return -EIO;
2472    }
2473
2474    return 0;
2475}
2476
2477/* A pointer to a function of this type is given to walk_over_reftable(). That
2478 * function will create refblocks and pass them to a RefblockFinishOp once they
2479 * are completed (@refblock). @refblock_empty is set if the refblock is
2480 * completely empty.
2481 *
2482 * Along with the refblock, a corresponding reftable entry is passed, in the
2483 * reftable @reftable (which may be reallocated) at @reftable_index.
2484 *
2485 * @allocated should be set to true if a new cluster has been allocated.
2486 */
2487typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
2488                               uint64_t reftable_index, uint64_t *reftable_size,
2489                               void *refblock, bool refblock_empty,
2490                               bool *allocated, Error **errp);
2491
2492/**
2493 * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2494 * it is not empty) and inserts its offset into the new reftable. The size of
2495 * this new reftable is increased as required.
2496 */
2497static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2498                          uint64_t reftable_index, uint64_t *reftable_size,
2499                          void *refblock, bool refblock_empty, bool *allocated,
2500                          Error **errp)
2501{
2502    BDRVQcow2State *s = bs->opaque;
2503    int64_t offset;
2504
2505    if (!refblock_empty && reftable_index >= *reftable_size) {
2506        uint64_t *new_reftable;
2507        uint64_t new_reftable_size;
2508
2509        new_reftable_size = ROUND_UP(reftable_index + 1,
2510                                     s->cluster_size / sizeof(uint64_t));
2511        if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
2512            error_setg(errp,
2513                       "This operation would make the refcount table grow "
2514                       "beyond the maximum size supported by QEMU, aborting");
2515            return -ENOTSUP;
2516        }
2517
2518        new_reftable = g_try_realloc(*reftable, new_reftable_size *
2519                                                sizeof(uint64_t));
2520        if (!new_reftable) {
2521            error_setg(errp, "Failed to increase reftable buffer size");
2522            return -ENOMEM;
2523        }
2524
2525        memset(new_reftable + *reftable_size, 0,
2526               (new_reftable_size - *reftable_size) * sizeof(uint64_t));
2527
2528        *reftable      = new_reftable;
2529        *reftable_size = new_reftable_size;
2530    }
2531
2532    if (!refblock_empty && !(*reftable)[reftable_index]) {
2533        offset = qcow2_alloc_clusters(bs, s->cluster_size);
2534        if (offset < 0) {
2535            error_setg_errno(errp, -offset, "Failed to allocate refblock");
2536            return offset;
2537        }
2538        (*reftable)[reftable_index] = offset;
2539        *allocated = true;
2540    }
2541
2542    return 0;
2543}
2544
2545/**
2546 * This "operation" for walk_over_reftable() writes the refblock to disk at the
2547 * offset specified by the new reftable's entry. It does not modify the new
2548 * reftable or change any refcounts.
2549 */
2550static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2551                          uint64_t reftable_index, uint64_t *reftable_size,
2552                          void *refblock, bool refblock_empty, bool *allocated,
2553                          Error **errp)
2554{
2555    BDRVQcow2State *s = bs->opaque;
2556    int64_t offset;
2557    int ret;
2558
2559    if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2560        offset = (*reftable)[reftable_index];
2561
2562        ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
2563        if (ret < 0) {
2564            error_setg_errno(errp, -ret, "Overlap check failed");
2565            return ret;
2566        }
2567
2568        ret = bdrv_pwrite(bs->file->bs, offset, refblock, s->cluster_size);
2569        if (ret < 0) {
2570            error_setg_errno(errp, -ret, "Failed to write refblock");
2571            return ret;
2572        }
2573    } else {
2574        assert(refblock_empty);
2575    }
2576
2577    return 0;
2578}
2579
2580/**
2581 * This function walks over the existing reftable and every referenced refblock;
2582 * if @new_set_refcount is non-NULL, it is called for every refcount entry to
2583 * create an equal new entry in the passed @new_refblock. Once that
2584 * @new_refblock is completely filled, @operation will be called.
2585 *
2586 * @status_cb and @cb_opaque are used for the amend operation's status callback.
2587 * @index is the index of the walk_over_reftable() calls and @total is the total
2588 * number of walk_over_reftable() calls per amend operation. Both are used for
2589 * calculating the parameters for the status callback.
2590 *
2591 * @allocated is set to true if a new cluster has been allocated.
2592 */
2593static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
2594                              uint64_t *new_reftable_index,
2595                              uint64_t *new_reftable_size,
2596                              void *new_refblock, int new_refblock_size,
2597                              int new_refcount_bits,
2598                              RefblockFinishOp *operation, bool *allocated,
2599                              Qcow2SetRefcountFunc *new_set_refcount,
2600                              BlockDriverAmendStatusCB *status_cb,
2601                              void *cb_opaque, int index, int total,
2602                              Error **errp)
2603{
2604    BDRVQcow2State *s = bs->opaque;
2605    uint64_t reftable_index;
2606    bool new_refblock_empty = true;
2607    int refblock_index;
2608    int new_refblock_index = 0;
2609    int ret;
2610
2611    for (reftable_index = 0; reftable_index < s->refcount_table_size;
2612         reftable_index++)
2613    {
2614        uint64_t refblock_offset = s->refcount_table[reftable_index]
2615                                 & REFT_OFFSET_MASK;
2616
2617        status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
2618                  (uint64_t)total * s->refcount_table_size, cb_opaque);
2619
2620        if (refblock_offset) {
2621            void *refblock;
2622
2623            if (offset_into_cluster(s, refblock_offset)) {
2624                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
2625                                        PRIx64 " unaligned (reftable index: %#"
2626                                        PRIx64 ")", refblock_offset,
2627                                        reftable_index);
2628                error_setg(errp,
2629                           "Image is corrupt (unaligned refblock offset)");
2630                return -EIO;
2631            }
2632
2633            ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
2634                                  &refblock);
2635            if (ret < 0) {
2636                error_setg_errno(errp, -ret, "Failed to retrieve refblock");
2637                return ret;
2638            }
2639
2640            for (refblock_index = 0; refblock_index < s->refcount_block_size;
2641                 refblock_index++)
2642            {
2643                uint64_t refcount;
2644
2645                if (new_refblock_index >= new_refblock_size) {
2646                    /* new_refblock is now complete */
2647                    ret = operation(bs, new_reftable, *new_reftable_index,
2648                                    new_reftable_size, new_refblock,
2649                                    new_refblock_empty, allocated, errp);
2650                    if (ret < 0) {
2651                        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2652                        return ret;
2653                    }
2654
2655                    (*new_reftable_index)++;
2656                    new_refblock_index = 0;
2657                    new_refblock_empty = true;
2658                }
2659
2660                refcount = s->get_refcount(refblock, refblock_index);
2661                if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
2662                    uint64_t offset;
2663
2664                    qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2665
2666                    offset = ((reftable_index << s->refcount_block_bits)
2667                              + refblock_index) << s->cluster_bits;
2668
2669                    error_setg(errp, "Cannot decrease refcount entry width to "
2670                               "%i bits: Cluster at offset %#" PRIx64 " has a "
2671                               "refcount of %" PRIu64, new_refcount_bits,
2672                               offset, refcount);
2673                    return -EINVAL;
2674                }
2675
2676                if (new_set_refcount) {
2677                    new_set_refcount(new_refblock, new_refblock_index++,
2678                                     refcount);
2679                } else {
2680                    new_refblock_index++;
2681                }
2682                new_refblock_empty = new_refblock_empty && refcount == 0;
2683            }
2684
2685            qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2686        } else {
2687            /* No refblock means every refcount is 0 */
2688            for (refblock_index = 0; refblock_index < s->refcount_block_size;
2689                 refblock_index++)
2690            {
2691                if (new_refblock_index >= new_refblock_size) {
2692                    /* new_refblock is now complete */
2693                    ret = operation(bs, new_reftable, *new_reftable_index,
2694                                    new_reftable_size, new_refblock,
2695                                    new_refblock_empty, allocated, errp);
2696                    if (ret < 0) {
2697                        return ret;
2698                    }
2699
2700                    (*new_reftable_index)++;
2701                    new_refblock_index = 0;
2702                    new_refblock_empty = true;
2703                }
2704
2705                if (new_set_refcount) {
2706                    new_set_refcount(new_refblock, new_refblock_index++, 0);
2707                } else {
2708                    new_refblock_index++;
2709                }
2710            }
2711        }
2712    }
2713
2714    if (new_refblock_index > 0) {
2715        /* Complete the potentially existing partially filled final refblock */
2716        if (new_set_refcount) {
2717            for (; new_refblock_index < new_refblock_size;
2718                 new_refblock_index++)
2719            {
2720                new_set_refcount(new_refblock, new_refblock_index, 0);
2721            }
2722        }
2723
2724        ret = operation(bs, new_reftable, *new_reftable_index,
2725                        new_reftable_size, new_refblock, new_refblock_empty,
2726                        allocated, errp);
2727        if (ret < 0) {
2728            return ret;
2729        }
2730
2731        (*new_reftable_index)++;
2732    }
2733
2734    status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
2735              (uint64_t)total * s->refcount_table_size, cb_opaque);
2736
2737    return 0;
2738}
2739
2740int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
2741                                BlockDriverAmendStatusCB *status_cb,
2742                                void *cb_opaque, Error **errp)
2743{
2744    BDRVQcow2State *s = bs->opaque;
2745    Qcow2GetRefcountFunc *new_get_refcount;
2746    Qcow2SetRefcountFunc *new_set_refcount;
2747    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
2748    uint64_t *new_reftable = NULL, new_reftable_size = 0;
2749    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
2750    uint64_t new_reftable_index = 0;
2751    uint64_t i;
2752    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
2753    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
2754    int old_refcount_order;
2755    int walk_index = 0;
2756    int ret;
2757    bool new_allocation;
2758
2759    assert(s->qcow_version >= 3);
2760    assert(refcount_order >= 0 && refcount_order <= 6);
2761
2762    /* see qcow2_open() */
2763    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
2764
2765    new_get_refcount = get_refcount_funcs[refcount_order];
2766    new_set_refcount = set_refcount_funcs[refcount_order];
2767
2768
2769    do {
2770        int total_walks;
2771
2772        new_allocation = false;
2773
2774        /* At least we have to do this walk and the one which writes the
2775         * refblocks; also, at least we have to do this loop here at least
2776         * twice (normally), first to do the allocations, and second to
2777         * determine that everything is correctly allocated, this then makes
2778         * three walks in total */
2779        total_walks = MAX(walk_index + 2, 3);
2780
2781        /* First, allocate the structures so they are present in the refcount
2782         * structures */
2783        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
2784                                 &new_reftable_size, NULL, new_refblock_size,
2785                                 new_refcount_bits, &alloc_refblock,
2786                                 &new_allocation, NULL, status_cb, cb_opaque,
2787                                 walk_index++, total_walks, errp);
2788        if (ret < 0) {
2789            goto done;
2790        }
2791
2792        new_reftable_index = 0;
2793
2794        if (new_allocation) {
2795            if (new_reftable_offset) {
2796                qcow2_free_clusters(bs, new_reftable_offset,
2797                                    allocated_reftable_size * sizeof(uint64_t),
2798                                    QCOW2_DISCARD_NEVER);
2799            }
2800
2801            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
2802                                                           sizeof(uint64_t));
2803            if (new_reftable_offset < 0) {
2804                error_setg_errno(errp, -new_reftable_offset,
2805                                 "Failed to allocate the new reftable");
2806                ret = new_reftable_offset;
2807                goto done;
2808            }
2809            allocated_reftable_size = new_reftable_size;
2810        }
2811    } while (new_allocation);
2812
2813    /* Second, write the new refblocks */
2814    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
2815                             &new_reftable_size, new_refblock,
2816                             new_refblock_size, new_refcount_bits,
2817                             &flush_refblock, &new_allocation, new_set_refcount,
2818                             status_cb, cb_opaque, walk_index, walk_index + 1,
2819                             errp);
2820    if (ret < 0) {
2821        goto done;
2822    }
2823    assert(!new_allocation);
2824
2825
2826    /* Write the new reftable */
2827    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
2828                                        new_reftable_size * sizeof(uint64_t));
2829    if (ret < 0) {
2830        error_setg_errno(errp, -ret, "Overlap check failed");
2831        goto done;
2832    }
2833
2834    for (i = 0; i < new_reftable_size; i++) {
2835        cpu_to_be64s(&new_reftable[i]);
2836    }
2837
2838    ret = bdrv_pwrite(bs->file->bs, new_reftable_offset, new_reftable,
2839                      new_reftable_size * sizeof(uint64_t));
2840
2841    for (i = 0; i < new_reftable_size; i++) {
2842        be64_to_cpus(&new_reftable[i]);
2843    }
2844
2845    if (ret < 0) {
2846        error_setg_errno(errp, -ret, "Failed to write the new reftable");
2847        goto done;
2848    }
2849
2850
2851    /* Empty the refcount cache */
2852    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
2853    if (ret < 0) {
2854        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
2855        goto done;
2856    }
2857
2858    /* Update the image header to point to the new reftable; this only updates
2859     * the fields which are relevant to qcow2_update_header(); other fields
2860     * such as s->refcount_table or s->refcount_bits stay stale for now
2861     * (because we have to restore everything if qcow2_update_header() fails) */
2862    old_refcount_order  = s->refcount_order;
2863    old_reftable_size   = s->refcount_table_size;
2864    old_reftable_offset = s->refcount_table_offset;
2865
2866    s->refcount_order        = refcount_order;
2867    s->refcount_table_size   = new_reftable_size;
2868    s->refcount_table_offset = new_reftable_offset;
2869
2870    ret = qcow2_update_header(bs);
2871    if (ret < 0) {
2872        s->refcount_order        = old_refcount_order;
2873        s->refcount_table_size   = old_reftable_size;
2874        s->refcount_table_offset = old_reftable_offset;
2875        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
2876        goto done;
2877    }
2878
2879    /* Now update the rest of the in-memory information */
2880    old_reftable = s->refcount_table;
2881    s->refcount_table = new_reftable;
2882
2883    s->refcount_bits = 1 << refcount_order;
2884    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
2885    s->refcount_max += s->refcount_max - 1;
2886
2887    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
2888    s->refcount_block_size = 1 << s->refcount_block_bits;
2889
2890    s->get_refcount = new_get_refcount;
2891    s->set_refcount = new_set_refcount;
2892
2893    /* For cleaning up all old refblocks and the old reftable below the "done"
2894     * label */
2895    new_reftable        = old_reftable;
2896    new_reftable_size   = old_reftable_size;
2897    new_reftable_offset = old_reftable_offset;
2898
2899done:
2900    if (new_reftable) {
2901        /* On success, new_reftable actually points to the old reftable (and
2902         * new_reftable_size is the old reftable's size); but that is just
2903         * fine */
2904        for (i = 0; i < new_reftable_size; i++) {
2905            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
2906            if (offset) {
2907                qcow2_free_clusters(bs, offset, s->cluster_size,
2908                                    QCOW2_DISCARD_OTHER);
2909            }
2910        }
2911        g_free(new_reftable);
2912
2913        if (new_reftable_offset > 0) {
2914            qcow2_free_clusters(bs, new_reftable_offset,
2915                                new_reftable_size * sizeof(uint64_t),
2916                                QCOW2_DISCARD_OTHER);
2917        }
2918    }
2919
2920    qemu_vfree(new_refblock);
2921    return ret;
2922}
2923