qemu/block/qcow2-refcount.c
/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "block/block-io.h"
#include "qapi/error.h"
#include "qcow2.h"
#include "qemu/range.h"
#include "qemu/bswap.h"
#include "qemu/cutils.h"
#include "qemu/memalign.h"
#include "trace.h"

static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                    uint64_t max);

G_GNUC_WARN_UNUSED_RESULT
static int update_refcount(BlockDriverState *bs,
                           int64_t offset, int64_t length, uint64_t addend,
                           bool decrease, enum qcow2_discard_type type);

static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value);

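/*
 * Accessor dispatch tables, indexed by s->refcount_order (0..6): an entry
 * of order n is 2^n bits wide, so ro0 packs eight 1-bit refcounts into each
 * byte while ro6 uses a full big-endian uint64_t per refcount.
 */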
static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};

static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};


/*********************************************************/
/* refcount handling */

static void update_max_refcount_table_index(BDRVQcow2State *s)
{
    unsigned i = s->refcount_table_size - 1;
    while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
        i--;
    }
    /* Set s->max_refcount_table_index to the index of the last used entry */
    s->max_refcount_table_index = i;
}

int coroutine_fn qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    assert(s->refcount_order >= 0 && s->refcount_order <= 6);

    s->get_refcount = get_refcount_funcs[s->refcount_order];
    s->set_refcount = set_refcount_funcs[s->refcount_order];

    assert(s->refcount_table_size <= INT_MAX / REFTABLE_ENTRY_SIZE);
    refcount_table_size2 = s->refcount_table_size * REFTABLE_ENTRY_SIZE;
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_co_pread(bs->file, s->refcount_table_offset,
                            refcount_table_size2, s->refcount_table, 0);
        if (ret < 0) {
            goto fail;
        }
        for (i = 0; i < s->refcount_table_size; i++) {
            be64_to_cpus(&s->refcount_table[i]);
        }
        update_max_refcount_table_index(s);
    }
    return 0;
 fail:
    return ret;
}

void qcow2_refcount_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    g_free(s->refcount_table);
}

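/*
 * Sub-byte accessors pack 8, 4 or 2 refcounts per byte. Worked example for
 * refcount_order = 1 (2-bit entries): entry 5 lives in byte 5 / 4 = 1 at bit
 * position 2 * (5 % 4) = 2, so get_refcount_ro1() returns (byte >> 2) & 0x3.
 * The 16/32/64-bit variants below instead convert between the big-endian
 * on-disk format and host byte order.
 */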
static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
}

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 1));
    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
}

static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
           & 0x3;
}

static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 2));
    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
}

static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
           & 0xf;
}

static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 4));
    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
}

static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    return ((const uint8_t *)refcount_array)[index];
}

static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 8));
    ((uint8_t *)refcount_array)[index] = value;
}

static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
}

static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 16));
    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
}

static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
}

static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 32));
    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
}

static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
}

static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
}


static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset,
                               void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                           refcount_block);
}

/*
 * Retrieves the refcount of the cluster given by its index and stores it in
 * *refcount. Returns 0 on success and -errno on failure.
 */
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
                       uint64_t *refcount)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    void *refcount_block;

    refcount_table_index = cluster_index >> s->refcount_block_bits;
    if (refcount_table_index >= s->refcount_table_size) {
        *refcount = 0;
        return 0;
    }
    refcount_block_offset =
        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    if (!refcount_block_offset) {
        *refcount = 0;
        return 0;
    }

    if (offset_into_cluster(s, refcount_block_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
                                " unaligned (reftable index: %#" PRIx64 ")",
                                refcount_block_offset, refcount_table_index);
        return -EIO;
    }

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                          &refcount_block);
    if (ret < 0) {
        return ret;
    }

    block_index = cluster_index & (s->refcount_block_size - 1);
    *refcount = s->get_refcount(refcount_block, block_index);

    qcow2_cache_put(s->refcount_block_cache, &refcount_block);

    return 0;
}

/* Checks if two offsets are described by the same refcount block */
static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
    uint64_t offset_b)
{
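    /*
     * E.g. with 64 kB clusters and the default 16-bit refcounts
     * (refcount_order = 4), refcount_block_bits is 15, so two offsets share
     * a refblock iff they lie in the same 2^(16 + 15) = 2 GB of the image.
     */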
    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);

    return (block_a == block_b);
}

/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int64_t ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

            return load_refcount_block(bs, refcount_block_offset,
                                       refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they describe themselves.
     *
     * - We need to consider that at this point we are inside update_refcount()
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX);
    if (new_block < 0) {
        return new_block;
    }

    /* The offset must fit in the offset field of the refcount table entry */
    assert((new_block & REFT_OFFSET_MASK) == new_block);

    /* If we're allocating the block at offset 0 then something is wrong */
    if (new_block == 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                "allocation of refcount block at offset 0");
        return -EIO;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

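    /*
     * Two cases: if the new refblock is covered by itself, it must describe
     * itself (its own refcount entry is set to 1 directly in the zeroed
     * block); otherwise its refcount lives in some other refblock and can go
     * through the regular update_refcount() path.
     */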
    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
                               refcount_table_index * REFTABLE_ENTRY_SIZE,
            sizeof(data64), &data64, 0);
        if (ret < 0) {
            goto fail;
        }

        s->refcount_table[refcount_table_index] = new_block;
        /* If there's a hole in s->refcount_table then it can happen
         * that refcount_table_index < s->max_refcount_table_index */
        s->max_refcount_table_index =
            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

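    /*
     * blocks_used refblocks describe exactly blocks_used * refcount_block_size
     * clusters, so meta_offset below is the first host offset that no
     * existing refcount entry can cover; the new self-describing refcount
     * area is placed there.
     */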
    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
        s->cluster_size;

    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
                              refcount_table_index, new_block);
    if (ret < 0) {
        return ret;
    }

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail:
    if (*refcount_block != NULL) {
        qcow2_cache_put(s->refcount_block_cache, refcount_block);
    }
    return ret;
}

/*
 * Starting at @start_offset, this function creates new self-covering refcount
 * structures: A new refcount table and refcount blocks which cover all of
 * themselves, and a number of @additional_clusters beyond their end.
 * @start_offset must be at the end of the image file, that is, there must be
 * only empty space beyond it.
 * If @exact_size is false, the refcount table will have 50% more entries than
 * necessary so it will not need to grow again soon.
 * If @new_refblock_offset is not zero, it contains the offset of a refcount
 * block that should be entered into the new refcount table at index
 * @new_refblock_index.
 *
 * Returns: The offset after the new refcount structures (i.e. where the
 *          @additional_clusters may be placed) on success, -errno on error.
 */
int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                            uint64_t additional_clusters, bool exact_size,
                            int new_refblock_index,
                            uint64_t new_refblock_offset)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t total_refblock_count_u64, additional_refblock_count;
    int total_refblock_count, table_size, area_reftable_index, table_clusters;
    int i;
    uint64_t table_offset, block_offset, end_offset;
    int ret;
    uint64_t *new_table;

    assert(!(start_offset % s->cluster_size));

    qcow2_refcount_metadata_size(start_offset / s->cluster_size +
                                 additional_clusters,
                                 s->cluster_size, s->refcount_order,
                                 !exact_size, &total_refblock_count_u64);
    if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }
    total_refblock_count = total_refblock_count_u64;

    /* Index in the refcount table of the first refcount block to cover the area
     * of refcount structures we are about to create; we know that
     * @total_refblock_count can cover @start_offset, so this will definitely
     * fit into an int. */
    area_reftable_index = (start_offset / s->cluster_size) /
                          s->refcount_block_size;

    if (exact_size) {
        table_size = total_refblock_count;
    } else {
        table_size = total_refblock_count +
                     DIV_ROUND_UP(total_refblock_count, 2);
    }
    /* The qcow2 file can only store the reftable size in number of clusters */
    table_size = ROUND_UP(table_size, s->cluster_size / REFTABLE_ENTRY_SIZE);
    table_clusters = (table_size * REFTABLE_ENTRY_SIZE) / s->cluster_size;
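    /* E.g. with 64 kB clusters one reftable cluster holds 65536 / 8 = 8192
     * entries, so table_size is rounded up to a multiple of 8192. */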

    if (table_size > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }

    new_table = g_try_new0(uint64_t, table_size);

    assert(table_size > 0);
    if (new_table == NULL) {
        ret = -ENOMEM;
        goto fail;
    }

    /* Fill the new refcount table */
    if (table_size > s->max_refcount_table_index) {
        /* We're actually growing the reftable */
        memcpy(new_table, s->refcount_table,
               (s->max_refcount_table_index + 1) * REFTABLE_ENTRY_SIZE);
    } else {
        /* Improbable case: We're shrinking the reftable. However, the caller
         * has assured us that there is only empty space beyond @start_offset,
         * so we can simply drop all of the refblocks that won't fit into the
         * new reftable. */
        memcpy(new_table, s->refcount_table, table_size * REFTABLE_ENTRY_SIZE);
    }

    if (new_refblock_offset) {
        assert(new_refblock_index < total_refblock_count);
        new_table[new_refblock_index] = new_refblock_offset;
    }

    /* Count how many new refblocks we have to create */
    additional_refblock_count = 0;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        if (!new_table[i]) {
            additional_refblock_count++;
        }
    }

    table_offset = start_offset + additional_refblock_count * s->cluster_size;
    end_offset = table_offset + table_clusters * s->cluster_size;

    /* Fill the refcount blocks, and create new ones, if necessary */
    block_offset = start_offset;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        void *refblock_data;
        uint64_t first_offset_covered;

        /* Reuse an existing refblock if possible, create a new one otherwise */
        if (new_table[i]) {
            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
                                  &refblock_data);
            if (ret < 0) {
                goto fail;
            }
        } else {
            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
                                        block_offset, &refblock_data);
            if (ret < 0) {
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
            block_offset += s->cluster_size;
        }

        /* First host offset covered by this refblock */
        first_offset_covered = (uint64_t)i * s->refcount_block_size *
                               s->cluster_size;
        if (first_offset_covered < end_offset) {
            int j, end_index;

            /* Set the refcount of all of the new refcount structures to 1 */

            if (first_offset_covered < start_offset) {
                assert(i == area_reftable_index);
                j = (start_offset - first_offset_covered) / s->cluster_size;
                assert(j < s->refcount_block_size);
            } else {
                j = 0;
            }

            end_index = MIN((end_offset - first_offset_covered) /
                            s->cluster_size,
                            s->refcount_block_size);

            for (; j < end_index; j++) {
                /* The caller guaranteed us this space would be empty */
                assert(s->get_refcount(refblock_data, j) == 0);
                s->set_refcount(refblock_data, j, 1);
            }

            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);
        }

        qcow2_cache_put(s->refcount_block_cache, &refblock_data);
    }

    assert(block_offset == table_offset);

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Write refcount table to disk */
    for (i = 0; i < total_refblock_count; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset,
                           table_size * REFTABLE_ENTRY_SIZE, new_table, 0);
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < total_refblock_count; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           sizeof(data), &data, 0);
    if (ret < 0) {
        goto fail;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset,
                        old_table_size * REFTABLE_ENTRY_SIZE,
                        QCOW2_DISCARD_OTHER);

    return end_offset;

fail:
    g_free(new_table);
    return ret;
}

void qcow2_process_discards(BlockDriverState *bs, int ret)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *next;

    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
        QTAILQ_REMOVE(&s->discards, d, next);

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
            int r2 = bdrv_pdiscard(bs->file, d->offset, d->bytes);
            if (r2 < 0) {
                trace_qcow2_process_discards_failed_region(d->offset, d->bytes,
                                                           r2);
            }
        }

        g_free(d);
    }
}

static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

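        /*
         * The union of both regions can only be as small as the sum of their
         * lengths if they touch or overlap; together with the no-overlap
         * assertion below, equality means they are exactly adjacent.
         */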
        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs     = bs,
        .offset = offset,
        .bytes  = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}

/* XXX: cache several refcount block clusters ? */
/* @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added */
static int update_refcount(BlockDriverState *bs,
                           int64_t offset,
                           int64_t length,
                           uint64_t addend,
                           bool decrease,
                           enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t start, last, cluster_offset;
    void *refcount_block = NULL;
    int64_t old_table_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
            addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    if (decrease) {
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
            s->l2_table_cache);
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
    for (cluster_offset = start; cluster_offset <= last;
         cluster_offset += s->cluster_size)
    {
        int block_index;
        uint64_t refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t table_index = cluster_index >> s->refcount_block_bits;

        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            if (refcount_block) {
                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
            }
            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            /* If the caller needs to restart the search for free clusters,
             * try the same ones first to see if they're still free. */
            if (ret == -EAGAIN) {
                if (s->free_cluster_index > (start >> s->cluster_bits)) {
                    s->free_cluster_index = (start >> s->cluster_bits);
                }
            }
            if (ret < 0) {
                goto fail;
            }
        }
        old_table_index = table_index;

        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index & (s->refcount_block_size - 1);

        refcount = s->get_refcount(refcount_block, block_index);
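        /*
         * Unsigned wraparound doubles as the range check: on decrease,
         * refcount - addend > refcount iff the subtraction underflowed; on
         * increase, the sum must neither wrap around nor exceed the widest
         * value representable with refcount_bits (s->refcount_max).
         */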
        if (decrease ? (refcount - addend > refcount)
                     : (refcount + addend < refcount ||
                        refcount + addend > s->refcount_max))
        {
            ret = -EINVAL;
            goto fail;
        }
        if (decrease) {
            refcount -= addend;
        } else {
            refcount += addend;
        }
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->set_refcount(refcount_block, block_index, refcount);

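        /*
         * A cluster that just became free may still have cached metadata
         * associated with it; drop any refblock or L2 table cache entries
         * for it so stale content cannot be written back once the cluster
         * is reused.
         */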
        if (refcount == 0) {
            void *table;

            table = qcow2_cache_is_table_offset(s->refcount_block_cache,
                                                offset);
            if (table != NULL) {
                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
                old_table_index = -1;
                qcow2_cache_discard(s->refcount_block_cache, table);
            }

            table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
            if (table != NULL) {
                qcow2_cache_discard(s->l2_table_cache, table);
            }

            if (s->discard_passthrough[type]) {
                update_refcount_discard(bs, cluster_offset, s->cluster_size);
            }
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        qcow2_cache_put(s->refcount_block_cache, &refcount_block);
    }

    /*
     * Try to undo any updates if an error is returned (this may succeed in
     * some cases like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
                                !decrease, QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}

/*
 * Increases or decreases the refcount of a given cluster.
 *
 * @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added.
 *
 * On success 0 is returned; on failure -errno is returned.
 */
int qcow2_update_cluster_refcount(BlockDriverState *bs,
                                  int64_t cluster_index,
                                  uint64_t addend, bool decrease,
                                  enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
                          decrease, type);
    if (ret < 0) {
        return ret;
    }

    return 0;
}



/*********************************************************/
/* cluster allocation functions */



/* return < 0 if error */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                    uint64_t max)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t i, nb_clusters, refcount;
    int ret;

    /* We can't allocate clusters if they may still be queued for discard. */
    if (s->cache_discards) {
        qcow2_process_discards(bs, 0);
    }

    nb_clusters = size_to_clusters(s, size);
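    /*
     * Linear scan for nb_clusters contiguous free clusters starting at
     * s->free_cluster_index; hitting an in-use cluster restarts the scan
     * right behind it (free_cluster_index has already been advanced), so on
     * success the found run ends exactly at free_cluster_index.
     */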
retry:
    for (i = 0; i < nb_clusters; i++) {
        uint64_t next_cluster_index = s->free_cluster_index++;
        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);

        if (ret < 0) {
            return ret;
        } else if (refcount != 0) {
            goto retry;
        }
    }

    /* Make sure that all offsets in the "allocated" range are representable
     * in the requested max */
    if (s->free_cluster_index > 0 &&
        s->free_cluster_index - 1 > (max >> s->cluster_bits))
    {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
            size,
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
#endif
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
}

int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
{
    int64_t offset;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
    do {
        offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET);
        if (offset < 0) {
            return offset;
        }

        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return offset;
}

int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
                                int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_index, refcount;
    uint64_t i;
    int ret;

    assert(nb_clusters >= 0);
    if (nb_clusters == 0) {
        return 0;
    }

    do {
        /* Check how many clusters there are free */
        cluster_index = offset >> s->cluster_bits;
        for (i = 0; i < nb_clusters; i++) {
            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
            if (ret < 0) {
                return ret;
            } else if (refcount != 0) {
                break;
            }
        }

        /* And then allocate them */
        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
                              QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return i;
}

/* Only used to allocate compressed sectors. We try to allocate
 * contiguous sectors. size must be <= cluster_size */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

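    /*
     * offset points at the unused tail of the cluster used by the previous
     * compressed allocation. If the tail is too small, a fresh cluster is
     * allocated below; it can only extend the current tail if it happens to
     * be physically contiguous with it.
     */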
    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster;

            new_cluster = alloc_clusters_noref(bs, s->cluster_size,
                                               MIN(s->cluster_offset_mask,
                                                   QCOW_MAX_CLUSTER_OFFSET));
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (new_cluster == 0) {
                qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                        "allocation of compressed cluster "
                                        "at offset 0");
                return -EIO;
            }

            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}

void qcow2_free_clusters(BlockDriverState *bs,
                          int64_t offset, int64_t size,
                          enum qcow2_discard_type type)
{
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
    ret = update_refcount(bs, offset, size, 1, true, type);
    if (ret < 0) {
        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
        /* TODO Remember the clusters to free them later and avoid leaking */
    }
}

/*
 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
 * normal cluster, compressed cluster, etc.)
 */
void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
                            enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    QCow2ClusterType ctype = qcow2_get_cluster_type(bs, l2_entry);

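    /*
     * With an external data file, guest clusters are not refcounted; all
     * there is to do is to forward the discard to the data file for cluster
     * types that actually have backing storage, where discards are allowed.
     */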
    if (has_data_file(bs)) {
        if (s->discard_passthrough[type] &&
            (ctype == QCOW2_CLUSTER_NORMAL ||
             ctype == QCOW2_CLUSTER_ZERO_ALLOC))
        {
            bdrv_pdiscard(s->data_file, l2_entry & L2E_OFFSET_MASK,
                          s->cluster_size);
        }
        return;
    }

    switch (ctype) {
    case QCOW2_CLUSTER_COMPRESSED:
        {
            uint64_t coffset;
            int csize;

            qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
            qcow2_free_clusters(bs, coffset, csize, type);
        }
        break;
    case QCOW2_CLUSTER_NORMAL:
    case QCOW2_CLUSTER_ZERO_ALLOC:
        if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
            qcow2_signal_corruption(bs, false, -1, -1,
                                    "Cannot free unaligned cluster %#llx",
                                    l2_entry & L2E_OFFSET_MASK);
        } else {
            qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
                                s->cluster_size, type);
        }
        break;
    case QCOW2_CLUSTER_ZERO_PLAIN:
    case QCOW2_CLUSTER_UNALLOCATED:
        break;
    default:
        abort();
    }
}

int qcow2_write_caches(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    ret = qcow2_cache_write(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    if (qcow2_need_accurate_refcounts(s)) {
        ret = qcow2_cache_write(bs, s->refcount_block_cache);
        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}

int qcow2_flush_caches(BlockDriverState *bs)
{
    int ret = qcow2_write_caches(bs);
    if (ret < 0) {
        return ret;
    }

    return bdrv_flush(bs->file->bs);
}

/*********************************************************/
/* snapshots and image creation */



/* update the refcounts of snapshots and the copied flag */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_entry, old_l2_offset;
    unsigned slice, slice_size2, n_slices;
    int i, j, l1_modified = 0;
    int ret;

    assert(addend >= -1 && addend <= 1);
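    /* addend == +1 / -1 adjusts the refcounts of all clusters reachable from
     * this L1 table; addend == 0 leaves refcounts alone and only recomputes
     * the QCOW_OFLAG_COPIED flags. */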

    l2_slice = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * L1E_SIZE;
    slice_size2 = s->l2_slice_size * l2_entry_size(s);
    n_slices = s->cluster_size / slice_size2;

    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        l1_table = g_try_malloc0(l1_size2);
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_size2, l1_table, 0);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            for (slice = 0; slice < n_slices; slice++) {
                ret = qcow2_cache_get(bs, s->l2_table_cache,
                                      l2_offset + slice * slice_size2,
                                      (void **) &l2_slice);
                if (ret < 0) {
                    goto fail;
                }

                for (j = 0; j < s->l2_slice_size; j++) {
                    uint64_t cluster_index;
                    uint64_t offset;

                    entry = get_l2_entry(s, l2_slice, j);
                    old_entry = entry;
                    entry &= ~QCOW_OFLAG_COPIED;
                    offset = entry & L2E_OFFSET_MASK;

                    switch (qcow2_get_cluster_type(bs, entry)) {
                    case QCOW2_CLUSTER_COMPRESSED:
                        if (addend != 0) {
                            uint64_t coffset;
                            int csize;

                            qcow2_parse_compressed_l2_entry(bs, entry,
                                                            &coffset, &csize);
                            ret = update_refcount(
                                bs, coffset, csize,
                                abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                        break;

                    case QCOW2_CLUSTER_NORMAL:
                    case QCOW2_CLUSTER_ZERO_ALLOC:
                        if (offset_into_cluster(s, offset)) {
                            /* Here l2_index means table (not slice) index */
                            int l2_index = slice * s->l2_slice_size + j;
                            qcow2_signal_corruption(
                                bs, true, -1, -1, "Cluster "
                                "allocation offset %#" PRIx64
                                " unaligned (L2 offset: %#"
                                PRIx64 ", L2 index: %#x)",
                                offset, l2_offset, l2_index);
                            ret = -EIO;
                            goto fail;
                        }

                        cluster_index = offset >> s->cluster_bits;
                        assert(cluster_index);
                        if (addend != 0) {
                            ret = qcow2_update_cluster_refcount(
                                bs, cluster_index, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }

                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                        if (ret < 0) {
                            goto fail;
                        }
                        break;

                    case QCOW2_CLUSTER_ZERO_PLAIN:
                    case QCOW2_CLUSTER_UNALLOCATED:
                        refcount = 0;
                        break;

                    default:
                        abort();
                    }

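                    /*
                     * QCOW_OFLAG_COPIED must be set iff the refcount is
                     * exactly 1: only then is the cluster private to the
                     * active layer and may be rewritten in place.
                     */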
                    if (refcount == 1) {
                        entry |= QCOW_OFLAG_COPIED;
                    }
                    if (entry != old_entry) {
                        if (addend > 0) {
                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                                       s->refcount_block_cache);
                        }
                        set_l2_entry(s, l2_slice, j, entry);
                        qcow2_cache_entry_mark_dirty(s->l2_table_cache,
                                                     l2_slice);
                    }
                }

                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
            }

            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_slice) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }

    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_size2, l1_table,
                               0);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated) {
        g_free(l1_table);
    }
    return ret;
}




/*********************************************************/
/* refcount checking functions */


static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
{
    /* This assertion holds because there is no way we can address more than
     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
     * offsets have to be representable in bytes); due to every cluster
     * corresponding to one refcount entry, we are well below that limit */
    assert(entries < (UINT64_C(1) << (64 - 9)));

    /* Thanks to the assertion this will not overflow, because
     * s->refcount_order < 7.
     * (note: x << s->refcount_order == x * s->refcount_bits) */
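    /* E.g. 1000 entries with refcount_order = 4 (16-bit refcounts) take
     * DIV_ROUND_UP(1000 << 4, 8) = 2000 bytes. */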
1469    return DIV_ROUND_UP(entries << s->refcount_order, 8);
1470}
1471
1472/**
1473 * Reallocates *array so that it can hold new_size entries. *size must contain
1474 * the current number of entries in *array. If the reallocation fails, *array
1475 * and *size will not be modified and -errno will be returned. If the
1476 * reallocation is successful, *array will be set to the new buffer, *size
1477 * will be set to new_size and 0 will be returned. The size of the reallocated
1478 * refcount array buffer will be aligned to a cluster boundary, and the newly
1479 * allocated area will be zeroed.
1480 */
1481static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1482                                  int64_t *size, int64_t new_size)
1483{
1484    int64_t old_byte_size, new_byte_size;
1485    void *new_ptr;
1486
1487    /* Round to clusters so the array can be directly written to disk */
1488    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1489                    * s->cluster_size;
1490    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1491                    * s->cluster_size;
1492
1493    if (new_byte_size == old_byte_size) {
1494        *size = new_size;
1495        return 0;
1496    }
1497
1498    assert(new_byte_size > 0);
1499
1500    if (new_byte_size > SIZE_MAX) {
1501        return -ENOMEM;
1502    }
1503
1504    new_ptr = g_try_realloc(*array, new_byte_size);
1505    if (!new_ptr) {
1506        return -ENOMEM;
1507    }
1508
1509    if (new_byte_size > old_byte_size) {
1510        memset((char *)new_ptr + old_byte_size, 0,
1511               new_byte_size - old_byte_size);
1512    }
1513
1514    *array = new_ptr;
1515    *size  = new_size;
1516
1517    return 0;
1518}
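
/*
 * Usage sketch (illustrative only): growing an in-memory refcount array
 * from scratch, the way the checking code below does.  Note that on failure
 * the old buffer and size stay valid and must still be freed by the caller.
 *
 *     void *refcount_table = NULL;
 *     int64_t refcount_table_size = 0;
 *
 *     ret = realloc_refcount_array(s, &refcount_table,
 *                                  &refcount_table_size, nb_clusters);
 *     if (ret < 0) {
 *         return ret; // *array and *size were left unmodified
 *     }
 *     // refcount_table now holds nb_clusters zeroed entries, padded up to
 *     // a cluster boundary so it can be written to disk directly.
 */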
1519
1520/*
1521 * Increases the refcount for a range of clusters in a given refcount table.
1522 * This is used to construct a temporary refcount table out of L1 and L2 tables
1523 * which can be compared to the refcount table saved in the image.
1524 *
1525 * Modifies the number of errors in res.
1526 */
1527int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
1528                             void **refcount_table,
1529                             int64_t *refcount_table_size,
1530                             int64_t offset, int64_t size)
1531{
1532    BDRVQcow2State *s = bs->opaque;
1533    uint64_t start, last, cluster_offset, k, refcount;
1534    int64_t file_len;
1535    int ret;
1536
1537    if (size <= 0) {
1538        return 0;
1539    }
1540
1541    file_len = bdrv_getlength(bs->file->bs);
1542    if (file_len < 0) {
1543        return file_len;
1544    }
1545
    /*
     * The last cluster of a qcow2 image may be only partially allocated, so
     * it can be valid to reference some space past the end of the file, but
     * that space must amount to less than one cluster.
     */
1551    if (offset + size - file_len >= s->cluster_size) {
1552        fprintf(stderr, "ERROR: counting reference for region exceeding the "
1553                "end of the file by one cluster or more: offset 0x%" PRIx64
1554                " size 0x%" PRIx64 "\n", offset, size);
1555        res->corruptions++;
1556        return 0;
1557    }
1558
1559    start = start_of_cluster(s, offset);
1560    last = start_of_cluster(s, offset + size - 1);
    for (cluster_offset = start; cluster_offset <= last;
         cluster_offset += s->cluster_size) {
1563        k = cluster_offset >> s->cluster_bits;
1564        if (k >= *refcount_table_size) {
1565            ret = realloc_refcount_array(s, refcount_table,
1566                                         refcount_table_size, k + 1);
1567            if (ret < 0) {
1568                res->check_errors++;
1569                return ret;
1570            }
1571        }
1572
1573        refcount = s->get_refcount(*refcount_table, k);
1574        if (refcount == s->refcount_max) {
1575            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1576                    "\n", cluster_offset);
1577            fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1578                    "width or qemu-img convert to create a clean copy if the "
1579                    "image cannot be opened for writing\n");
1580            res->corruptions++;
1581            continue;
1582        }
1583        s->set_refcount(*refcount_table, k, refcount + 1);
1584    }
1585
1586    return 0;
1587}
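
/*
 * Usage sketch (illustrative only): this is how the checking code accounts
 * for a piece of metadata, e.g. the image header occupying the first
 * cluster.  Each call bumps the in-memory refcount of every cluster touched
 * by the given byte range:
 *
 *     ret = qcow2_inc_refcounts_imrt(bs, res, &refcount_table,
 *                                    &refcount_table_size,
 *                                    0, s->cluster_size);
 *     if (ret < 0) {
 *         return ret; // internal error; mere corruptions still return 0
 *     }
 */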
1588
1589/* Flags for check_refcounts_l1() and check_refcounts_l2() */
1590enum {
1591    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1592};
1593
/*
 * Fix an L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN (or by making all of
 * its present subclusters QCOW2_SUBCLUSTER_ZERO_PLAIN).
 *
 * This function decrements res->corruptions on success, so the caller is
 * responsible for incrementing res->corruptions prior to the call.
 *
 * On failure, the in-memory @l2_table may be modified.
 */
1603static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
1604                                uint64_t l2_offset,
1605                                uint64_t *l2_table, int l2_index, bool active,
1606                                bool *metadata_overlap)
1607{
1608    BDRVQcow2State *s = bs->opaque;
1609    int ret;
1610    int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t));
1611    uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s);
1612    int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2;
1613
1614    if (has_subclusters(s)) {
1615        uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, l2_index);
1616
1617        /* Allocated subclusters become zero */
1618        l2_bitmap |= l2_bitmap << 32;
1619        l2_bitmap &= QCOW_L2_BITMAP_ALL_ZEROES;
1620
1621        set_l2_bitmap(s, l2_table, l2_index, l2_bitmap);
1622        set_l2_entry(s, l2_table, l2_index, 0);
1623    } else {
1624        set_l2_entry(s, l2_table, l2_index, QCOW_OFLAG_ZERO);
1625    }
1626
1627    ret = qcow2_pre_write_overlap_check(bs, ign, l2e_offset, l2_entry_size(s),
1628                                        false);
1629    if (metadata_overlap) {
1630        *metadata_overlap = ret < 0;
1631    }
1632    if (ret < 0) {
1633        fprintf(stderr, "ERROR: Overlap check failed\n");
1634        goto fail;
1635    }
1636
1637    ret = bdrv_pwrite_sync(bs->file, l2e_offset, l2_entry_size(s),
1638                           &l2_table[idx], 0);
1639    if (ret < 0) {
1640        fprintf(stderr, "ERROR: Failed to overwrite L2 "
1641                "table entry: %s\n", strerror(-ret));
1642        goto fail;
1643    }
1644
1645    res->corruptions--;
1646    res->corruptions_fixed++;
1647    return 0;
1648
1649fail:
1650    res->check_errors++;
1651    return ret;
1652}
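
/*
 * Worked example (illustrative) for the subcluster bitmap transformation
 * above: the low 32 bits of an extended L2 bitmap are "allocated" flags,
 * the high 32 bits are "reads as zeroes" flags.  Assume subclusters 0 and 2
 * are allocated and subcluster 5 is already a zero subcluster:
 *
 *     l2_bitmap                        == 0x0000002000000005
 *     l2_bitmap | (l2_bitmap << 32)    == 0x0000002500000005
 *     ... & QCOW_L2_BITMAP_ALL_ZEROES  == 0x0000002500000000
 *
 * so subclusters 0, 2 and 5 all become zero subclusters, no subcluster
 * stays allocated, and the L2 entry itself is cleared to 0.
 */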
1653
1654/*
 * Increases the refcount in the given refcount table for all clusters
1656 * referenced in the L2 table. While doing so, performs some checks on L2
1657 * entries.
1658 *
 * Returns 0 if the checks were performed (any errors found are accounted
 * for in res) or -errno if an internal error occurred.
1661 */
1662static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1663                              void **refcount_table,
1664                              int64_t *refcount_table_size, int64_t l2_offset,
1665                              int flags, BdrvCheckMode fix, bool active)
1666{
1667    BDRVQcow2State *s = bs->opaque;
1668    uint64_t l2_entry, l2_bitmap;
1669    uint64_t next_contiguous_offset = 0;
1670    int i, ret;
1671    size_t l2_size_bytes = s->l2_size * l2_entry_size(s);
1672    g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes);
1673    bool metadata_overlap;
1674
1675    /* Read L2 table from disk */
1676    ret = bdrv_pread(bs->file, l2_offset, l2_size_bytes, l2_table, 0);
1677    if (ret < 0) {
1678        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1679        res->check_errors++;
1680        return ret;
1681    }
1682
1683    /* Do the actual checks */
1684    for (i = 0; i < s->l2_size; i++) {
1685        uint64_t coffset;
1686        int csize;
1687        QCow2ClusterType type;
1688
1689        l2_entry = get_l2_entry(s, l2_table, i);
1690        l2_bitmap = get_l2_bitmap(s, l2_table, i);
1691        type = qcow2_get_cluster_type(bs, l2_entry);
1692
1693        if (type != QCOW2_CLUSTER_COMPRESSED) {
1694            /* Check reserved bits of Standard Cluster Descriptor */
1695            if (l2_entry & L2E_STD_RESERVED_MASK) {
1696                fprintf(stderr, "ERROR found l2 entry with reserved bits set: "
1697                        "%" PRIx64 "\n", l2_entry);
1698                res->corruptions++;
1699            }
1700        }
1701
1702        switch (type) {
1703        case QCOW2_CLUSTER_COMPRESSED:
1704            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1705            if (l2_entry & QCOW_OFLAG_COPIED) {
1706                fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
1707                    "copied flag must never be set for compressed "
1708                    "clusters\n", l2_entry & s->cluster_offset_mask);
1709                l2_entry &= ~QCOW_OFLAG_COPIED;
1710                res->corruptions++;
1711            }
1712
1713            if (has_data_file(bs)) {
1714                fprintf(stderr, "ERROR compressed cluster %d with data file, "
1715                        "entry=0x%" PRIx64 "\n", i, l2_entry);
1716                res->corruptions++;
1717                break;
1718            }
1719
1720            if (l2_bitmap) {
1721                fprintf(stderr, "ERROR compressed cluster %d with non-zero "
1722                        "subcluster allocation bitmap, entry=0x%" PRIx64 "\n",
1723                        i, l2_entry);
1724                res->corruptions++;
1725                break;
1726            }
1727
1728            /* Mark cluster as used */
1729            qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
1730            ret = qcow2_inc_refcounts_imrt(
1731                bs, res, refcount_table, refcount_table_size, coffset, csize);
1732            if (ret < 0) {
1733                return ret;
1734            }
1735
1736            if (flags & CHECK_FRAG_INFO) {
1737                res->bfi.allocated_clusters++;
1738                res->bfi.compressed_clusters++;
1739
1740                /*
1741                 * Compressed clusters are fragmented by nature.  Since they
1742                 * take up sub-sector space but we only have sector granularity
1743                 * I/O we need to re-read the same sectors even for adjacent
1744                 * compressed clusters.
1745                 */
1746                res->bfi.fragmented_clusters++;
1747            }
1748            break;
1749
1750        case QCOW2_CLUSTER_ZERO_ALLOC:
1751        case QCOW2_CLUSTER_NORMAL:
1752        {
1753            uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1754
1755            if ((l2_bitmap >> 32) & l2_bitmap) {
1756                res->corruptions++;
1757                fprintf(stderr, "ERROR offset=%" PRIx64 ": Allocated "
1758                        "cluster has corrupted subcluster allocation bitmap\n",
1759                        offset);
1760            }
1761
1762            /* Correct offsets are cluster aligned */
1763            if (offset_into_cluster(s, offset)) {
1764                bool contains_data;
1765                res->corruptions++;
1766
1767                if (has_subclusters(s)) {
1768                    contains_data = (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC);
1769                } else {
1770                    contains_data = !(l2_entry & QCOW_OFLAG_ZERO);
1771                }
1772
1773                if (!contains_data) {
1774                    fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated "
1775                            "cluster is not properly aligned; L2 entry "
1776                            "corrupted.\n",
1777                            fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
1778                            offset);
1779                    if (fix & BDRV_FIX_ERRORS) {
1780                        ret = fix_l2_entry_by_zero(bs, res, l2_offset,
1781                                                   l2_table, i, active,
1782                                                   &metadata_overlap);
1783                        if (metadata_overlap) {
1784                            /*
1785                             * Something is seriously wrong, so abort checking
1786                             * this L2 table.
1787                             */
1788                            return ret;
1789                        }
1790
1791                        if (ret == 0) {
1792                            /*
1793                             * Skip marking the cluster as used
1794                             * (it is unused now).
1795                             */
1796                            continue;
1797                        }
1798
1799                        /*
1800                         * Failed to fix.
1801                         * Do not abort, continue checking the rest of this
1802                         * L2 table's entries.
1803                         */
1804                    }
1805                } else {
1806                    fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
1807                        "not properly aligned; L2 entry corrupted.\n", offset);
1808                }
1809            }
1810
1811            if (flags & CHECK_FRAG_INFO) {
1812                res->bfi.allocated_clusters++;
1813                if (next_contiguous_offset &&
1814                    offset != next_contiguous_offset) {
1815                    res->bfi.fragmented_clusters++;
1816                }
1817                next_contiguous_offset = offset + s->cluster_size;
1818            }
1819
1820            /* Mark cluster as used */
1821            if (!has_data_file(bs)) {
1822                ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table,
1823                                               refcount_table_size,
1824                                               offset, s->cluster_size);
1825                if (ret < 0) {
1826                    return ret;
1827                }
1828            }
1829            break;
1830        }
1831
1832        case QCOW2_CLUSTER_ZERO_PLAIN:
1833            /* Impossible when image has subclusters */
1834            assert(!l2_bitmap);
1835            break;
1836
1837        case QCOW2_CLUSTER_UNALLOCATED:
1838            if (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC) {
1839                res->corruptions++;
1840                fprintf(stderr, "ERROR: Unallocated "
1841                        "cluster has non-zero subcluster allocation map\n");
1842            }
1843            break;
1844
1845        default:
1846            abort();
1847        }
1848    }
1849
1850    return 0;
1851}
1852
1853/*
1854 * Increases the refcount for the L1 table, its L2 tables and all referenced
1855 * clusters in the given refcount table. While doing so, performs some checks
1856 * on L1 and L2 entries.
1857 *
 * Returns 0 if the checks were performed (any errors found are accounted
 * for in res) or -errno if an internal error occurred.
1860 */
1861static int check_refcounts_l1(BlockDriverState *bs,
1862                              BdrvCheckResult *res,
1863                              void **refcount_table,
1864                              int64_t *refcount_table_size,
1865                              int64_t l1_table_offset, int l1_size,
1866                              int flags, BdrvCheckMode fix, bool active)
1867{
1868    BDRVQcow2State *s = bs->opaque;
1869    size_t l1_size_bytes = l1_size * L1E_SIZE;
1870    g_autofree uint64_t *l1_table = NULL;
1871    uint64_t l2_offset;
1872    int i, ret;
1873
1874    if (!l1_size) {
1875        return 0;
1876    }
1877
1878    /* Mark L1 table as used */
1879    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1880                                   l1_table_offset, l1_size_bytes);
1881    if (ret < 0) {
1882        return ret;
1883    }
1884
1885    l1_table = g_try_malloc(l1_size_bytes);
1886    if (l1_table == NULL) {
1887        res->check_errors++;
1888        return -ENOMEM;
1889    }
1890
1891    /* Read L1 table entries from disk */
1892    ret = bdrv_pread(bs->file, l1_table_offset, l1_size_bytes, l1_table, 0);
1893    if (ret < 0) {
1894        fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1895        res->check_errors++;
1896        return ret;
1897    }
1898
1899    for (i = 0; i < l1_size; i++) {
1900        be64_to_cpus(&l1_table[i]);
1901    }
1902
1903    /* Do the actual checks */
1904    for (i = 0; i < l1_size; i++) {
1905        if (!l1_table[i]) {
1906            continue;
1907        }
1908
1909        if (l1_table[i] & L1E_RESERVED_MASK) {
1910            fprintf(stderr, "ERROR found L1 entry with reserved bits set: "
1911                    "%" PRIx64 "\n", l1_table[i]);
1912            res->corruptions++;
1913        }
1914
1915        l2_offset = l1_table[i] & L1E_OFFSET_MASK;
1916
1917        /* Mark L2 table as used */
1918        ret = qcow2_inc_refcounts_imrt(bs, res,
1919                                       refcount_table, refcount_table_size,
1920                                       l2_offset, s->cluster_size);
1921        if (ret < 0) {
1922            return ret;
1923        }
1924
1925        /* L2 tables are cluster aligned */
1926        if (offset_into_cluster(s, l2_offset)) {
1927            fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1928                "cluster aligned; L1 entry corrupted\n", l2_offset);
1929            res->corruptions++;
1930        }
1931
1932        /* Process and check L2 entries */
1933        ret = check_refcounts_l2(bs, res, refcount_table,
1934                                 refcount_table_size, l2_offset, flags,
1935                                 fix, active);
1936        if (ret < 0) {
1937            return ret;
1938        }
1939    }
1940
1941    return 0;
1942}
1943
1944/*
1945 * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1946 *
1947 * This function does not print an error message nor does it increment
1948 * check_errors if qcow2_get_refcount fails (this is because such an error will
 * already have been detected and sufficiently signaled by the calling function
1950 * (qcow2_check_refcounts) by the time this function is called).
1951 */
1952static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1953                              BdrvCheckMode fix)
1954{
1955    BDRVQcow2State *s = bs->opaque;
1956    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1957    int ret;
1958    uint64_t refcount;
1959    int i, j;
1960    bool repair;
1961
1962    if (fix & BDRV_FIX_ERRORS) {
1963        /* Always repair */
1964        repair = true;
1965    } else if (fix & BDRV_FIX_LEAKS) {
1966        /* Repair only if that seems safe: This function is always
1967         * called after the refcounts have been fixed, so the refcount
1968         * is accurate if that repair was successful */
1969        repair = !res->check_errors && !res->corruptions && !res->leaks;
1970    } else {
1971        repair = false;
1972    }
1973
1974    for (i = 0; i < s->l1_size; i++) {
1975        uint64_t l1_entry = s->l1_table[i];
1976        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1977        int l2_dirty = 0;
1978
1979        if (!l2_offset) {
1980            continue;
1981        }
1982
1983        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1984                                 &refcount);
1985        if (ret < 0) {
1986            /* don't print message nor increment check_errors */
1987            continue;
1988        }
1989        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1990            res->corruptions++;
1991            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1992                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1993                    repair ? "Repairing" : "ERROR", i, l1_entry, refcount);
1994            if (repair) {
1995                s->l1_table[i] = refcount == 1
1996                               ? l1_entry |  QCOW_OFLAG_COPIED
1997                               : l1_entry & ~QCOW_OFLAG_COPIED;
1998                ret = qcow2_write_l1_entry(bs, i);
1999                if (ret < 0) {
2000                    res->check_errors++;
2001                    goto fail;
2002                }
2003                res->corruptions--;
2004                res->corruptions_fixed++;
2005            }
2006        }
2007
2008        ret = bdrv_pread(bs->file, l2_offset, s->l2_size * l2_entry_size(s),
2009                         l2_table, 0);
2010        if (ret < 0) {
2011            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
2012                    strerror(-ret));
2013            res->check_errors++;
2014            goto fail;
2015        }
2016
2017        for (j = 0; j < s->l2_size; j++) {
2018            uint64_t l2_entry = get_l2_entry(s, l2_table, j);
2019            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
2020            QCow2ClusterType cluster_type = qcow2_get_cluster_type(bs, l2_entry);
2021
2022            if (cluster_type == QCOW2_CLUSTER_NORMAL ||
2023                cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
2024                if (has_data_file(bs)) {
2025                    refcount = 1;
2026                } else {
2027                    ret = qcow2_get_refcount(bs,
2028                                             data_offset >> s->cluster_bits,
2029                                             &refcount);
2030                    if (ret < 0) {
2031                        /* don't print message nor increment check_errors */
2032                        continue;
2033                    }
2034                }
2035                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
2036                    res->corruptions++;
2037                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
2038                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
2039                            repair ? "Repairing" : "ERROR", l2_entry, refcount);
2040                    if (repair) {
2041                        set_l2_entry(s, l2_table, j,
2042                                     refcount == 1 ?
2043                                     l2_entry |  QCOW_OFLAG_COPIED :
2044                                     l2_entry & ~QCOW_OFLAG_COPIED);
2045                        l2_dirty++;
2046                    }
2047                }
2048            }
2049        }
2050
2051        if (l2_dirty > 0) {
2052            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
2053                                                l2_offset, s->cluster_size,
2054                                                false);
2055            if (ret < 0) {
2056                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
2057                        "overlap check failed: %s\n", strerror(-ret));
2058                res->check_errors++;
2059                goto fail;
2060            }
2061
2062            ret = bdrv_pwrite(bs->file, l2_offset, s->cluster_size, l2_table,
2063                              0);
2064            if (ret < 0) {
2065                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
2066                        strerror(-ret));
2067                res->check_errors++;
2068                goto fail;
2069            }
2070            res->corruptions -= l2_dirty;
2071            res->corruptions_fixed += l2_dirty;
2072        }
2073    }
2074
2075    ret = 0;
2076
2077fail:
2078    qemu_vfree(l2_table);
2079    return ret;
2080}
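
/*
 * The invariant repaired above, as a sketch (illustrative only): an L1/L2
 * entry must carry QCOW_OFLAG_COPIED exactly when the refcount of the
 * cluster it points to is 1, because only then may the cluster be written
 * in place without copy-on-write:
 *
 *     bool flag_consistent = ((refcount == 1) ==
 *                             ((entry & QCOW_OFLAG_COPIED) != 0));
 *
 * Repair always recomputes the flag from the (previously repaired)
 * refcounts, never the refcounts from the flag.
 */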
2081
2082/*
2083 * Checks consistency of refblocks and accounts for each refblock in
2084 * *refcount_table.
2085 */
2086static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
2087                           BdrvCheckMode fix, bool *rebuild,
2088                           void **refcount_table, int64_t *nb_clusters)
2089{
2090    BDRVQcow2State *s = bs->opaque;
2091    int64_t i, size;
2092    int ret;
2093
    for (i = 0; i < s->refcount_table_size; i++) {
2095        uint64_t offset, cluster;
2096        offset = s->refcount_table[i] & REFT_OFFSET_MASK;
2097        cluster = offset >> s->cluster_bits;
2098
2099        if (s->refcount_table[i] & REFT_RESERVED_MASK) {
2100            fprintf(stderr, "ERROR refcount table entry %" PRId64 " has "
2101                    "reserved bits set\n", i);
2102            res->corruptions++;
2103            *rebuild = true;
2104            continue;
2105        }
2106
2107        /* Refcount blocks are cluster aligned */
2108        if (offset_into_cluster(s, offset)) {
2109            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
2110                "cluster aligned; refcount table entry corrupted\n", i);
2111            res->corruptions++;
2112            *rebuild = true;
2113            continue;
2114        }
2115
2116        if (cluster >= *nb_clusters) {
2117            res->corruptions++;
2118            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
2119                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
2120
2121            if (fix & BDRV_FIX_ERRORS) {
2122                int64_t new_nb_clusters;
2123                Error *local_err = NULL;
2124
2125                if (offset > INT64_MAX - s->cluster_size) {
2126                    ret = -EINVAL;
2127                    goto resize_fail;
2128                }
2129
2130                ret = bdrv_truncate(bs->file, offset + s->cluster_size, false,
2131                                    PREALLOC_MODE_OFF, 0, &local_err);
2132                if (ret < 0) {
2133                    error_report_err(local_err);
2134                    goto resize_fail;
2135                }
2136                size = bdrv_getlength(bs->file->bs);
2137                if (size < 0) {
2138                    ret = size;
2139                    goto resize_fail;
2140                }
2141
2142                new_nb_clusters = size_to_clusters(s, size);
2143                assert(new_nb_clusters >= *nb_clusters);
2144
2145                ret = realloc_refcount_array(s, refcount_table,
2146                                             nb_clusters, new_nb_clusters);
2147                if (ret < 0) {
2148                    res->check_errors++;
2149                    return ret;
2150                }
2151
2152                if (cluster >= *nb_clusters) {
2153                    ret = -EINVAL;
2154                    goto resize_fail;
2155                }
2156
2157                res->corruptions--;
2158                res->corruptions_fixed++;
2159                ret = qcow2_inc_refcounts_imrt(bs, res,
2160                                               refcount_table, nb_clusters,
2161                                               offset, s->cluster_size);
2162                if (ret < 0) {
2163                    return ret;
2164                }
2165                /* No need to check whether the refcount is now greater than 1:
2166                 * This area was just allocated and zeroed, so it can only be
2167                 * exactly 1 after qcow2_inc_refcounts_imrt() */
2168                continue;
2169
2170resize_fail:
2171                *rebuild = true;
2172                fprintf(stderr, "ERROR could not resize image: %s\n",
2173                        strerror(-ret));
2174            }
2175            continue;
2176        }
2177
2178        if (offset != 0) {
2179            ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2180                                           offset, s->cluster_size);
2181            if (ret < 0) {
2182                return ret;
2183            }
2184            if (s->get_refcount(*refcount_table, cluster) != 1) {
2185                fprintf(stderr, "ERROR refcount block %" PRId64
2186                        " refcount=%" PRIu64 "\n", i,
2187                        s->get_refcount(*refcount_table, cluster));
2188                res->corruptions++;
2189                *rebuild = true;
2190            }
2191        }
2192    }
2193
2194    return 0;
2195}
2196
2197/*
2198 * Calculates an in-memory refcount table.
2199 */
2200static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2201                               BdrvCheckMode fix, bool *rebuild,
2202                               void **refcount_table, int64_t *nb_clusters)
2203{
2204    BDRVQcow2State *s = bs->opaque;
2205    int64_t i;
2206    QCowSnapshot *sn;
2207    int ret;
2208
2209    if (!*refcount_table) {
2210        int64_t old_size = 0;
2211        ret = realloc_refcount_array(s, refcount_table,
2212                                     &old_size, *nb_clusters);
2213        if (ret < 0) {
2214            res->check_errors++;
2215            return ret;
2216        }
2217    }
2218
2219    /* header */
2220    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2221                                   0, s->cluster_size);
2222    if (ret < 0) {
2223        return ret;
2224    }
2225
2226    /* current L1 table */
2227    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2228                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
2229                             fix, true);
2230    if (ret < 0) {
2231        return ret;
2232    }
2233
2234    /* snapshots */
2235    if (has_data_file(bs) && s->nb_snapshots) {
2236        fprintf(stderr, "ERROR %d snapshots in image with data file\n",
2237                s->nb_snapshots);
2238        res->corruptions++;
2239    }
2240
2241    for (i = 0; i < s->nb_snapshots; i++) {
2242        sn = s->snapshots + i;
2243        if (offset_into_cluster(s, sn->l1_table_offset)) {
2244            fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": "
2245                    "L1 table is not cluster aligned; snapshot table entry "
2246                    "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset);
2247            res->corruptions++;
2248            continue;
2249        }
2250        if (sn->l1_size > QCOW_MAX_L1_SIZE / L1E_SIZE) {
2251            fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": "
2252                    "L1 table is too large; snapshot table entry corrupted\n",
2253                    sn->id_str, sn->name, sn->l1_size);
2254            res->corruptions++;
2255            continue;
2256        }
2257        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2258                                 sn->l1_table_offset, sn->l1_size, 0, fix,
2259                                 false);
2260        if (ret < 0) {
2261            return ret;
2262        }
2263    }
2264    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2265                                   s->snapshots_offset, s->snapshots_size);
2266    if (ret < 0) {
2267        return ret;
2268    }
2269
2270    /* refcount data */
2271    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2272                                   s->refcount_table_offset,
2273                                   s->refcount_table_size *
2274                                   REFTABLE_ENTRY_SIZE);
2275    if (ret < 0) {
2276        return ret;
2277    }
2278
2279    /* encryption */
2280    if (s->crypto_header.length) {
2281        ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2282                                       s->crypto_header.offset,
2283                                       s->crypto_header.length);
2284        if (ret < 0) {
2285            return ret;
2286        }
2287    }
2288
2289    /* bitmaps */
2290    ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
2291    if (ret < 0) {
2292        return ret;
2293    }
2294
2295    return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
2296}
2297
2298/*
2299 * Compares the actual reference count for each cluster in the image against the
2300 * refcount as reported by the refcount structures on-disk.
2301 */
2302static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2303                              BdrvCheckMode fix, bool *rebuild,
2304                              int64_t *highest_cluster,
2305                              void *refcount_table, int64_t nb_clusters)
2306{
2307    BDRVQcow2State *s = bs->opaque;
2308    int64_t i;
2309    uint64_t refcount1, refcount2;
2310    int ret;
2311
2312    for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2313        ret = qcow2_get_refcount(bs, i, &refcount1);
2314        if (ret < 0) {
2315            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2316                    i, strerror(-ret));
2317            res->check_errors++;
2318            continue;
2319        }
2320
2321        refcount2 = s->get_refcount(refcount_table, i);
2322
2323        if (refcount1 > 0 || refcount2 > 0) {
2324            *highest_cluster = i;
2325        }
2326
2327        if (refcount1 != refcount2) {
2328            /* Check if we're allowed to fix the mismatch */
2329            int *num_fixed = NULL;
2330            if (refcount1 == 0) {
2331                *rebuild = true;
2332            } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2333                num_fixed = &res->leaks_fixed;
2334            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2335                num_fixed = &res->corruptions_fixed;
2336            }
2337
2338            fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2339                    " reference=%" PRIu64 "\n",
2340                   num_fixed != NULL     ? "Repairing" :
2341                   refcount1 < refcount2 ? "ERROR" :
2342                                           "Leaked",
2343                   i, refcount1, refcount2);
2344
2345            if (num_fixed) {
2346                ret = update_refcount(bs, i << s->cluster_bits, 1,
2347                                      refcount_diff(refcount1, refcount2),
2348                                      refcount1 > refcount2,
2349                                      QCOW2_DISCARD_ALWAYS);
2350                if (ret >= 0) {
2351                    (*num_fixed)++;
2352                    continue;
2353                }
2354            }
2355
            /* And if we couldn't, record the error */
2357            if (refcount1 < refcount2) {
2358                res->corruptions++;
2359            } else {
2360                res->leaks++;
2361            }
2362        }
2363    }
2364}
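
/*
 * Summary of the mismatch handling above (illustrative), with refcount1 the
 * on-disk value and refcount2 the recomputed reference:
 *
 *     refcount1 == 0          -> cluster missing from refblocks: rebuild
 *     refcount1 >  refcount2  -> leak, fixable with BDRV_FIX_LEAKS
 *     refcount1 <  refcount2  -> corruption, fixable with BDRV_FIX_ERRORS
 *
 * A fix is a single update_refcount() call with the absolute difference and
 * a decrease flag, moving the on-disk value onto the reference.
 */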
2365
2366/*
2367 * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2368 * the on-disk refcount structures.
2369 *
2370 * On input, *first_free_cluster tells where to start looking, and need not
2371 * actually be a free cluster; the returned offset will not be before that
2372 * cluster.  On output, *first_free_cluster points to the first gap found, even
2373 * if that gap was too small to be used as the returned offset.
2374 *
2375 * Note that *first_free_cluster is a cluster index whereas the return value is
2376 * an offset.
2377 */
2378static int64_t alloc_clusters_imrt(BlockDriverState *bs,
2379                                   int cluster_count,
2380                                   void **refcount_table,
2381                                   int64_t *imrt_nb_clusters,
2382                                   int64_t *first_free_cluster)
2383{
2384    BDRVQcow2State *s = bs->opaque;
2385    int64_t cluster = *first_free_cluster, i;
2386    bool first_gap = true;
2387    int contiguous_free_clusters;
2388    int ret;
2389
2390    /* Starting at *first_free_cluster, find a range of at least cluster_count
2391     * continuously free clusters */
2392    for (contiguous_free_clusters = 0;
2393         cluster < *imrt_nb_clusters &&
2394         contiguous_free_clusters < cluster_count;
2395         cluster++)
2396    {
2397        if (!s->get_refcount(*refcount_table, cluster)) {
2398            contiguous_free_clusters++;
2399            if (first_gap) {
2400                /* If this is the first free cluster found, update
2401                 * *first_free_cluster accordingly */
2402                *first_free_cluster = cluster;
2403                first_gap = false;
2404            }
2405        } else if (contiguous_free_clusters) {
2406            contiguous_free_clusters = 0;
2407        }
2408    }
2409
2410    /* If contiguous_free_clusters is greater than zero, it contains the number
2411     * of continuously free clusters until the current cluster; the first free
2412     * cluster in the current "gap" is therefore
2413     * cluster - contiguous_free_clusters */
2414
    /* If no such range could be found, grow the in-memory refcount table
     * so that free clusters can be appended at the end of the image */
2417    if (contiguous_free_clusters < cluster_count) {
2418        /* contiguous_free_clusters clusters are already empty at the image end;
2419         * we need cluster_count clusters; therefore, we have to allocate
2420         * cluster_count - contiguous_free_clusters new clusters at the end of
2421         * the image (which is the current value of cluster; note that cluster
2422         * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
2423         * the image end) */
2424        ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
2425                                     cluster + cluster_count
2426                                     - contiguous_free_clusters);
2427        if (ret < 0) {
2428            return ret;
2429        }
2430    }
2431
2432    /* Go back to the first free cluster */
2433    cluster -= contiguous_free_clusters;
2434    for (i = 0; i < cluster_count; i++) {
2435        s->set_refcount(*refcount_table, cluster + i, 1);
2436    }
2437
2438    return cluster << s->cluster_bits;
2439}
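
/*
 * Usage sketch (illustrative only): allocating one cluster for a refblock
 * during rebuild.  first_free_cluster caches the scan position across
 * calls, so repeated allocations do not rescan the start of the IMRT:
 *
 *     int64_t first_free_cluster = 0;
 *     int64_t offset = alloc_clusters_imrt(bs, 1, &refcount_table,
 *                                          &nb_clusters,
 *                                          &first_free_cluster);
 *     if (offset < 0) {
 *         return offset; // -ENOMEM if growing the IMRT failed
 *     }
 *     // offset is cluster-aligned; the cluster's IMRT refcount is now 1
 */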
2440
2441/*
2442 * Helper function for rebuild_refcount_structure().
2443 *
2444 * Scan the range of clusters [first_cluster, end_cluster) for allocated
2445 * clusters and write all corresponding refblocks to disk.  The refblock
2446 * and allocation data is taken from the in-memory refcount table
2447 * *refcount_table[] (of size *nb_clusters), which is basically one big
2448 * (unlimited size) refblock for the whole image.
2449 *
2450 * For these refblocks, clusters are allocated using said in-memory
2451 * refcount table.  Care is taken that these allocations are reflected
2452 * in the refblocks written to disk.
2453 *
2454 * The refblocks' offsets are written into a reftable, which is
2455 * *on_disk_reftable_ptr[] (of size *on_disk_reftable_entries_ptr).  If
2456 * that reftable is of insufficient size, it will be resized to fit.
2457 * This reftable is not written to disk.
2458 *
2459 * (If *on_disk_reftable_ptr is not NULL, the entries within are assumed
2460 * to point to existing valid refblocks that do not need to be allocated
2461 * again.)
2462 *
2463 * Return whether the on-disk reftable array was resized (true/false),
2464 * or -errno on error.
2465 */
2466static int rebuild_refcounts_write_refblocks(
2467        BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters,
2468        int64_t first_cluster, int64_t end_cluster,
2469        uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr,
2470        Error **errp
2471    )
2472{
2473    BDRVQcow2State *s = bs->opaque;
2474    int64_t cluster;
2475    int64_t refblock_offset, refblock_start, refblock_index;
2476    int64_t first_free_cluster = 0;
2477    uint64_t *on_disk_reftable = *on_disk_reftable_ptr;
2478    uint32_t on_disk_reftable_entries = *on_disk_reftable_entries_ptr;
2479    void *on_disk_refblock;
2480    bool reftable_grown = false;
2481    int ret;
2482
2483    for (cluster = first_cluster; cluster < end_cluster; cluster++) {
2484        /* Check all clusters to find refblocks that contain non-zero entries */
2485        if (!s->get_refcount(*refcount_table, cluster)) {
2486            continue;
2487        }
2488
2489        /*
2490         * This cluster is allocated, so we need to create a refblock
2491         * for it.  The data we will write to disk is just the
2492         * respective slice from *refcount_table, so it will contain
2493         * accurate refcounts for all clusters belonging to this
2494         * refblock.  After we have written it, we will therefore skip
2495         * all remaining clusters in this refblock.
2496         */
2497
2498        refblock_index = cluster >> s->refcount_block_bits;
2499        refblock_start = refblock_index << s->refcount_block_bits;
2500
2501        if (on_disk_reftable_entries > refblock_index &&
2502            on_disk_reftable[refblock_index])
2503        {
            /*
             * We can get here on a repeated run of this function: We have a
             * reftable from a previous run, and the refblock is already
             * allocated.  No need to allocate it again.
             */
2509            refblock_offset = on_disk_reftable[refblock_index];
2510        } else {
2511            int64_t refblock_cluster_index;
2512
2513            /* Don't allocate a cluster in a refblock already written to disk */
2514            if (first_free_cluster < refblock_start) {
2515                first_free_cluster = refblock_start;
2516            }
2517            refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2518                                                  nb_clusters,
2519                                                  &first_free_cluster);
2520            if (refblock_offset < 0) {
2521                error_setg_errno(errp, -refblock_offset,
2522                                 "ERROR allocating refblock");
2523                return refblock_offset;
2524            }
2525
2526            refblock_cluster_index = refblock_offset / s->cluster_size;
2527            if (refblock_cluster_index >= end_cluster) {
2528                /*
2529                 * We must write the refblock that holds this refblock's
2530                 * refcount
2531                 */
2532                end_cluster = refblock_cluster_index + 1;
2533            }
2534
2535            if (on_disk_reftable_entries <= refblock_index) {
2536                on_disk_reftable_entries =
2537                    ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE,
2538                             s->cluster_size) / REFTABLE_ENTRY_SIZE;
2539                on_disk_reftable =
2540                    g_try_realloc(on_disk_reftable,
2541                                  on_disk_reftable_entries *
2542                                  REFTABLE_ENTRY_SIZE);
2543                if (!on_disk_reftable) {
2544                    error_setg(errp, "ERROR allocating reftable memory");
2545                    return -ENOMEM;
2546                }
2547
2548                memset(on_disk_reftable + *on_disk_reftable_entries_ptr, 0,
2549                       (on_disk_reftable_entries -
2550                        *on_disk_reftable_entries_ptr) *
2551                       REFTABLE_ENTRY_SIZE);
2552
2553                *on_disk_reftable_ptr = on_disk_reftable;
2554                *on_disk_reftable_entries_ptr = on_disk_reftable_entries;
2555
2556                reftable_grown = true;
2557            } else {
2558                assert(on_disk_reftable);
2559            }
2560            on_disk_reftable[refblock_index] = refblock_offset;
2561        }
2562
2563        /* Refblock is allocated, write it to disk */
2564
2565        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2566                                            s->cluster_size, false);
2567        if (ret < 0) {
2568            error_setg_errno(errp, -ret, "ERROR writing refblock");
2569            return ret;
2570        }
2571
2572        /*
2573         * The refblock is simply a slice of *refcount_table.
2574         * Note that the size of *refcount_table is always aligned to
2575         * whole clusters, so the write operation will not result in
2576         * out-of-bounds accesses.
2577         */
2578        on_disk_refblock = (void *)((char *) *refcount_table +
2579                                    refblock_index * s->cluster_size);
2580
2581        ret = bdrv_pwrite(bs->file, refblock_offset, s->cluster_size,
2582                          on_disk_refblock, 0);
2583        if (ret < 0) {
2584            error_setg_errno(errp, -ret, "ERROR writing refblock");
2585            return ret;
2586        }
2587
2588        /* This refblock is done, skip to its end */
2589        cluster = refblock_start + s->refcount_block_size - 1;
2590    }
2591
2592    return reftable_grown;
2593}
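
/*
 * Note on the slice write above (illustrative): since the IMRT is one
 * contiguous, cluster-aligned array of refcounts covering the whole image,
 * refblock number N on disk is byte-for-byte identical to
 *
 *     (char *)*refcount_table + N * s->cluster_size
 *
 * so no separate refblock buffer needs to be assembled; every refblock is
 * written straight out of the IMRT.
 */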
2594
2595/*
2596 * Creates a new refcount structure based solely on the in-memory information
2597 * given through *refcount_table (this in-memory information is basically just
2598 * the concatenation of all refblocks).  All necessary allocations will be
2599 * reflected in that array.
2600 *
2601 * On success, the old refcount structure is leaked (it will be covered by the
2602 * new refcount structure).
2603 */
2604static int rebuild_refcount_structure(BlockDriverState *bs,
2605                                      BdrvCheckResult *res,
2606                                      void **refcount_table,
2607                                      int64_t *nb_clusters,
2608                                      Error **errp)
2609{
2610    BDRVQcow2State *s = bs->opaque;
2611    int64_t reftable_offset = -1;
2612    int64_t reftable_length = 0;
2613    int64_t reftable_clusters;
2614    int64_t refblock_index;
2615    uint32_t on_disk_reftable_entries = 0;
2616    uint64_t *on_disk_reftable = NULL;
2617    int ret = 0;
2618    int reftable_size_changed = 0;
2619    struct {
2620        uint64_t reftable_offset;
2621        uint32_t reftable_clusters;
2622    } QEMU_PACKED reftable_offset_and_clusters;
2623
2624    qcow2_cache_empty(bs, s->refcount_block_cache);
2625
2626    /*
2627     * For each refblock containing entries, we try to allocate a
2628     * cluster (in the in-memory refcount table) and write its offset
2629     * into on_disk_reftable[].  We then write the whole refblock to
2630     * disk (as a slice of the in-memory refcount table).
2631     * This is done by rebuild_refcounts_write_refblocks().
2632     *
2633     * Once we have scanned all clusters, we try to find space for the
2634     * reftable.  This will dirty the in-memory refcount table (i.e.
2635     * make it differ from the refblocks we have already written), so we
2636     * need to run rebuild_refcounts_write_refblocks() again for the
2637     * range of clusters where the reftable has been allocated.
2638     *
2639     * This second run might make the reftable grow again, in which case
2640     * we will need to allocate another space for it, which is why we
2641     * repeat all this until the reftable stops growing.
2642     *
     * (This loop will terminate, because every cluster by which the
     * reftable grows lets it accommodate many more refcounts, so at some
     * point it must be able to cover both the reftable itself and all
     * refblocks describing it.)
2647     *
2648     * We then convert the reftable to big-endian and write it to disk.
2649     *
2650     * Note that we never free any reftable allocations.  Doing so would
2651     * needlessly complicate the algorithm: The eventual second check
2652     * run we do will clean up all leaks we have caused.
2653     */
2654
2655    reftable_size_changed =
2656        rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
2657                                          0, *nb_clusters,
2658                                          &on_disk_reftable,
2659                                          &on_disk_reftable_entries, errp);
2660    if (reftable_size_changed < 0) {
2661        res->check_errors++;
2662        ret = reftable_size_changed;
2663        goto fail;
2664    }
2665
2666    /*
2667     * There was no reftable before, so rebuild_refcounts_write_refblocks()
2668     * must have increased its size (from 0 to something).
2669     */
2670    assert(reftable_size_changed);
2671
2672    do {
2673        int64_t reftable_start_cluster, reftable_end_cluster;
2674        int64_t first_free_cluster = 0;
2675
2676        reftable_length = on_disk_reftable_entries * REFTABLE_ENTRY_SIZE;
2677        reftable_clusters = size_to_clusters(s, reftable_length);
2678
2679        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2680                                              refcount_table, nb_clusters,
2681                                              &first_free_cluster);
2682        if (reftable_offset < 0) {
2683            error_setg_errno(errp, -reftable_offset,
2684                             "ERROR allocating reftable");
2685            res->check_errors++;
2686            ret = reftable_offset;
2687            goto fail;
2688        }
2689
2690        /*
2691         * We need to update the affected refblocks, so re-run the
2692         * write_refblocks loop for the reftable's range of clusters.
2693         */
2694        assert(offset_into_cluster(s, reftable_offset) == 0);
2695        reftable_start_cluster = reftable_offset / s->cluster_size;
2696        reftable_end_cluster = reftable_start_cluster + reftable_clusters;
2697        reftable_size_changed =
2698            rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
2699                                              reftable_start_cluster,
2700                                              reftable_end_cluster,
2701                                              &on_disk_reftable,
2702                                              &on_disk_reftable_entries, errp);
2703        if (reftable_size_changed < 0) {
2704            res->check_errors++;
2705            ret = reftable_size_changed;
2706            goto fail;
2707        }
2708
2709        /*
2710         * If the reftable size has changed, we will need to find a new
2711         * allocation, repeating the loop.
2712         */
2713    } while (reftable_size_changed);
2714
2715    /* The above loop must have run at least once */
2716    assert(reftable_offset >= 0);
2717
2718    /*
2719     * All allocations are done, all refblocks are written, convert the
2720     * reftable to big-endian and write it to disk.
2721     */
2722
2723    for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
2724         refblock_index++)
2725    {
2726        cpu_to_be64s(&on_disk_reftable[refblock_index]);
2727    }
2728
2729    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, reftable_length,
2730                                        false);
2731    if (ret < 0) {
2732        error_setg_errno(errp, -ret, "ERROR writing reftable");
2733        goto fail;
2734    }
2735
2736    assert(reftable_length < INT_MAX);
2737    ret = bdrv_pwrite(bs->file, reftable_offset, reftable_length,
2738                      on_disk_reftable, 0);
2739    if (ret < 0) {
2740        error_setg_errno(errp, -ret, "ERROR writing reftable");
2741        goto fail;
2742    }
2743
2744    /* Enter new reftable into the image header */
2745    reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
2746    reftable_offset_and_clusters.reftable_clusters =
2747        cpu_to_be32(reftable_clusters);
2748    ret = bdrv_pwrite_sync(bs->file,
2749                           offsetof(QCowHeader, refcount_table_offset),
2750                           sizeof(reftable_offset_and_clusters),
2751                           &reftable_offset_and_clusters, 0);
2752    if (ret < 0) {
2753        error_setg_errno(errp, -ret, "ERROR setting reftable");
2754        goto fail;
2755    }
2756
2757    for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
2758         refblock_index++)
2759    {
2760        be64_to_cpus(&on_disk_reftable[refblock_index]);
2761    }
2762    s->refcount_table = on_disk_reftable;
2763    s->refcount_table_offset = reftable_offset;
2764    s->refcount_table_size = on_disk_reftable_entries;
2765    update_max_refcount_table_index(s);
2766
2767    return 0;
2768
2769fail:
2770    g_free(on_disk_reftable);
2771    return ret;
2772}
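
/*
 * Layout note (illustrative): the packed struct written above works because
 * refcount_table_offset and refcount_table_clusters are adjacent in
 * QCowHeader, so a single 12-byte write at
 * offsetof(QCowHeader, refcount_table_offset) updates both fields:
 *
 *     uint64_t refcount_table_offset;   // big-endian on disk
 *     uint32_t refcount_table_clusters; // big-endian on disk
 */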
2773
2774/*
2775 * Checks an image for refcount consistency.
2776 *
 * Returns 0 if the check completed (any errors found are reported in @res),
 * and -errno when an internal error occurred.
2779 */
2780int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2781                          BdrvCheckMode fix)
2782{
2783    BDRVQcow2State *s = bs->opaque;
2784    BdrvCheckResult pre_compare_res;
2785    int64_t size, highest_cluster, nb_clusters;
2786    void *refcount_table = NULL;
2787    bool rebuild = false;
2788    int ret;
2789
2790    size = bdrv_getlength(bs->file->bs);
2791    if (size < 0) {
2792        res->check_errors++;
2793        return size;
2794    }
2795
2796    nb_clusters = size_to_clusters(s, size);
2797    if (nb_clusters > INT_MAX) {
2798        res->check_errors++;
2799        return -EFBIG;
2800    }
2801
2802    res->bfi.total_clusters =
2803        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2804
2805    ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2806                              &nb_clusters);
2807    if (ret < 0) {
2808        goto fail;
2809    }
2810
    /* In case we don't need to rebuild the refcount structure (but want to fix
     * something), compare_refcounts() is called again further down, in which
     * case this first result should be ignored */
2814    pre_compare_res = *res;
2815    compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2816                      nb_clusters);
2817
2818    if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2819        BdrvCheckResult old_res = *res;
2820        int fresh_leaks = 0;
2821        Error *local_err = NULL;
2822
2823        fprintf(stderr, "Rebuilding refcount structure\n");
2824        ret = rebuild_refcount_structure(bs, res, &refcount_table,
2825                                         &nb_clusters, &local_err);
2826        if (ret < 0) {
2827            error_report_err(local_err);
2828            goto fail;
2829        }
2830
2831        res->corruptions = 0;
2832        res->leaks = 0;
2833
        /* Because the old reftable has been exchanged for a new one, the
         * references have to be recalculated */
2836        rebuild = false;
2837        memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2838        ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2839                                  &nb_clusters);
2840        if (ret < 0) {
2841            goto fail;
2842        }
2843
2844        if (fix & BDRV_FIX_LEAKS) {
            /* The old refcount structures are now leaked; fix that. The
             * result can be ignored, aside from leaks which were introduced
             * by rebuild_refcount_structure() and could not be fixed */
2848            BdrvCheckResult saved_res = *res;
2849            *res = (BdrvCheckResult){ 0 };
2850
2851            compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2852                              &highest_cluster, refcount_table, nb_clusters);
2853            if (rebuild) {
2854                fprintf(stderr, "ERROR rebuilt refcount structure is still "
2855                        "broken\n");
2856            }
2857
2858            /* Any leaks accounted for here were introduced by
2859             * rebuild_refcount_structure() because that function has created a
2860             * new refcount structure from scratch */
2861            fresh_leaks = res->leaks;
2862            *res = saved_res;
2863        }
2864
2865        if (res->corruptions < old_res.corruptions) {
2866            res->corruptions_fixed += old_res.corruptions - res->corruptions;
2867        }
2868        if (res->leaks < old_res.leaks) {
2869            res->leaks_fixed += old_res.leaks - res->leaks;
2870        }
2871        res->leaks += fresh_leaks;
2872    } else if (fix) {
2873        if (rebuild) {
2874            fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2875            res->check_errors++;
2876            ret = -EIO;
2877            goto fail;
2878        }
2879
2880        if (res->leaks || res->corruptions) {
2881            *res = pre_compare_res;
2882            compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2883                              refcount_table, nb_clusters);
2884        }
2885    }
2886
2887    /* check OFLAG_COPIED */
2888    ret = check_oflag_copied(bs, res, fix);
2889    if (ret < 0) {
2890        goto fail;
2891    }
2892
2893    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2894    ret = 0;
2895
2896fail:
2897    g_free(refcount_table);
2898
2899    return ret;
2900}
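
/*
 * Usage sketch (illustrative only; in QEMU proper this is reached through
 * the driver's check callback, e.g. when running "qemu-img check -r all"):
 *
 *     BdrvCheckResult result = {0};
 *
 *     ret = qcow2_check_refcounts(bs, &result,
 *                                 BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
 *     if (ret < 0) {
 *         // internal error, image state unknown
 *     } else if (result.corruptions || result.leaks) {
 *         // problems remain that could not be repaired
 *     }
 */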
2901
2902#define overlaps_with(ofs, sz) \
2903    ranges_overlap(offset, size, ofs, sz)
2904
2905/*
2906 * Checks if the given offset into the image file is actually free to use by
2907 * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2908 * i.e. a sanity check without relying on the refcount tables.
2909 *
2910 * The ign parameter specifies which checks not to perform (a bitmask of
2911 * QCow2MetadataOverlap values), i.e., which sections to ignore.
2912 *
2913 * Returns:
2914 * - 0 if writing to this offset will not affect the mentioned metadata
2915 * - a positive QCow2MetadataOverlap value indicating one overlapping section
2916 * - a negative value (-errno) indicating an error while performing a check,
2917 *   e.g. when bdrv_pread failed on QCOW2_OL_INACTIVE_L2
2918 */
2919int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2920                                 int64_t size)
2921{
2922    BDRVQcow2State *s = bs->opaque;
2923    int chk = s->overlap_check & ~ign;
2924    int i, j;
2925
2926    if (!size) {
2927        return 0;
2928    }
2929
2930    if (chk & QCOW2_OL_MAIN_HEADER) {
2931        if (offset < s->cluster_size) {
2932            return QCOW2_OL_MAIN_HEADER;
2933        }
2934    }
2935
2936    /* align the range to be tested to cluster boundaries */
2937    size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
2938    offset = start_of_cluster(s, offset);
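    /* For example, with 64 kB clusters, offset 0x12345 and size 0x100 are
     * widened to offset 0x10000 and size 0x10000 */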
2939
2940    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2941        if (overlaps_with(s->l1_table_offset, s->l1_size * L1E_SIZE)) {
2942            return QCOW2_OL_ACTIVE_L1;
2943        }
2944    }
2945
2946    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2947        if (overlaps_with(s->refcount_table_offset,
2948            s->refcount_table_size * REFTABLE_ENTRY_SIZE)) {
2949            return QCOW2_OL_REFCOUNT_TABLE;
2950        }
2951    }
2952
2953    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2954        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2955            return QCOW2_OL_SNAPSHOT_TABLE;
2956        }
2957    }
2958
2959    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2960        for (i = 0; i < s->nb_snapshots; i++) {
2961            if (s->snapshots[i].l1_size &&
2962                overlaps_with(s->snapshots[i].l1_table_offset,
2963                s->snapshots[i].l1_size * L1E_SIZE)) {
2964                return QCOW2_OL_INACTIVE_L1;
2965            }
2966        }
2967    }
2968
2969    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2970        for (i = 0; i < s->l1_size; i++) {
2971            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2972                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2973                s->cluster_size)) {
2974                return QCOW2_OL_ACTIVE_L2;
2975            }
2976        }
2977    }
2978
2979    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2980        unsigned last_entry = s->max_refcount_table_index;
2981        assert(last_entry < s->refcount_table_size);
2982        assert(last_entry + 1 == s->refcount_table_size ||
2983               (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
2984        for (i = 0; i <= last_entry; i++) {
2985            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2986                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2987                s->cluster_size)) {
2988                return QCOW2_OL_REFCOUNT_BLOCK;
2989            }
2990        }
2991    }
2992
2993    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2994        for (i = 0; i < s->nb_snapshots; i++) {
2995            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2996            uint32_t l1_sz  = s->snapshots[i].l1_size;
2997            uint64_t l1_sz2 = l1_sz * L1E_SIZE;
2998            uint64_t *l1;
2999            int ret;
3000
3001            ret = qcow2_validate_table(bs, l1_ofs, l1_sz, L1E_SIZE,
3002                                       QCOW_MAX_L1_SIZE, "", NULL);
3003            if (ret < 0) {
3004                return ret;
3005            }
3006
3007            l1 = g_try_malloc(l1_sz2);
3008
3009            if (l1_sz2 && l1 == NULL) {
3010                return -ENOMEM;
3011            }
3012
3013            ret = bdrv_pread(bs->file, l1_ofs, l1_sz2, l1, 0);
3014            if (ret < 0) {
3015                g_free(l1);
3016                return ret;
3017            }
3018
3019            for (j = 0; j < l1_sz; j++) {
3020                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
3021                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
3022                    g_free(l1);
3023                    return QCOW2_OL_INACTIVE_L2;
3024                }
3025            }
3026
3027            g_free(l1);
3028        }
3029    }
3030
3031    if ((chk & QCOW2_OL_BITMAP_DIRECTORY) &&
3032        (s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS))
3033    {
3034        if (overlaps_with(s->bitmap_directory_offset,
3035                          s->bitmap_directory_size))
3036        {
3037            return QCOW2_OL_BITMAP_DIRECTORY;
3038        }
3039    }
3040
3041    return 0;
3042}
3043
3044static const char *metadata_ol_names[] = {
3045    [QCOW2_OL_MAIN_HEADER_BITNR]        = "qcow2_header",
3046    [QCOW2_OL_ACTIVE_L1_BITNR]          = "active L1 table",
3047    [QCOW2_OL_ACTIVE_L2_BITNR]          = "active L2 table",
3048    [QCOW2_OL_REFCOUNT_TABLE_BITNR]     = "refcount table",
3049    [QCOW2_OL_REFCOUNT_BLOCK_BITNR]     = "refcount block",
3050    [QCOW2_OL_SNAPSHOT_TABLE_BITNR]     = "snapshot table",
3051    [QCOW2_OL_INACTIVE_L1_BITNR]        = "inactive L1 table",
3052    [QCOW2_OL_INACTIVE_L2_BITNR]        = "inactive L2 table",
3053    [QCOW2_OL_BITMAP_DIRECTORY_BITNR]   = "bitmap directory",
3054};
3055QEMU_BUILD_BUG_ON(QCOW2_OL_MAX_BITNR != ARRAY_SIZE(metadata_ol_names));
3056
3057/*
3058 * First performs a check for metadata overlaps (through
3059 * qcow2_check_metadata_overlap); if that fails with a negative value (error
3060 * while performing a check), that value is returned. If an impending overlap
3061 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt,
3062 * and -EIO returned.
3063 *
3064 * Returns 0 if there were neither overlaps nor errors while checking for
3065 * overlaps; or a negative value (-errno) on error.
3066 */
3067int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
3068                                  int64_t size, bool data_file)
3069{
3070    int ret;
3071
3072    if (data_file && has_data_file(bs)) {
3073        return 0;
3074    }
3075
3076    ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
3077    if (ret < 0) {
3078        return ret;
3079    } else if (ret > 0) {
3080        int metadata_ol_bitnr = ctz32(ret);
3081        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
3082
3083        qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
3084                                "write on metadata (overlaps with %s)",
3085                                metadata_ol_names[metadata_ol_bitnr]);
3086        return -EIO;
3087    }
3088
3089    return 0;
3090}
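/*
 * A minimal usage sketch (hypothetical caller; the pattern matches how
 * flush_refblock() below uses this function): run the overlap check before
 * every metadata write, and only write on success.
 *
 *     ret = qcow2_pre_write_overlap_check(bs, 0, offset, size, false);
 *     if (ret < 0) {
 *         return ret;
 *     }
 *     ret = bdrv_pwrite(bs->file, offset, size, buf, 0);
 */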
3091
3092/* A pointer to a function of this type is given to walk_over_reftable(). That
3093 * function will create refblocks and pass them to a RefblockFinishOp once they
3094 * are completed (@refblock). @refblock_empty is set if the refblock is
3095 * completely empty.
3096 *
3097 * Along with the refblock, a corresponding reftable entry is passed, located
3098 * in @reftable (which may be reallocated) at index @reftable_index.
3099 *
3100 * @allocated should be set to true if a new cluster has been allocated.
3101 */
3102typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
3103                               uint64_t reftable_index, uint64_t *reftable_size,
3104                               void *refblock, bool refblock_empty,
3105                               bool *allocated, Error **errp);
3106
3107/**
3108 * This "operation" for walk_over_reftable() allocates the refblock on disk (if
3109 * it is not empty) and inserts its offset into the new reftable. The size of
3110 * this new reftable is increased as required.
3111 */
3112static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
3113                          uint64_t reftable_index, uint64_t *reftable_size,
3114                          void *refblock, bool refblock_empty, bool *allocated,
3115                          Error **errp)
3116{
3117    BDRVQcow2State *s = bs->opaque;
3118    int64_t offset;
3119
3120    if (!refblock_empty && reftable_index >= *reftable_size) {
3121        uint64_t *new_reftable;
3122        uint64_t new_reftable_size;
3123
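        /* Grow the reftable in whole clusters: each reftable entry is 8 bytes
         * (REFTABLE_ENTRY_SIZE), so e.g. 64 kB clusters hold 8192 entries per
         * reftable cluster */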
3124        new_reftable_size = ROUND_UP(reftable_index + 1,
3125                                     s->cluster_size / REFTABLE_ENTRY_SIZE);
3126        if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / REFTABLE_ENTRY_SIZE) {
3127            error_setg(errp,
3128                       "This operation would make the refcount table grow "
3129                       "beyond the maximum size supported by QEMU, aborting");
3130            return -ENOTSUP;
3131        }
3132
3133        new_reftable = g_try_realloc(*reftable, new_reftable_size *
3134                                                REFTABLE_ENTRY_SIZE);
3135        if (!new_reftable) {
3136            error_setg(errp, "Failed to increase reftable buffer size");
3137            return -ENOMEM;
3138        }
3139
3140        memset(new_reftable + *reftable_size, 0,
3141               (new_reftable_size - *reftable_size) * REFTABLE_ENTRY_SIZE);
3142
3143        *reftable      = new_reftable;
3144        *reftable_size = new_reftable_size;
3145    }
3146
3147    if (!refblock_empty && !(*reftable)[reftable_index]) {
3148        offset = qcow2_alloc_clusters(bs, s->cluster_size);
3149        if (offset < 0) {
3150            error_setg_errno(errp, -offset, "Failed to allocate refblock");
3151            return offset;
3152        }
3153        (*reftable)[reftable_index] = offset;
3154        *allocated = true;
3155    }
3156
3157    return 0;
3158}
3159
3160/**
3161 * This "operation" for walk_over_reftable() writes the refblock to disk at the
3162 * offset specified by the new reftable's entry. It does not modify the new
3163 * reftable or change any refcounts.
3164 */
3165static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
3166                          uint64_t reftable_index, uint64_t *reftable_size,
3167                          void *refblock, bool refblock_empty, bool *allocated,
3168                          Error **errp)
3169{
3170    BDRVQcow2State *s = bs->opaque;
3171    int64_t offset;
3172    int ret;
3173
3174    if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
3175        offset = (*reftable)[reftable_index];
3176
3177        ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size,
3178                                            false);
3179        if (ret < 0) {
3180            error_setg_errno(errp, -ret, "Overlap check failed");
3181            return ret;
3182        }
3183
3184        ret = bdrv_pwrite(bs->file, offset, s->cluster_size, refblock, 0);
3185        if (ret < 0) {
3186            error_setg_errno(errp, -ret, "Failed to write refblock");
3187            return ret;
3188        }
3189    } else {
3190        assert(refblock_empty);
3191    }
3192
3193    return 0;
3194}
3195
3196/**
3197 * This function walks over the existing reftable and every referenced refblock;
3198 * if @new_set_refcount is non-NULL, it is called for every refcount entry to
3199 * create an equal new entry in the passed @new_refblock. Once that
3200 * @new_refblock is completely filled, @operation will be called.
3201 *
3202 * @status_cb and @cb_opaque are used for the amend operation's status callback.
3203 * @index is the index of the walk_over_reftable() calls and @total is the total
3204 * @index is the index of this walk_over_reftable() call and @total is the total
3205 * calculating the parameters for the status callback.
3206 *
3207 * @allocated is set to true if a new cluster has been allocated.
3208 */
3209static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
3210                              uint64_t *new_reftable_index,
3211                              uint64_t *new_reftable_size,
3212                              void *new_refblock, int new_refblock_size,
3213                              int new_refcount_bits,
3214                              RefblockFinishOp *operation, bool *allocated,
3215                              Qcow2SetRefcountFunc *new_set_refcount,
3216                              BlockDriverAmendStatusCB *status_cb,
3217                              void *cb_opaque, int index, int total,
3218                              Error **errp)
3219{
3220    BDRVQcow2State *s = bs->opaque;
3221    uint64_t reftable_index;
3222    bool new_refblock_empty = true;
3223    int refblock_index;
3224    int new_refblock_index = 0;
3225    int ret;
3226
3227    for (reftable_index = 0; reftable_index < s->refcount_table_size;
3228         reftable_index++)
3229    {
3230        uint64_t refblock_offset = s->refcount_table[reftable_index]
3231                                 & REFT_OFFSET_MASK;
3232
3233        status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
3234                  (uint64_t)total * s->refcount_table_size, cb_opaque);
3235
3236        if (refblock_offset) {
3237            void *refblock;
3238
3239            if (offset_into_cluster(s, refblock_offset)) {
3240                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
3241                                        PRIx64 " unaligned (reftable index: %#"
3242                                        PRIx64 ")", refblock_offset,
3243                                        reftable_index);
3244                error_setg(errp,
3245                           "Image is corrupt (unaligned refblock offset)");
3246                return -EIO;
3247            }
3248
3249            ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
3250                                  &refblock);
3251            if (ret < 0) {
3252                error_setg_errno(errp, -ret, "Failed to retrieve refblock");
3253                return ret;
3254            }
3255
3256            for (refblock_index = 0; refblock_index < s->refcount_block_size;
3257                 refblock_index++)
3258            {
3259                uint64_t refcount;
3260
3261                if (new_refblock_index >= new_refblock_size) {
3262                    /* new_refblock is now complete */
3263                    ret = operation(bs, new_reftable, *new_reftable_index,
3264                                    new_reftable_size, new_refblock,
3265                                    new_refblock_empty, allocated, errp);
3266                    if (ret < 0) {
3267                        qcow2_cache_put(s->refcount_block_cache, &refblock);
3268                        return ret;
3269                    }
3270
3271                    (*new_reftable_index)++;
3272                    new_refblock_index = 0;
3273                    new_refblock_empty = true;
3274                }
3275
3276                refcount = s->get_refcount(refblock, refblock_index);
3277                if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
3278                    uint64_t offset;
3279
3280                    qcow2_cache_put(s->refcount_block_cache, &refblock);
3281
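                    /* Each reftable entry covers s->refcount_block_size
                     * clusters, so this reconstructs the image offset of the
                     * cluster whose refcount entry this is */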
3282                    offset = ((reftable_index << s->refcount_block_bits)
3283                              + refblock_index) << s->cluster_bits;
3284
3285                    error_setg(errp, "Cannot decrease refcount entry width to "
3286                               "%i bits: Cluster at offset %#" PRIx64 " has a "
3287                               "refcount of %" PRIu64, new_refcount_bits,
3288                               offset, refcount);
3289                    return -EINVAL;
3290                }
3291
3292                if (new_set_refcount) {
3293                    new_set_refcount(new_refblock, new_refblock_index++,
3294                                     refcount);
3295                } else {
3296                    new_refblock_index++;
3297                }
3298                new_refblock_empty = new_refblock_empty && refcount == 0;
3299            }
3300
3301            qcow2_cache_put(s->refcount_block_cache, &refblock);
3302        } else {
3303            /* No refblock means every refcount is 0 */
3304            for (refblock_index = 0; refblock_index < s->refcount_block_size;
3305                 refblock_index++)
3306            {
3307                if (new_refblock_index >= new_refblock_size) {
3308                    /* new_refblock is now complete */
3309                    ret = operation(bs, new_reftable, *new_reftable_index,
3310                                    new_reftable_size, new_refblock,
3311                                    new_refblock_empty, allocated, errp);
3312                    if (ret < 0) {
3313                        return ret;
3314                    }
3315
3316                    (*new_reftable_index)++;
3317                    new_refblock_index = 0;
3318                    new_refblock_empty = true;
3319                }
3320
3321                if (new_set_refcount) {
3322                    new_set_refcount(new_refblock, new_refblock_index++, 0);
3323                } else {
3324                    new_refblock_index++;
3325                }
3326            }
3327        }
3328    }
3329
3330    if (new_refblock_index > 0) {
3331        /* Complete the partially filled final refblock, if there is one */
3332        if (new_set_refcount) {
3333            for (; new_refblock_index < new_refblock_size;
3334                 new_refblock_index++)
3335            {
3336                new_set_refcount(new_refblock, new_refblock_index, 0);
3337            }
3338        }
3339
3340        ret = operation(bs, new_reftable, *new_reftable_index,
3341                        new_reftable_size, new_refblock, new_refblock_empty,
3342                        allocated, errp);
3343        if (ret < 0) {
3344            return ret;
3345        }
3346
3347        (*new_reftable_index)++;
3348    }
3349
3350    status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
3351              (uint64_t)total * s->refcount_table_size, cb_opaque);
3352
3353    return 0;
3354}
3355
3356int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
3357                                BlockDriverAmendStatusCB *status_cb,
3358                                void *cb_opaque, Error **errp)
3359{
3360    BDRVQcow2State *s = bs->opaque;
3361    Qcow2GetRefcountFunc *new_get_refcount;
3362    Qcow2SetRefcountFunc *new_set_refcount;
3363    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
3364    uint64_t *new_reftable = NULL, new_reftable_size = 0;
3365    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
3366    uint64_t new_reftable_index = 0;
3367    uint64_t i;
3368    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
3369    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
3370    int old_refcount_order;
3371    int walk_index = 0;
3372    int ret;
3373    bool new_allocation;
3374
3375    assert(s->qcow_version >= 3);
3376    assert(refcount_order >= 0 && refcount_order <= 6);
3377
3378    /* see qcow2_open() */
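    /* One refblock is one cluster holding 2^refcount_order-bit entries, i.e.
     * 2^(cluster_bits + 3 - refcount_order) entries per refblock; e.g. 64 kB
     * clusters with refcount_order = 4 (16-bit refcounts) give 32768 entries */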
3379    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
3380
3381    new_get_refcount = get_refcount_funcs[refcount_order];
3382    new_set_refcount = set_refcount_funcs[refcount_order];
3383
3384
3385    do {
3386        int total_walks;
3387
3388        new_allocation = false;
3389
3390        /* We have to do at least this walk and the final one which writes
3391         * the refblocks; moreover, this loop normally has to run at least
3392         * twice: first to do the allocations, and a second time to verify
3393         * that everything has been allocated correctly. That makes three
3394         * walks in total */
3395        total_walks = MAX(walk_index + 2, 3);
3396
3397        /* First, allocate the structures so they are present in the refcount
3398         * structures */
3399        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3400                                 &new_reftable_size, NULL, new_refblock_size,
3401                                 new_refcount_bits, &alloc_refblock,
3402                                 &new_allocation, NULL, status_cb, cb_opaque,
3403                                 walk_index++, total_walks, errp);
3404        if (ret < 0) {
3405            goto done;
3406        }
3407
3408        new_reftable_index = 0;
3409
3410        if (new_allocation) {
3411            if (new_reftable_offset) {
3412                qcow2_free_clusters(
3413                    bs, new_reftable_offset,
3414                    allocated_reftable_size * REFTABLE_ENTRY_SIZE,
3415                    QCOW2_DISCARD_NEVER);
3416            }
3417
3418            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
3419                                                           REFTABLE_ENTRY_SIZE);
3420            if (new_reftable_offset < 0) {
3421                error_setg_errno(errp, -new_reftable_offset,
3422                                 "Failed to allocate the new reftable");
3423                ret = new_reftable_offset;
3424                goto done;
3425            }
3426            allocated_reftable_size = new_reftable_size;
3427        }
3428    } while (new_allocation);
3429
3430    /* Second, write the new refblocks */
3431    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3432                             &new_reftable_size, new_refblock,
3433                             new_refblock_size, new_refcount_bits,
3434                             &flush_refblock, &new_allocation, new_set_refcount,
3435                             status_cb, cb_opaque, walk_index, walk_index + 1,
3436                             errp);
3437    if (ret < 0) {
3438        goto done;
3439    }
3440    assert(!new_allocation);
3441
3442
3443    /* Write the new reftable */
3444    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
3445                                        new_reftable_size * REFTABLE_ENTRY_SIZE,
3446                                        false);
3447    if (ret < 0) {
3448        error_setg_errno(errp, -ret, "Overlap check failed");
3449        goto done;
3450    }
3451
3452    for (i = 0; i < new_reftable_size; i++) {
3453        cpu_to_be64s(&new_reftable[i]);
3454    }
3455
3456    ret = bdrv_pwrite(bs->file, new_reftable_offset,
3457                      new_reftable_size * REFTABLE_ENTRY_SIZE, new_reftable,
3458                      0);
3459
3460    for (i = 0; i < new_reftable_size; i++) {
3461        be64_to_cpus(&new_reftable[i]);
3462    }
3463
3464    if (ret < 0) {
3465        error_setg_errno(errp, -ret, "Failed to write the new reftable");
3466        goto done;
3467    }
3468
3469
3470    /* Flush the refcount block cache */
3471    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
3472    if (ret < 0) {
3473        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
3474        goto done;
3475    }
3476
3477    /* Update the image header to point to the new reftable; this only updates
3478     * the fields which are relevant to qcow2_update_header(); other fields
3479     * such as s->refcount_table or s->refcount_bits stay stale for now
3480     * (because we have to restore everything if qcow2_update_header() fails) */
3481    old_refcount_order  = s->refcount_order;
3482    old_reftable_size   = s->refcount_table_size;
3483    old_reftable_offset = s->refcount_table_offset;
3484
3485    s->refcount_order        = refcount_order;
3486    s->refcount_table_size   = new_reftable_size;
3487    s->refcount_table_offset = new_reftable_offset;
3488
3489    ret = qcow2_update_header(bs);
3490    if (ret < 0) {
3491        s->refcount_order        = old_refcount_order;
3492        s->refcount_table_size   = old_reftable_size;
3493        s->refcount_table_offset = old_reftable_offset;
3494        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
3495        goto done;
3496    }
3497
3498    /* Now update the rest of the in-memory information */
3499    old_reftable = s->refcount_table;
3500    s->refcount_table = new_reftable;
3501    update_max_refcount_table_index(s);
3502
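    /* The following computes s->refcount_max = 2^refcount_bits - 1 in two
     * steps so that the shift stays below 64 even for refcount_bits == 64;
     * e.g. for 16-bit refcounts: 0x8000 + (0x8000 - 1) == 0xffff */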
3503    s->refcount_bits = 1 << refcount_order;
3504    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
3505    s->refcount_max += s->refcount_max - 1;
3506
3507    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
3508    s->refcount_block_size = 1 << s->refcount_block_bits;
3509
3510    s->get_refcount = new_get_refcount;
3511    s->set_refcount = new_set_refcount;
3512
3513    /* Swap in the old reftable so that the cleanup below the "done" label
3514     * frees all old refblocks and the old reftable */
3515    new_reftable        = old_reftable;
3516    new_reftable_size   = old_reftable_size;
3517    new_reftable_offset = old_reftable_offset;
3518
3519done:
3520    if (new_reftable) {
3521        /* On success, new_reftable actually points to the old reftable (and
3522         * new_reftable_size is the old reftable's size); but that is just
3523         * fine */
3524        for (i = 0; i < new_reftable_size; i++) {
3525            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
3526            if (offset) {
3527                qcow2_free_clusters(bs, offset, s->cluster_size,
3528                                    QCOW2_DISCARD_OTHER);
3529            }
3530        }
3531        g_free(new_reftable);
3532
3533        if (new_reftable_offset > 0) {
3534            qcow2_free_clusters(bs, new_reftable_offset,
3535                                new_reftable_size * REFTABLE_ENTRY_SIZE,
3536                                QCOW2_DISCARD_OTHER);
3537        }
3538    }
3539
3540    qemu_vfree(new_refblock);
3541    return ret;
3542}
3543
3544static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
3545{
3546    BDRVQcow2State *s = bs->opaque;
3547    uint32_t index = offset_to_reftable_index(s, offset);
3548    int64_t covering_refblock_offset = 0;
3549
3550    if (index < s->refcount_table_size) {
3551        covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3552    }
3553    if (!covering_refblock_offset) {
3554        qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3555                                "not covered by the refcount structures",
3556                                offset);
3557        return -EIO;
3558    }
3559
3560    return covering_refblock_offset;
3561}
3562
3563static int coroutine_fn
3564qcow2_discard_refcount_block(BlockDriverState *bs, uint64_t discard_block_offs)
3565{
3566    BDRVQcow2State *s = bs->opaque;
3567    int64_t refblock_offs;
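    /* A refblock covers s->refcount_block_size consecutive clusters; the low
     * bits of the cluster index select the entry within the covering block */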
3568    uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
3569    uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
3570    void *refblock;
3571    int ret;
3572
3573    refblock_offs = get_refblock_offset(bs, discard_block_offs);
3574    if (refblock_offs < 0) {
3575        return refblock_offs;
3576    }
3577
3578    assert(discard_block_offs != 0);
3579
3580    ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3581                          &refblock);
3582    if (ret < 0) {
3583        return ret;
3584    }
3585
3586    if (s->get_refcount(refblock, block_index) != 1) {
3587        qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
3588                                " refblock offset %#" PRIx64
3589                                ", reftable index %u"
3590                                ", block offset %#" PRIx64
3591                                ", refcount %#" PRIx64,
3592                                refblock_offs,
3593                                offset_to_reftable_index(s, discard_block_offs),
3594                                discard_block_offs,
3595                                s->get_refcount(refblock, block_index));
3596        qcow2_cache_put(s->refcount_block_cache, &refblock);
3597        return -EINVAL;
3598    }
3599    s->set_refcount(refblock, block_index, 0);
3600
3601    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
3602
3603    qcow2_cache_put(s->refcount_block_cache, &refblock);
3604
3605    if (cluster_index < s->free_cluster_index) {
3606        s->free_cluster_index = cluster_index;
3607    }
3608
3609    refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
3610                                           discard_block_offs);
3611    if (refblock) {
3612        /* discard the refblock from the cache if it is cached */
3613        qcow2_cache_discard(s->refcount_block_cache, refblock);
3614    }
3615    update_refcount_discard(bs, discard_block_offs, s->cluster_size);
3616
3617    return 0;
3618}
3619
3620int coroutine_fn qcow2_shrink_reftable(BlockDriverState *bs)
3621{
3622    BDRVQcow2State *s = bs->opaque;
3623    uint64_t *reftable_tmp =
3624        g_malloc(s->refcount_table_size * REFTABLE_ENTRY_SIZE);
3625    int i, ret;
3626
3627    for (i = 0; i < s->refcount_table_size; i++) {
3628        int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
3629        void *refblock;
3630        bool unused_block;
3631
3632        if (refblock_offs == 0) {
3633            reftable_tmp[i] = 0;
3634            continue;
3635        }
3636        ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3637                              &refblock);
3638        if (ret < 0) {
3639            goto out;
3640        }
3641
3642        /* the refblock holds its own reference */
3643        if (i == offset_to_reftable_index(s, refblock_offs)) {
3644            uint64_t block_index = (refblock_offs >> s->cluster_bits) &
3645                                   (s->refcount_block_size - 1);
3646            uint64_t refcount = s->get_refcount(refblock, block_index);
3647
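            /* Temporarily clear the refblock's own refcount entry: if nothing
             * else is referenced, the block only tracks itself and can be
             * dropped */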
3648            s->set_refcount(refblock, block_index, 0);
3649
3650            unused_block = buffer_is_zero(refblock, s->cluster_size);
3651
3652            s->set_refcount(refblock, block_index, refcount);
3653        } else {
3654            unused_block = buffer_is_zero(refblock, s->cluster_size);
3655        }
3656        qcow2_cache_put(s->refcount_block_cache, &refblock);
3657
3658        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
3659    }
3660
3661    ret = bdrv_co_pwrite_sync(bs->file, s->refcount_table_offset,
3662                              s->refcount_table_size * REFTABLE_ENTRY_SIZE,
3663                              reftable_tmp, 0);
3664    /*
3665     * If the write to the reftable failed, the image may contain a partially
3666     * overwritten reftable. In this case, it is better to clear the in-memory
3667     * reftable as well, to avoid possible image corruption.
3668     */
3669    for (i = 0; i < s->refcount_table_size; i++) {
3670        if (s->refcount_table[i] && !reftable_tmp[i]) {
3671            if (ret == 0) {
3672                ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
3673                                                       REFT_OFFSET_MASK);
3674            }
3675            s->refcount_table[i] = 0;
3676        }
3677    }
3678
3679    if (!s->cache_discards) {
3680        qcow2_process_discards(bs, ret);
3681    }
3682
3683out:
3684    g_free(reftable_tmp);
3685    return ret;
3686}
3687
3688int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3689{
3690    BDRVQcow2State *s = bs->opaque;
3691    int64_t i;
3692
3693    for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3694        uint64_t refcount;
3695        int ret = qcow2_get_refcount(bs, i, &refcount);
3696        if (ret < 0) {
3697            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3698                    i, strerror(-ret));
3699            return ret;
3700        }
3701        if (refcount > 0) {
3702            return i;
3703        }
3704    }
3705    qcow2_signal_corruption(bs, true, -1, -1,
3706                            "There are no references in the refcount table.");
3707    return -EIO;
3708}
3709
3710int coroutine_fn qcow2_detect_metadata_preallocation(BlockDriverState *bs)
3711{
3712    BDRVQcow2State *s = bs->opaque;
3713    int64_t i, end_cluster, cluster_count = 0, threshold;
3714    int64_t file_length, real_allocation, real_clusters;
3715
3716    qemu_co_mutex_assert_locked(&s->lock);
3717
3718    file_length = bdrv_getlength(bs->file->bs);
3719    if (file_length < 0) {
3720        return file_length;
3721    }
3722
3723    real_allocation = bdrv_co_get_allocated_file_size(bs->file->bs);
3724    if (real_allocation < 0) {
3725        return real_allocation;
3726    }
3727
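    /* Treat the metadata as preallocated if the referenced clusters exceed the
     * actually allocated clusters by roughly 11 % (or by two clusters for very
     * small images); e.g. 900 allocated clusters yield a threshold of 1000 */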
3728    real_clusters = real_allocation / s->cluster_size;
3729    threshold = MAX(real_clusters * 10 / 9, real_clusters + 2);
3730
3731    end_cluster = size_to_clusters(s, file_length);
3732    for (i = 0; i < end_cluster && cluster_count < threshold; i++) {
3733        uint64_t refcount;
3734        int ret = qcow2_get_refcount(bs, i, &refcount);
3735        if (ret < 0) {
3736            return ret;
3737        }
3738        cluster_count += !!refcount;
3739    }
3740
3741    return cluster_count >= threshold;
3742}
3743