qemu/block/qcow2-refcount.c
<<
>>
Prefs
   1/*
   2 * Block driver for the QCOW version 2 format
   3 *
   4 * Copyright (c) 2004-2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qapi/error.h"
  27#include "qcow2.h"
  28#include "qemu/range.h"
  29#include "qemu/bswap.h"
  30#include "qemu/cutils.h"
  31#include "trace.h"
  32
  33static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
  34                                    uint64_t max);
  35static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
  36                            int64_t offset, int64_t length, uint64_t addend,
  37                            bool decrease, enum qcow2_discard_type type);
  38
  39static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
  40static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
  41static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
  42static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
  43static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
  44static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
  45static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
  46
  47static void set_refcount_ro0(void *refcount_array, uint64_t index,
  48                             uint64_t value);
  49static void set_refcount_ro1(void *refcount_array, uint64_t index,
  50                             uint64_t value);
  51static void set_refcount_ro2(void *refcount_array, uint64_t index,
  52                             uint64_t value);
  53static void set_refcount_ro3(void *refcount_array, uint64_t index,
  54                             uint64_t value);
  55static void set_refcount_ro4(void *refcount_array, uint64_t index,
  56                             uint64_t value);
  57static void set_refcount_ro5(void *refcount_array, uint64_t index,
  58                             uint64_t value);
  59static void set_refcount_ro6(void *refcount_array, uint64_t index,
  60                             uint64_t value);
  61
  62
  63static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
  64    &get_refcount_ro0,
  65    &get_refcount_ro1,
  66    &get_refcount_ro2,
  67    &get_refcount_ro3,
  68    &get_refcount_ro4,
  69    &get_refcount_ro5,
  70    &get_refcount_ro6
  71};
  72
  73static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
  74    &set_refcount_ro0,
  75    &set_refcount_ro1,
  76    &set_refcount_ro2,
  77    &set_refcount_ro3,
  78    &set_refcount_ro4,
  79    &set_refcount_ro5,
  80    &set_refcount_ro6
  81};
  82
  83
  84/*********************************************************/
  85/* refcount handling */
  86
  87static void update_max_refcount_table_index(BDRVQcow2State *s)
  88{
  89    unsigned i = s->refcount_table_size - 1;
  90    while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
  91        i--;
  92    }
  93    /* Set s->max_refcount_table_index to the index of the last used entry */
  94    s->max_refcount_table_index = i;
  95}
  96
  97int qcow2_refcount_init(BlockDriverState *bs)
  98{
  99    BDRVQcow2State *s = bs->opaque;
 100    unsigned int refcount_table_size2, i;
 101    int ret;
 102
 103    assert(s->refcount_order >= 0 && s->refcount_order <= 6);
 104
 105    s->get_refcount = get_refcount_funcs[s->refcount_order];
 106    s->set_refcount = set_refcount_funcs[s->refcount_order];
 107
 108    assert(s->refcount_table_size <= INT_MAX / REFTABLE_ENTRY_SIZE);
 109    refcount_table_size2 = s->refcount_table_size * REFTABLE_ENTRY_SIZE;
 110    s->refcount_table = g_try_malloc(refcount_table_size2);
 111
 112    if (s->refcount_table_size > 0) {
 113        if (s->refcount_table == NULL) {
 114            ret = -ENOMEM;
 115            goto fail;
 116        }
 117        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
 118        ret = bdrv_pread(bs->file, s->refcount_table_offset,
 119                         s->refcount_table, refcount_table_size2);
 120        if (ret < 0) {
 121            goto fail;
 122        }
 123        for(i = 0; i < s->refcount_table_size; i++)
 124            be64_to_cpus(&s->refcount_table[i]);
 125        update_max_refcount_table_index(s);
 126    }
 127    return 0;
 128 fail:
 129    return ret;
 130}
 131
 132void qcow2_refcount_close(BlockDriverState *bs)
 133{
 134    BDRVQcow2State *s = bs->opaque;
 135    g_free(s->refcount_table);
 136}
 137
 138
 139static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
 140{
 141    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
 142}
 143
 144static void set_refcount_ro0(void *refcount_array, uint64_t index,
 145                             uint64_t value)
 146{
 147    assert(!(value >> 1));
 148    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
 149    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
 150}
 151
 152static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
 153{
 154    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
 155           & 0x3;
 156}
 157
 158static void set_refcount_ro1(void *refcount_array, uint64_t index,
 159                             uint64_t value)
 160{
 161    assert(!(value >> 2));
 162    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
 163    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
 164}
 165
 166static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
 167{
 168    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
 169           & 0xf;
 170}
 171
 172static void set_refcount_ro2(void *refcount_array, uint64_t index,
 173                             uint64_t value)
 174{
 175    assert(!(value >> 4));
 176    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
 177    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
 178}
 179
 180static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
 181{
 182    return ((const uint8_t *)refcount_array)[index];
 183}
 184
 185static void set_refcount_ro3(void *refcount_array, uint64_t index,
 186                             uint64_t value)
 187{
 188    assert(!(value >> 8));
 189    ((uint8_t *)refcount_array)[index] = value;
 190}
 191
 192static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
 193{
 194    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
 195}
 196
 197static void set_refcount_ro4(void *refcount_array, uint64_t index,
 198                             uint64_t value)
 199{
 200    assert(!(value >> 16));
 201    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
 202}
 203
 204static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
 205{
 206    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
 207}
 208
 209static void set_refcount_ro5(void *refcount_array, uint64_t index,
 210                             uint64_t value)
 211{
 212    assert(!(value >> 32));
 213    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
 214}
 215
 216static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
 217{
 218    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
 219}
 220
 221static void set_refcount_ro6(void *refcount_array, uint64_t index,
 222                             uint64_t value)
 223{
 224    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
 225}
 226
 227
 228static int load_refcount_block(BlockDriverState *bs,
 229                               int64_t refcount_block_offset,
 230                               void **refcount_block)
 231{
 232    BDRVQcow2State *s = bs->opaque;
 233
 234    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
 235    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
 236                           refcount_block);
 237}
 238
 239/*
 240 * Retrieves the refcount of the cluster given by its index and stores it in
 241 * *refcount. Returns 0 on success and -errno on failure.
 242 */
 243int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
 244                       uint64_t *refcount)
 245{
 246    BDRVQcow2State *s = bs->opaque;
 247    uint64_t refcount_table_index, block_index;
 248    int64_t refcount_block_offset;
 249    int ret;
 250    void *refcount_block;
 251
 252    refcount_table_index = cluster_index >> s->refcount_block_bits;
 253    if (refcount_table_index >= s->refcount_table_size) {
 254        *refcount = 0;
 255        return 0;
 256    }
 257    refcount_block_offset =
 258        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
 259    if (!refcount_block_offset) {
 260        *refcount = 0;
 261        return 0;
 262    }
 263
 264    if (offset_into_cluster(s, refcount_block_offset)) {
 265        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
 266                                " unaligned (reftable index: %#" PRIx64 ")",
 267                                refcount_block_offset, refcount_table_index);
 268        return -EIO;
 269    }
 270
 271    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
 272                          &refcount_block);
 273    if (ret < 0) {
 274        return ret;
 275    }
 276
 277    block_index = cluster_index & (s->refcount_block_size - 1);
 278    *refcount = s->get_refcount(refcount_block, block_index);
 279
 280    qcow2_cache_put(s->refcount_block_cache, &refcount_block);
 281
 282    return 0;
 283}
 284
 285/* Checks if two offsets are described by the same refcount block */
 286static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
 287    uint64_t offset_b)
 288{
 289    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
 290    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
 291
 292    return (block_a == block_b);
 293}
 294
 295/*
 296 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 297 * (including growing the refcount table if needed).
 298 *
 299 * Returns 0 on success or -errno in error case
 300 */
 301static int alloc_refcount_block(BlockDriverState *bs,
 302                                int64_t cluster_index, void **refcount_block)
 303{
 304    BDRVQcow2State *s = bs->opaque;
 305    unsigned int refcount_table_index;
 306    int64_t ret;
 307
 308    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
 309
 310    /* Find the refcount block for the given cluster */
 311    refcount_table_index = cluster_index >> s->refcount_block_bits;
 312
 313    if (refcount_table_index < s->refcount_table_size) {
 314
 315        uint64_t refcount_block_offset =
 316            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
 317
 318        /* If it's already there, we're done */
 319        if (refcount_block_offset) {
 320            if (offset_into_cluster(s, refcount_block_offset)) {
 321                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
 322                                        PRIx64 " unaligned (reftable index: "
 323                                        "%#x)", refcount_block_offset,
 324                                        refcount_table_index);
 325                return -EIO;
 326            }
 327
 328             return load_refcount_block(bs, refcount_block_offset,
 329                                        refcount_block);
 330        }
 331    }
 332
 333    /*
 334     * If we came here, we need to allocate something. Something is at least
 335     * a cluster for the new refcount block. It may also include a new refcount
 336     * table if the old refcount table is too small.
 337     *
 338     * Note that allocating clusters here needs some special care:
 339     *
 340     * - We can't use the normal qcow2_alloc_clusters(), it would try to
 341     *   increase the refcount and very likely we would end up with an endless
 342     *   recursion. Instead we must place the refcount blocks in a way that
 343     *   they can describe them themselves.
 344     *
 345     * - We need to consider that at this point we are inside update_refcounts
 346     *   and potentially doing an initial refcount increase. This means that
 347     *   some clusters have already been allocated by the caller, but their
 348     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
 349     *   need to return -EAGAIN to signal the caller that it needs to restart
 350     *   the search for free clusters.
 351     *
 352     * - alloc_clusters_noref and qcow2_free_clusters may load a different
 353     *   refcount block into the cache
 354     */
 355
 356    *refcount_block = NULL;
 357
 358    /* We write to the refcount table, so we might depend on L2 tables */
 359    ret = qcow2_cache_flush(bs, s->l2_table_cache);
 360    if (ret < 0) {
 361        return ret;
 362    }
 363
 364    /* Allocate the refcount block itself and mark it as used */
 365    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX);
 366    if (new_block < 0) {
 367        return new_block;
 368    }
 369
 370    /* The offset must fit in the offset field of the refcount table entry */
 371    assert((new_block & REFT_OFFSET_MASK) == new_block);
 372
 373    /* If we're allocating the block at offset 0 then something is wrong */
 374    if (new_block == 0) {
 375        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
 376                                "allocation of refcount block at offset 0");
 377        return -EIO;
 378    }
 379
 380#ifdef DEBUG_ALLOC2
 381    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
 382        " at %" PRIx64 "\n",
 383        refcount_table_index, cluster_index << s->cluster_bits, new_block);
 384#endif
 385
 386    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
 387        /* Zero the new refcount block before updating it */
 388        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
 389                                    refcount_block);
 390        if (ret < 0) {
 391            goto fail;
 392        }
 393
 394        memset(*refcount_block, 0, s->cluster_size);
 395
 396        /* The block describes itself, need to update the cache */
 397        int block_index = (new_block >> s->cluster_bits) &
 398            (s->refcount_block_size - 1);
 399        s->set_refcount(*refcount_block, block_index, 1);
 400    } else {
 401        /* Described somewhere else. This can recurse at most twice before we
 402         * arrive at a block that describes itself. */
 403        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
 404                              QCOW2_DISCARD_NEVER);
 405        if (ret < 0) {
 406            goto fail;
 407        }
 408
 409        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 410        if (ret < 0) {
 411            goto fail;
 412        }
 413
 414        /* Initialize the new refcount block only after updating its refcount,
 415         * update_refcount uses the refcount cache itself */
 416        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
 417                                    refcount_block);
 418        if (ret < 0) {
 419            goto fail;
 420        }
 421
 422        memset(*refcount_block, 0, s->cluster_size);
 423    }
 424
 425    /* Now the new refcount block needs to be written to disk */
 426    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
 427    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
 428    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 429    if (ret < 0) {
 430        goto fail;
 431    }
 432
 433    /* If the refcount table is big enough, just hook the block up there */
 434    if (refcount_table_index < s->refcount_table_size) {
 435        uint64_t data64 = cpu_to_be64(new_block);
 436        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
 437        ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
 438                               refcount_table_index * REFTABLE_ENTRY_SIZE,
 439            &data64, sizeof(data64));
 440        if (ret < 0) {
 441            goto fail;
 442        }
 443
 444        s->refcount_table[refcount_table_index] = new_block;
 445        /* If there's a hole in s->refcount_table then it can happen
 446         * that refcount_table_index < s->max_refcount_table_index */
 447        s->max_refcount_table_index =
 448            MAX(s->max_refcount_table_index, refcount_table_index);
 449
 450        /* The new refcount block may be where the caller intended to put its
 451         * data, so let it restart the search. */
 452        return -EAGAIN;
 453    }
 454
 455    qcow2_cache_put(s->refcount_block_cache, refcount_block);
 456
 457    /*
 458     * If we come here, we need to grow the refcount table. Again, a new
 459     * refcount table needs some space and we can't simply allocate to avoid
 460     * endless recursion.
 461     *
 462     * Therefore let's grab new refcount blocks at the end of the image, which
 463     * will describe themselves and the new refcount table. This way we can
 464     * reference them only in the new table and do the switch to the new
 465     * refcount table at once without producing an inconsistent state in
 466     * between.
 467     */
 468    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
 469
 470    /* Calculate the number of refcount blocks needed so far; this will be the
 471     * basis for calculating the index of the first cluster used for the
 472     * self-describing refcount structures which we are about to create.
 473     *
 474     * Because we reached this point, there cannot be any refcount entries for
 475     * cluster_index or higher indices yet. However, because new_block has been
 476     * allocated to describe that cluster (and it will assume this role later
 477     * on), we cannot use that index; also, new_block may actually have a higher
 478     * cluster index than cluster_index, so it needs to be taken into account
 479     * here (and 1 needs to be added to its value because that cluster is used).
 480     */
 481    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
 482                                            (new_block >> s->cluster_bits) + 1),
 483                                        s->refcount_block_size);
 484
 485    /* Create the new refcount table and blocks */
 486    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
 487        s->cluster_size;
 488
 489    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
 490                              refcount_table_index, new_block);
 491    if (ret < 0) {
 492        return ret;
 493    }
 494
 495    ret = load_refcount_block(bs, new_block, refcount_block);
 496    if (ret < 0) {
 497        return ret;
 498    }
 499
 500    /* If we were trying to do the initial refcount update for some cluster
 501     * allocation, we might have used the same clusters to store newly
 502     * allocated metadata. Make the caller search some new space. */
 503    return -EAGAIN;
 504
 505fail:
 506    if (*refcount_block != NULL) {
 507        qcow2_cache_put(s->refcount_block_cache, refcount_block);
 508    }
 509    return ret;
 510}
 511
 512/*
 513 * Starting at @start_offset, this function creates new self-covering refcount
 514 * structures: A new refcount table and refcount blocks which cover all of
 515 * themselves, and a number of @additional_clusters beyond their end.
 516 * @start_offset must be at the end of the image file, that is, there must be
 517 * only empty space beyond it.
 518 * If @exact_size is false, the refcount table will have 50 % more entries than
 519 * necessary so it will not need to grow again soon.
 520 * If @new_refblock_offset is not zero, it contains the offset of a refcount
 521 * block that should be entered into the new refcount table at index
 522 * @new_refblock_index.
 523 *
 524 * Returns: The offset after the new refcount structures (i.e. where the
 525 *          @additional_clusters may be placed) on success, -errno on error.
 526 */
 527int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
 528                            uint64_t additional_clusters, bool exact_size,
 529                            int new_refblock_index,
 530                            uint64_t new_refblock_offset)
 531{
 532    BDRVQcow2State *s = bs->opaque;
 533    uint64_t total_refblock_count_u64, additional_refblock_count;
 534    int total_refblock_count, table_size, area_reftable_index, table_clusters;
 535    int i;
 536    uint64_t table_offset, block_offset, end_offset;
 537    int ret;
 538    uint64_t *new_table;
 539
 540    assert(!(start_offset % s->cluster_size));
 541
 542    qcow2_refcount_metadata_size(start_offset / s->cluster_size +
 543                                 additional_clusters,
 544                                 s->cluster_size, s->refcount_order,
 545                                 !exact_size, &total_refblock_count_u64);
 546    if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
 547        return -EFBIG;
 548    }
 549    total_refblock_count = total_refblock_count_u64;
 550
 551    /* Index in the refcount table of the first refcount block to cover the area
 552     * of refcount structures we are about to create; we know that
 553     * @total_refblock_count can cover @start_offset, so this will definitely
 554     * fit into an int. */
 555    area_reftable_index = (start_offset / s->cluster_size) /
 556                          s->refcount_block_size;
 557
 558    if (exact_size) {
 559        table_size = total_refblock_count;
 560    } else {
 561        table_size = total_refblock_count +
 562                     DIV_ROUND_UP(total_refblock_count, 2);
 563    }
 564    /* The qcow2 file can only store the reftable size in number of clusters */
 565    table_size = ROUND_UP(table_size, s->cluster_size / REFTABLE_ENTRY_SIZE);
 566    table_clusters = (table_size * REFTABLE_ENTRY_SIZE) / s->cluster_size;
 567
 568    if (table_size > QCOW_MAX_REFTABLE_SIZE) {
 569        return -EFBIG;
 570    }
 571
 572    new_table = g_try_new0(uint64_t, table_size);
 573
 574    assert(table_size > 0);
 575    if (new_table == NULL) {
 576        ret = -ENOMEM;
 577        goto fail;
 578    }
 579
 580    /* Fill the new refcount table */
 581    if (table_size > s->max_refcount_table_index) {
 582        /* We're actually growing the reftable */
 583        memcpy(new_table, s->refcount_table,
 584               (s->max_refcount_table_index + 1) * REFTABLE_ENTRY_SIZE);
 585    } else {
 586        /* Improbable case: We're shrinking the reftable. However, the caller
 587         * has assured us that there is only empty space beyond @start_offset,
 588         * so we can simply drop all of the refblocks that won't fit into the
 589         * new reftable. */
 590        memcpy(new_table, s->refcount_table, table_size * REFTABLE_ENTRY_SIZE);
 591    }
 592
 593    if (new_refblock_offset) {
 594        assert(new_refblock_index < total_refblock_count);
 595        new_table[new_refblock_index] = new_refblock_offset;
 596    }
 597
 598    /* Count how many new refblocks we have to create */
 599    additional_refblock_count = 0;
 600    for (i = area_reftable_index; i < total_refblock_count; i++) {
 601        if (!new_table[i]) {
 602            additional_refblock_count++;
 603        }
 604    }
 605
 606    table_offset = start_offset + additional_refblock_count * s->cluster_size;
 607    end_offset = table_offset + table_clusters * s->cluster_size;
 608
 609    /* Fill the refcount blocks, and create new ones, if necessary */
 610    block_offset = start_offset;
 611    for (i = area_reftable_index; i < total_refblock_count; i++) {
 612        void *refblock_data;
 613        uint64_t first_offset_covered;
 614
 615        /* Reuse an existing refblock if possible, create a new one otherwise */
 616        if (new_table[i]) {
 617            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
 618                                  &refblock_data);
 619            if (ret < 0) {
 620                goto fail;
 621            }
 622        } else {
 623            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
 624                                        block_offset, &refblock_data);
 625            if (ret < 0) {
 626                goto fail;
 627            }
 628            memset(refblock_data, 0, s->cluster_size);
 629            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
 630                                         refblock_data);
 631
 632            new_table[i] = block_offset;
 633            block_offset += s->cluster_size;
 634        }
 635
 636        /* First host offset covered by this refblock */
 637        first_offset_covered = (uint64_t)i * s->refcount_block_size *
 638                               s->cluster_size;
 639        if (first_offset_covered < end_offset) {
 640            int j, end_index;
 641
 642            /* Set the refcount of all of the new refcount structures to 1 */
 643
 644            if (first_offset_covered < start_offset) {
 645                assert(i == area_reftable_index);
 646                j = (start_offset - first_offset_covered) / s->cluster_size;
 647                assert(j < s->refcount_block_size);
 648            } else {
 649                j = 0;
 650            }
 651
 652            end_index = MIN((end_offset - first_offset_covered) /
 653                            s->cluster_size,
 654                            s->refcount_block_size);
 655
 656            for (; j < end_index; j++) {
 657                /* The caller guaranteed us this space would be empty */
 658                assert(s->get_refcount(refblock_data, j) == 0);
 659                s->set_refcount(refblock_data, j, 1);
 660            }
 661
 662            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
 663                                         refblock_data);
 664        }
 665
 666        qcow2_cache_put(s->refcount_block_cache, &refblock_data);
 667    }
 668
 669    assert(block_offset == table_offset);
 670
 671    /* Write refcount blocks to disk */
 672    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
 673    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 674    if (ret < 0) {
 675        goto fail;
 676    }
 677
 678    /* Write refcount table to disk */
 679    for (i = 0; i < total_refblock_count; i++) {
 680        cpu_to_be64s(&new_table[i]);
 681    }
 682
 683    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
 684    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
 685        table_size * REFTABLE_ENTRY_SIZE);
 686    if (ret < 0) {
 687        goto fail;
 688    }
 689
 690    for (i = 0; i < total_refblock_count; i++) {
 691        be64_to_cpus(&new_table[i]);
 692    }
 693
 694    /* Hook up the new refcount table in the qcow2 header */
 695    struct QEMU_PACKED {
 696        uint64_t d64;
 697        uint32_t d32;
 698    } data;
 699    data.d64 = cpu_to_be64(table_offset);
 700    data.d32 = cpu_to_be32(table_clusters);
 701    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
 702    ret = bdrv_pwrite_sync(bs->file,
 703                           offsetof(QCowHeader, refcount_table_offset),
 704                           &data, sizeof(data));
 705    if (ret < 0) {
 706        goto fail;
 707    }
 708
 709    /* And switch it in memory */
 710    uint64_t old_table_offset = s->refcount_table_offset;
 711    uint64_t old_table_size = s->refcount_table_size;
 712
 713    g_free(s->refcount_table);
 714    s->refcount_table = new_table;
 715    s->refcount_table_size = table_size;
 716    s->refcount_table_offset = table_offset;
 717    update_max_refcount_table_index(s);
 718
 719    /* Free old table. */
 720    qcow2_free_clusters(bs, old_table_offset,
 721                        old_table_size * REFTABLE_ENTRY_SIZE,
 722                        QCOW2_DISCARD_OTHER);
 723
 724    return end_offset;
 725
 726fail:
 727    g_free(new_table);
 728    return ret;
 729}
 730
 731void qcow2_process_discards(BlockDriverState *bs, int ret)
 732{
 733    BDRVQcow2State *s = bs->opaque;
 734    Qcow2DiscardRegion *d, *next;
 735
 736    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
 737        QTAILQ_REMOVE(&s->discards, d, next);
 738
 739        /* Discard is optional, ignore the return value */
 740        if (ret >= 0) {
 741            int r2 = bdrv_pdiscard(bs->file, d->offset, d->bytes);
 742            if (r2 < 0) {
 743                trace_qcow2_process_discards_failed_region(d->offset, d->bytes,
 744                                                           r2);
 745            }
 746        }
 747
 748        g_free(d);
 749    }
 750}
 751
 752static void update_refcount_discard(BlockDriverState *bs,
 753                                    uint64_t offset, uint64_t length)
 754{
 755    BDRVQcow2State *s = bs->opaque;
 756    Qcow2DiscardRegion *d, *p, *next;
 757
 758    QTAILQ_FOREACH(d, &s->discards, next) {
 759        uint64_t new_start = MIN(offset, d->offset);
 760        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
 761
 762        if (new_end - new_start <= length + d->bytes) {
 763            /* There can't be any overlap, areas ending up here have no
 764             * references any more and therefore shouldn't get freed another
 765             * time. */
 766            assert(d->bytes + length == new_end - new_start);
 767            d->offset = new_start;
 768            d->bytes = new_end - new_start;
 769            goto found;
 770        }
 771    }
 772
 773    d = g_malloc(sizeof(*d));
 774    *d = (Qcow2DiscardRegion) {
 775        .bs     = bs,
 776        .offset = offset,
 777        .bytes  = length,
 778    };
 779    QTAILQ_INSERT_TAIL(&s->discards, d, next);
 780
 781found:
 782    /* Merge discard requests if they are adjacent now */
 783    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
 784        if (p == d
 785            || p->offset > d->offset + d->bytes
 786            || d->offset > p->offset + p->bytes)
 787        {
 788            continue;
 789        }
 790
 791        /* Still no overlap possible */
 792        assert(p->offset == d->offset + d->bytes
 793            || d->offset == p->offset + p->bytes);
 794
 795        QTAILQ_REMOVE(&s->discards, p, next);
 796        d->offset = MIN(d->offset, p->offset);
 797        d->bytes += p->bytes;
 798        g_free(p);
 799    }
 800}
 801
 802/* XXX: cache several refcount block clusters ? */
 803/* @addend is the absolute value of the addend; if @decrease is set, @addend
 804 * will be subtracted from the current refcount, otherwise it will be added */
 805static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
 806                                                   int64_t offset,
 807                                                   int64_t length,
 808                                                   uint64_t addend,
 809                                                   bool decrease,
 810                                                   enum qcow2_discard_type type)
 811{
 812    BDRVQcow2State *s = bs->opaque;
 813    int64_t start, last, cluster_offset;
 814    void *refcount_block = NULL;
 815    int64_t old_table_index = -1;
 816    int ret;
 817
 818#ifdef DEBUG_ALLOC2
 819    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
 820            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
 821            addend);
 822#endif
 823    if (length < 0) {
 824        return -EINVAL;
 825    } else if (length == 0) {
 826        return 0;
 827    }
 828
 829    if (decrease) {
 830        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
 831            s->l2_table_cache);
 832    }
 833
 834    start = start_of_cluster(s, offset);
 835    last = start_of_cluster(s, offset + length - 1);
 836    for(cluster_offset = start; cluster_offset <= last;
 837        cluster_offset += s->cluster_size)
 838    {
 839        int block_index;
 840        uint64_t refcount;
 841        int64_t cluster_index = cluster_offset >> s->cluster_bits;
 842        int64_t table_index = cluster_index >> s->refcount_block_bits;
 843
 844        /* Load the refcount block and allocate it if needed */
 845        if (table_index != old_table_index) {
 846            if (refcount_block) {
 847                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
 848            }
 849            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
 850            /* If the caller needs to restart the search for free clusters,
 851             * try the same ones first to see if they're still free. */
 852            if (ret == -EAGAIN) {
 853                if (s->free_cluster_index > (start >> s->cluster_bits)) {
 854                    s->free_cluster_index = (start >> s->cluster_bits);
 855                }
 856            }
 857            if (ret < 0) {
 858                goto fail;
 859            }
 860        }
 861        old_table_index = table_index;
 862
 863        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
 864
 865        /* we can update the count and save it */
 866        block_index = cluster_index & (s->refcount_block_size - 1);
 867
 868        refcount = s->get_refcount(refcount_block, block_index);
 869        if (decrease ? (refcount - addend > refcount)
 870                     : (refcount + addend < refcount ||
 871                        refcount + addend > s->refcount_max))
 872        {
 873            ret = -EINVAL;
 874            goto fail;
 875        }
 876        if (decrease) {
 877            refcount -= addend;
 878        } else {
 879            refcount += addend;
 880        }
 881        if (refcount == 0 && cluster_index < s->free_cluster_index) {
 882            s->free_cluster_index = cluster_index;
 883        }
 884        s->set_refcount(refcount_block, block_index, refcount);
 885
 886        if (refcount == 0) {
 887            void *table;
 888
 889            table = qcow2_cache_is_table_offset(s->refcount_block_cache,
 890                                                offset);
 891            if (table != NULL) {
 892                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
 893                old_table_index = -1;
 894                qcow2_cache_discard(s->refcount_block_cache, table);
 895            }
 896
 897            table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
 898            if (table != NULL) {
 899                qcow2_cache_discard(s->l2_table_cache, table);
 900            }
 901
 902            if (s->discard_passthrough[type]) {
 903                update_refcount_discard(bs, cluster_offset, s->cluster_size);
 904            }
 905        }
 906    }
 907
 908    ret = 0;
 909fail:
 910    if (!s->cache_discards) {
 911        qcow2_process_discards(bs, ret);
 912    }
 913
 914    /* Write last changed block to disk */
 915    if (refcount_block) {
 916        qcow2_cache_put(s->refcount_block_cache, &refcount_block);
 917    }
 918
 919    /*
 920     * Try do undo any updates if an error is returned (This may succeed in
 921     * some cases like ENOSPC for allocating a new refcount block)
 922     */
 923    if (ret < 0) {
 924        int dummy;
 925        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
 926                                !decrease, QCOW2_DISCARD_NEVER);
 927        (void)dummy;
 928    }
 929
 930    return ret;
 931}
 932
 933/*
 934 * Increases or decreases the refcount of a given cluster.
 935 *
 936 * @addend is the absolute value of the addend; if @decrease is set, @addend
 937 * will be subtracted from the current refcount, otherwise it will be added.
 938 *
 939 * On success 0 is returned; on failure -errno is returned.
 940 */
 941int qcow2_update_cluster_refcount(BlockDriverState *bs,
 942                                  int64_t cluster_index,
 943                                  uint64_t addend, bool decrease,
 944                                  enum qcow2_discard_type type)
 945{
 946    BDRVQcow2State *s = bs->opaque;
 947    int ret;
 948
 949    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
 950                          decrease, type);
 951    if (ret < 0) {
 952        return ret;
 953    }
 954
 955    return 0;
 956}
 957
 958
 959
 960/*********************************************************/
 961/* cluster allocation functions */
 962
 963
 964
 965/* return < 0 if error */
 966static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
 967                                    uint64_t max)
 968{
 969    BDRVQcow2State *s = bs->opaque;
 970    uint64_t i, nb_clusters, refcount;
 971    int ret;
 972
 973    /* We can't allocate clusters if they may still be queued for discard. */
 974    if (s->cache_discards) {
 975        qcow2_process_discards(bs, 0);
 976    }
 977
 978    nb_clusters = size_to_clusters(s, size);
 979retry:
 980    for(i = 0; i < nb_clusters; i++) {
 981        uint64_t next_cluster_index = s->free_cluster_index++;
 982        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
 983
 984        if (ret < 0) {
 985            return ret;
 986        } else if (refcount != 0) {
 987            goto retry;
 988        }
 989    }
 990
 991    /* Make sure that all offsets in the "allocated" range are representable
 992     * in the requested max */
 993    if (s->free_cluster_index > 0 &&
 994        s->free_cluster_index - 1 > (max >> s->cluster_bits))
 995    {
 996        return -EFBIG;
 997    }
 998
 999#ifdef DEBUG_ALLOC2
1000    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
1001            size,
1002            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
1003#endif
1004    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
1005}
1006
1007int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
1008{
1009    int64_t offset;
1010    int ret;
1011
1012    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
1013    do {
1014        offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET);
1015        if (offset < 0) {
1016            return offset;
1017        }
1018
1019        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1020    } while (ret == -EAGAIN);
1021
1022    if (ret < 0) {
1023        return ret;
1024    }
1025
1026    return offset;
1027}
1028
1029int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
1030                                int64_t nb_clusters)
1031{
1032    BDRVQcow2State *s = bs->opaque;
1033    uint64_t cluster_index, refcount;
1034    uint64_t i;
1035    int ret;
1036
1037    assert(nb_clusters >= 0);
1038    if (nb_clusters == 0) {
1039        return 0;
1040    }
1041
1042    do {
1043        /* Check how many clusters there are free */
1044        cluster_index = offset >> s->cluster_bits;
1045        for(i = 0; i < nb_clusters; i++) {
1046            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
1047            if (ret < 0) {
1048                return ret;
1049            } else if (refcount != 0) {
1050                break;
1051            }
1052        }
1053
1054        /* And then allocate them */
1055        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
1056                              QCOW2_DISCARD_NEVER);
1057    } while (ret == -EAGAIN);
1058
1059    if (ret < 0) {
1060        return ret;
1061    }
1062
1063    return i;
1064}
1065
1066/* only used to allocate compressed sectors. We try to allocate
1067   contiguous sectors. size must be <= cluster_size */
1068int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
1069{
1070    BDRVQcow2State *s = bs->opaque;
1071    int64_t offset;
1072    size_t free_in_cluster;
1073    int ret;
1074
1075    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
1076    assert(size > 0 && size <= s->cluster_size);
1077    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));
1078
1079    offset = s->free_byte_offset;
1080
1081    if (offset) {
1082        uint64_t refcount;
1083        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
1084        if (ret < 0) {
1085            return ret;
1086        }
1087
1088        if (refcount == s->refcount_max) {
1089            offset = 0;
1090        }
1091    }
1092
1093    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
1094    do {
1095        if (!offset || free_in_cluster < size) {
1096            int64_t new_cluster;
1097
1098            new_cluster = alloc_clusters_noref(bs, s->cluster_size,
1099                                               MIN(s->cluster_offset_mask,
1100                                                   QCOW_MAX_CLUSTER_OFFSET));
1101            if (new_cluster < 0) {
1102                return new_cluster;
1103            }
1104
1105            if (new_cluster == 0) {
1106                qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
1107                                        "allocation of compressed cluster "
1108                                        "at offset 0");
1109                return -EIO;
1110            }
1111
1112            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
1113                offset = new_cluster;
1114                free_in_cluster = s->cluster_size;
1115            } else {
1116                free_in_cluster += s->cluster_size;
1117            }
1118        }
1119
1120        assert(offset);
1121        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1122        if (ret < 0) {
1123            offset = 0;
1124        }
1125    } while (ret == -EAGAIN);
1126    if (ret < 0) {
1127        return ret;
1128    }
1129
1130    /* The cluster refcount was incremented; refcount blocks must be flushed
1131     * before the caller's L2 table updates. */
1132    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
1133
1134    s->free_byte_offset = offset + size;
1135    if (!offset_into_cluster(s, s->free_byte_offset)) {
1136        s->free_byte_offset = 0;
1137    }
1138
1139    return offset;
1140}
1141
1142void qcow2_free_clusters(BlockDriverState *bs,
1143                          int64_t offset, int64_t size,
1144                          enum qcow2_discard_type type)
1145{
1146    int ret;
1147
1148    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
1149    ret = update_refcount(bs, offset, size, 1, true, type);
1150    if (ret < 0) {
1151        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
1152        /* TODO Remember the clusters to free them later and avoid leaking */
1153    }
1154}
1155
1156/*
1157 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1158 * normal cluster, compressed cluster, etc.)
1159 */
1160void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
1161                            enum qcow2_discard_type type)
1162{
1163    BDRVQcow2State *s = bs->opaque;
1164    QCow2ClusterType ctype = qcow2_get_cluster_type(bs, l2_entry);
1165
1166    if (has_data_file(bs)) {
1167        if (s->discard_passthrough[type] &&
1168            (ctype == QCOW2_CLUSTER_NORMAL ||
1169             ctype == QCOW2_CLUSTER_ZERO_ALLOC))
1170        {
1171            bdrv_pdiscard(s->data_file, l2_entry & L2E_OFFSET_MASK,
1172                          s->cluster_size);
1173        }
1174        return;
1175    }
1176
1177    switch (ctype) {
1178    case QCOW2_CLUSTER_COMPRESSED:
1179        {
1180            int64_t offset = (l2_entry & s->cluster_offset_mask)
1181                & QCOW2_COMPRESSED_SECTOR_MASK;
1182            int size = QCOW2_COMPRESSED_SECTOR_SIZE *
1183                (((l2_entry >> s->csize_shift) & s->csize_mask) + 1);
1184            qcow2_free_clusters(bs, offset, size, type);
1185        }
1186        break;
1187    case QCOW2_CLUSTER_NORMAL:
1188    case QCOW2_CLUSTER_ZERO_ALLOC:
1189        if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1190            qcow2_signal_corruption(bs, false, -1, -1,
1191                                    "Cannot free unaligned cluster %#llx",
1192                                    l2_entry & L2E_OFFSET_MASK);
1193        } else {
1194            qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1195                                s->cluster_size, type);
1196        }
1197        break;
1198    case QCOW2_CLUSTER_ZERO_PLAIN:
1199    case QCOW2_CLUSTER_UNALLOCATED:
1200        break;
1201    default:
1202        abort();
1203    }
1204}
1205
1206int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
1207{
1208    BDRVQcow2State *s = bs->opaque;
1209    int ret;
1210
1211    ret = qcow2_cache_write(bs, s->l2_table_cache);
1212    if (ret < 0) {
1213        return ret;
1214    }
1215
1216    if (qcow2_need_accurate_refcounts(s)) {
1217        ret = qcow2_cache_write(bs, s->refcount_block_cache);
1218        if (ret < 0) {
1219            return ret;
1220        }
1221    }
1222
1223    return 0;
1224}
1225
1226int coroutine_fn qcow2_flush_caches(BlockDriverState *bs)
1227{
1228    int ret = qcow2_write_caches(bs);
1229    if (ret < 0) {
1230        return ret;
1231    }
1232
1233    return bdrv_flush(bs->file->bs);
1234}
1235
1236/*********************************************************/
1237/* snapshots and image creation */
1238
1239
1240
1241/* update the refcounts of snapshots and the copied flag */
1242int qcow2_update_snapshot_refcount(BlockDriverState *bs,
1243    int64_t l1_table_offset, int l1_size, int addend)
1244{
1245    BDRVQcow2State *s = bs->opaque;
1246    uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
1247    bool l1_allocated = false;
1248    int64_t old_entry, old_l2_offset;
1249    unsigned slice, slice_size2, n_slices;
1250    int i, j, l1_modified = 0, nb_csectors;
1251    int ret;
1252
1253    assert(addend >= -1 && addend <= 1);
1254
1255    l2_slice = NULL;
1256    l1_table = NULL;
1257    l1_size2 = l1_size * L1E_SIZE;
1258    slice_size2 = s->l2_slice_size * l2_entry_size(s);
1259    n_slices = s->cluster_size / slice_size2;
1260
1261    s->cache_discards = true;
1262
1263    /* WARNING: qcow2_snapshot_goto relies on this function not using the
1264     * l1_table_offset when it is the current s->l1_table_offset! Be careful
1265     * when changing this! */
1266    if (l1_table_offset != s->l1_table_offset) {
1267        l1_table = g_try_malloc0(l1_size2);
1268        if (l1_size2 && l1_table == NULL) {
1269            ret = -ENOMEM;
1270            goto fail;
1271        }
1272        l1_allocated = true;
1273
1274        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1275        if (ret < 0) {
1276            goto fail;
1277        }
1278
1279        for (i = 0; i < l1_size; i++) {
1280            be64_to_cpus(&l1_table[i]);
1281        }
1282    } else {
1283        assert(l1_size == s->l1_size);
1284        l1_table = s->l1_table;
1285        l1_allocated = false;
1286    }
1287
1288    for (i = 0; i < l1_size; i++) {
1289        l2_offset = l1_table[i];
1290        if (l2_offset) {
1291            old_l2_offset = l2_offset;
1292            l2_offset &= L1E_OFFSET_MASK;
1293
1294            if (offset_into_cluster(s, l2_offset)) {
1295                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
1296                                        PRIx64 " unaligned (L1 index: %#x)",
1297                                        l2_offset, i);
1298                ret = -EIO;
1299                goto fail;
1300            }
1301
1302            for (slice = 0; slice < n_slices; slice++) {
1303                ret = qcow2_cache_get(bs, s->l2_table_cache,
1304                                      l2_offset + slice * slice_size2,
1305                                      (void **) &l2_slice);
1306                if (ret < 0) {
1307                    goto fail;
1308                }
1309
1310                for (j = 0; j < s->l2_slice_size; j++) {
1311                    uint64_t cluster_index;
1312                    uint64_t offset;
1313
1314                    entry = get_l2_entry(s, l2_slice, j);
1315                    old_entry = entry;
1316                    entry &= ~QCOW_OFLAG_COPIED;
1317                    offset = entry & L2E_OFFSET_MASK;
1318
1319                    switch (qcow2_get_cluster_type(bs, entry)) {
1320                    case QCOW2_CLUSTER_COMPRESSED:
1321                        nb_csectors = ((entry >> s->csize_shift) &
1322                                       s->csize_mask) + 1;
1323                        if (addend != 0) {
1324                            uint64_t coffset = (entry & s->cluster_offset_mask)
1325                                & QCOW2_COMPRESSED_SECTOR_MASK;
1326                            ret = update_refcount(
1327                                bs, coffset,
1328                                nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE,
1329                                abs(addend), addend < 0,
1330                                QCOW2_DISCARD_SNAPSHOT);
1331                            if (ret < 0) {
1332                                goto fail;
1333                            }
1334                        }
1335                        /* compressed clusters are never modified */
1336                        refcount = 2;
1337                        break;
1338
1339                    case QCOW2_CLUSTER_NORMAL:
1340                    case QCOW2_CLUSTER_ZERO_ALLOC:
1341                        if (offset_into_cluster(s, offset)) {
1342                            /* Here l2_index means table (not slice) index */
1343                            int l2_index = slice * s->l2_slice_size + j;
1344                            qcow2_signal_corruption(
1345                                bs, true, -1, -1, "Cluster "
1346                                "allocation offset %#" PRIx64
1347                                " unaligned (L2 offset: %#"
1348                                PRIx64 ", L2 index: %#x)",
1349                                offset, l2_offset, l2_index);
1350                            ret = -EIO;
1351                            goto fail;
1352                        }
1353
1354                        cluster_index = offset >> s->cluster_bits;
1355                        assert(cluster_index);
1356                        if (addend != 0) {
1357                            ret = qcow2_update_cluster_refcount(
1358                                bs, cluster_index, abs(addend), addend < 0,
1359                                QCOW2_DISCARD_SNAPSHOT);
1360                            if (ret < 0) {
1361                                goto fail;
1362                            }
1363                        }
1364
1365                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
1366                        if (ret < 0) {
1367                            goto fail;
1368                        }
1369                        break;
1370
1371                    case QCOW2_CLUSTER_ZERO_PLAIN:
1372                    case QCOW2_CLUSTER_UNALLOCATED:
1373                        refcount = 0;
1374                        break;
1375
1376                    default:
1377                        abort();
1378                    }
1379
1380                    if (refcount == 1) {
1381                        entry |= QCOW_OFLAG_COPIED;
1382                    }
1383                    if (entry != old_entry) {
1384                        if (addend > 0) {
1385                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
1386                                                       s->refcount_block_cache);
1387                        }
1388                        set_l2_entry(s, l2_slice, j, entry);
1389                        qcow2_cache_entry_mark_dirty(s->l2_table_cache,
1390                                                     l2_slice);
1391                    }
1392                }
1393
1394                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1395            }
1396
1397            if (addend != 0) {
1398                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
1399                                                        s->cluster_bits,
1400                                                    abs(addend), addend < 0,
1401                                                    QCOW2_DISCARD_SNAPSHOT);
1402                if (ret < 0) {
1403                    goto fail;
1404                }
1405            }
1406            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1407                                     &refcount);
1408            if (ret < 0) {
1409                goto fail;
1410            } else if (refcount == 1) {
1411                l2_offset |= QCOW_OFLAG_COPIED;
1412            }
1413            if (l2_offset != old_l2_offset) {
1414                l1_table[i] = l2_offset;
1415                l1_modified = 1;
1416            }
1417        }
1418    }
1419
1420    ret = bdrv_flush(bs);
1421fail:
1422    if (l2_slice) {
1423        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1424    }
1425
1426    s->cache_discards = false;
1427    qcow2_process_discards(bs, ret);
1428
1429    /* Update L1 only if it isn't deleted anyway (addend = -1) */
1430    if (ret == 0 && addend >= 0 && l1_modified) {
1431        for (i = 0; i < l1_size; i++) {
1432            cpu_to_be64s(&l1_table[i]);
1433        }
1434
1435        ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
1436                               l1_table, l1_size2);
1437
1438        for (i = 0; i < l1_size; i++) {
1439            be64_to_cpus(&l1_table[i]);
1440        }
1441    }
1442    if (l1_allocated)
1443        g_free(l1_table);
1444    return ret;
1445}
1446
1447
1448
1449
1450/*********************************************************/
1451/* refcount checking functions */
1452
1453
1454static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1455{
1456    /* This assertion holds because there is no way we can address more than
1457     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1458     * offsets have to be representable in bytes); due to every cluster
1459     * corresponding to one refcount entry, we are well below that limit */
1460    assert(entries < (UINT64_C(1) << (64 - 9)));
1461
1462    /* Thanks to the assertion this will not overflow, because
1463     * s->refcount_order < 7.
1464     * (note: x << s->refcount_order == x * s->refcount_bits) */
1465    return DIV_ROUND_UP(entries << s->refcount_order, 8);
1466}
1467
1468/**
1469 * Reallocates *array so that it can hold new_size entries. *size must contain
1470 * the current number of entries in *array. If the reallocation fails, *array
1471 * and *size will not be modified and -errno will be returned. If the
1472 * reallocation is successful, *array will be set to the new buffer, *size
1473 * will be set to new_size and 0 will be returned. The size of the reallocated
1474 * refcount array buffer will be aligned to a cluster boundary, and the newly
1475 * allocated area will be zeroed.
1476 */
1477static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1478                                  int64_t *size, int64_t new_size)
1479{
1480    int64_t old_byte_size, new_byte_size;
1481    void *new_ptr;
1482
1483    /* Round to clusters so the array can be directly written to disk */
1484    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1485                    * s->cluster_size;
1486    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1487                    * s->cluster_size;
1488
1489    if (new_byte_size == old_byte_size) {
1490        *size = new_size;
1491        return 0;
1492    }
1493
1494    assert(new_byte_size > 0);
1495
1496    if (new_byte_size > SIZE_MAX) {
1497        return -ENOMEM;
1498    }
1499
1500    new_ptr = g_try_realloc(*array, new_byte_size);
1501    if (!new_ptr) {
1502        return -ENOMEM;
1503    }
1504
1505    if (new_byte_size > old_byte_size) {
1506        memset((char *)new_ptr + old_byte_size, 0,
1507               new_byte_size - old_byte_size);
1508    }
1509
1510    *array = new_ptr;
1511    *size  = new_size;
1512
1513    return 0;
1514}
1515
1516/*
1517 * Increases the refcount for a range of clusters in a given refcount table.
1518 * This is used to construct a temporary refcount table out of L1 and L2 tables
1519 * which can be compared to the refcount table saved in the image.
1520 *
1521 * Modifies the number of errors in res.
1522 */
1523int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
1524                             void **refcount_table,
1525                             int64_t *refcount_table_size,
1526                             int64_t offset, int64_t size)
1527{
1528    BDRVQcow2State *s = bs->opaque;
1529    uint64_t start, last, cluster_offset, k, refcount;
1530    int64_t file_len;
1531    int ret;
1532
1533    if (size <= 0) {
1534        return 0;
1535    }
1536
1537    file_len = bdrv_getlength(bs->file->bs);
1538    if (file_len < 0) {
1539        return file_len;
1540    }
1541
1542    /*
1543     * Last cluster of qcow2 image may be semi-allocated, so it may be OK to
1544     * reference some space after file end but it should be less than one
1545     * cluster.
1546     */
1547    if (offset + size - file_len >= s->cluster_size) {
1548        fprintf(stderr, "ERROR: counting reference for region exceeding the "
1549                "end of the file by one cluster or more: offset 0x%" PRIx64
1550                " size 0x%" PRIx64 "\n", offset, size);
1551        res->corruptions++;
1552        return 0;
1553    }
1554
1555    start = start_of_cluster(s, offset);
1556    last = start_of_cluster(s, offset + size - 1);
1557    for(cluster_offset = start; cluster_offset <= last;
1558        cluster_offset += s->cluster_size) {
1559        k = cluster_offset >> s->cluster_bits;
1560        if (k >= *refcount_table_size) {
1561            ret = realloc_refcount_array(s, refcount_table,
1562                                         refcount_table_size, k + 1);
1563            if (ret < 0) {
1564                res->check_errors++;
1565                return ret;
1566            }
1567        }
1568
1569        refcount = s->get_refcount(*refcount_table, k);
1570        if (refcount == s->refcount_max) {
1571            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1572                    "\n", cluster_offset);
1573            fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1574                    "width or qemu-img convert to create a clean copy if the "
1575                    "image cannot be opened for writing\n");
1576            res->corruptions++;
1577            continue;
1578        }
1579        s->set_refcount(*refcount_table, k, refcount + 1);
1580    }
1581
1582    return 0;
1583}
1584
1585/* Flags for check_refcounts_l1() and check_refcounts_l2() */
1586enum {
1587    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1588};
1589
1590/*
1591 * Increases the refcount in the given refcount table for the all clusters
1592 * referenced in the L2 table. While doing so, performs some checks on L2
1593 * entries.
1594 *
1595 * Returns the number of errors found by the checks or -errno if an internal
1596 * error occurred.
1597 */
1598static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1599                              void **refcount_table,
1600                              int64_t *refcount_table_size, int64_t l2_offset,
1601                              int flags, BdrvCheckMode fix, bool active)
1602{
1603    BDRVQcow2State *s = bs->opaque;
1604    uint64_t *l2_table, l2_entry;
1605    uint64_t next_contiguous_offset = 0;
1606    int i, l2_size, nb_csectors, ret;
1607
1608    /* Read L2 table from disk */
1609    l2_size = s->l2_size * l2_entry_size(s);
1610    l2_table = g_malloc(l2_size);
1611
1612    ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
1613    if (ret < 0) {
1614        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1615        res->check_errors++;
1616        goto fail;
1617    }
1618
1619    /* Do the actual checks */
1620    for(i = 0; i < s->l2_size; i++) {
1621        l2_entry = get_l2_entry(s, l2_table, i);
1622
1623        switch (qcow2_get_cluster_type(bs, l2_entry)) {
1624        case QCOW2_CLUSTER_COMPRESSED:
1625            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1626            if (l2_entry & QCOW_OFLAG_COPIED) {
1627                fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
1628                    "copied flag must never be set for compressed "
1629                    "clusters\n", l2_entry & s->cluster_offset_mask);
1630                l2_entry &= ~QCOW_OFLAG_COPIED;
1631                res->corruptions++;
1632            }
1633
1634            if (has_data_file(bs)) {
1635                fprintf(stderr, "ERROR compressed cluster %d with data file, "
1636                        "entry=0x%" PRIx64 "\n", i, l2_entry);
1637                res->corruptions++;
1638                break;
1639            }
1640
1641            /* Mark cluster as used */
1642            nb_csectors = ((l2_entry >> s->csize_shift) &
1643                           s->csize_mask) + 1;
1644            l2_entry &= s->cluster_offset_mask;
1645            ret = qcow2_inc_refcounts_imrt(
1646                bs, res, refcount_table, refcount_table_size,
1647                l2_entry & QCOW2_COMPRESSED_SECTOR_MASK,
1648                nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE);
1649            if (ret < 0) {
1650                goto fail;
1651            }
1652
1653            if (flags & CHECK_FRAG_INFO) {
1654                res->bfi.allocated_clusters++;
1655                res->bfi.compressed_clusters++;
1656
1657                /* Compressed clusters are fragmented by nature.  Since they
1658                 * take up sub-sector space but we only have sector granularity
1659                 * I/O we need to re-read the same sectors even for adjacent
1660                 * compressed clusters.
1661                 */
1662                res->bfi.fragmented_clusters++;
1663            }
1664            break;
1665
1666        case QCOW2_CLUSTER_ZERO_ALLOC:
1667        case QCOW2_CLUSTER_NORMAL:
1668        {
1669            uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1670
1671            /* Correct offsets are cluster aligned */
1672            if (offset_into_cluster(s, offset)) {
1673                bool contains_data;
1674                res->corruptions++;
1675
1676                if (has_subclusters(s)) {
1677                    uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, i);
1678                    contains_data = (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC);
1679                } else {
1680                    contains_data = !(l2_entry & QCOW_OFLAG_ZERO);
1681                }
1682
1683                if (!contains_data) {
1684                    fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated "
1685                            "cluster is not properly aligned; L2 entry "
1686                            "corrupted.\n",
1687                            fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
1688                            offset);
1689                    if (fix & BDRV_FIX_ERRORS) {
1690                        int idx = i * (l2_entry_size(s) / sizeof(uint64_t));
1691                        uint64_t l2e_offset =
1692                            l2_offset + (uint64_t)i * l2_entry_size(s);
1693                        int ign = active ? QCOW2_OL_ACTIVE_L2 :
1694                                           QCOW2_OL_INACTIVE_L2;
1695
1696                        l2_entry = has_subclusters(s) ? 0 : QCOW_OFLAG_ZERO;
1697                        set_l2_entry(s, l2_table, i, l2_entry);
1698                        ret = qcow2_pre_write_overlap_check(bs, ign,
1699                                l2e_offset, l2_entry_size(s), false);
1700                        if (ret < 0) {
1701                            fprintf(stderr, "ERROR: Overlap check failed\n");
1702                            res->check_errors++;
1703                            /* Something is seriously wrong, so abort checking
1704                             * this L2 table */
1705                            goto fail;
1706                        }
1707
1708                        ret = bdrv_pwrite_sync(bs->file, l2e_offset,
1709                                               &l2_table[idx],
1710                                               l2_entry_size(s));
1711                        if (ret < 0) {
1712                            fprintf(stderr, "ERROR: Failed to overwrite L2 "
1713                                    "table entry: %s\n", strerror(-ret));
1714                            res->check_errors++;
1715                            /* Do not abort, continue checking the rest of this
1716                             * L2 table's entries */
1717                        } else {
1718                            res->corruptions--;
1719                            res->corruptions_fixed++;
1720                            /* Skip marking the cluster as used
1721                             * (it is unused now) */
1722                            continue;
1723                        }
1724                    }
1725                } else {
1726                    fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
1727                        "not properly aligned; L2 entry corrupted.\n", offset);
1728                }
1729            }
1730
1731            if (flags & CHECK_FRAG_INFO) {
1732                res->bfi.allocated_clusters++;
1733                if (next_contiguous_offset &&
1734                    offset != next_contiguous_offset) {
1735                    res->bfi.fragmented_clusters++;
1736                }
1737                next_contiguous_offset = offset + s->cluster_size;
1738            }
1739
1740            /* Mark cluster as used */
1741            if (!has_data_file(bs)) {
1742                ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table,
1743                                               refcount_table_size,
1744                                               offset, s->cluster_size);
1745                if (ret < 0) {
1746                    goto fail;
1747                }
1748            }
1749            break;
1750        }
1751
1752        case QCOW2_CLUSTER_ZERO_PLAIN:
1753        case QCOW2_CLUSTER_UNALLOCATED:
1754            break;
1755
1756        default:
1757            abort();
1758        }
1759    }
1760
1761    g_free(l2_table);
1762    return 0;
1763
1764fail:
1765    g_free(l2_table);
1766    return ret;
1767}
1768
1769/*
1770 * Increases the refcount for the L1 table, its L2 tables and all referenced
1771 * clusters in the given refcount table. While doing so, performs some checks
1772 * on L1 and L2 entries.
1773 *
1774 * Returns the number of errors found by the checks or -errno if an internal
1775 * error occurred.
1776 */
1777static int check_refcounts_l1(BlockDriverState *bs,
1778                              BdrvCheckResult *res,
1779                              void **refcount_table,
1780                              int64_t *refcount_table_size,
1781                              int64_t l1_table_offset, int l1_size,
1782                              int flags, BdrvCheckMode fix, bool active)
1783{
1784    BDRVQcow2State *s = bs->opaque;
1785    uint64_t *l1_table = NULL, l2_offset, l1_size2;
1786    int i, ret;
1787
1788    l1_size2 = l1_size * L1E_SIZE;
1789
1790    /* Mark L1 table as used */
1791    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1792                                   l1_table_offset, l1_size2);
1793    if (ret < 0) {
1794        goto fail;
1795    }
1796
1797    /* Read L1 table entries from disk */
1798    if (l1_size2 > 0) {
1799        l1_table = g_try_malloc(l1_size2);
1800        if (l1_table == NULL) {
1801            ret = -ENOMEM;
1802            res->check_errors++;
1803            goto fail;
1804        }
1805        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1806        if (ret < 0) {
1807            fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1808            res->check_errors++;
1809            goto fail;
1810        }
1811        for(i = 0;i < l1_size; i++)
1812            be64_to_cpus(&l1_table[i]);
1813    }
1814
1815    /* Do the actual checks */
1816    for(i = 0; i < l1_size; i++) {
1817        l2_offset = l1_table[i];
1818        if (l2_offset) {
1819            /* Mark L2 table as used */
1820            l2_offset &= L1E_OFFSET_MASK;
1821            ret = qcow2_inc_refcounts_imrt(bs, res,
1822                                           refcount_table, refcount_table_size,
1823                                           l2_offset, s->cluster_size);
1824            if (ret < 0) {
1825                goto fail;
1826            }
1827
1828            /* L2 tables are cluster aligned */
1829            if (offset_into_cluster(s, l2_offset)) {
1830                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1831                    "cluster aligned; L1 entry corrupted\n", l2_offset);
1832                res->corruptions++;
1833            }
1834
1835            /* Process and check L2 entries */
1836            ret = check_refcounts_l2(bs, res, refcount_table,
1837                                     refcount_table_size, l2_offset, flags,
1838                                     fix, active);
1839            if (ret < 0) {
1840                goto fail;
1841            }
1842        }
1843    }
1844    g_free(l1_table);
1845    return 0;
1846
1847fail:
1848    g_free(l1_table);
1849    return ret;
1850}
1851
1852/*
1853 * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1854 *
1855 * This function does not print an error message nor does it increment
1856 * check_errors if qcow2_get_refcount fails (this is because such an error will
1857 * have been already detected and sufficiently signaled by the calling function
1858 * (qcow2_check_refcounts) by the time this function is called).
1859 */
1860static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1861                              BdrvCheckMode fix)
1862{
1863    BDRVQcow2State *s = bs->opaque;
1864    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1865    int ret;
1866    uint64_t refcount;
1867    int i, j;
1868    bool repair;
1869
1870    if (fix & BDRV_FIX_ERRORS) {
1871        /* Always repair */
1872        repair = true;
1873    } else if (fix & BDRV_FIX_LEAKS) {
1874        /* Repair only if that seems safe: This function is always
1875         * called after the refcounts have been fixed, so the refcount
1876         * is accurate if that repair was successful */
1877        repair = !res->check_errors && !res->corruptions && !res->leaks;
1878    } else {
1879        repair = false;
1880    }
1881
1882    for (i = 0; i < s->l1_size; i++) {
1883        uint64_t l1_entry = s->l1_table[i];
1884        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1885        int l2_dirty = 0;
1886
1887        if (!l2_offset) {
1888            continue;
1889        }
1890
1891        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1892                                 &refcount);
1893        if (ret < 0) {
1894            /* don't print message nor increment check_errors */
1895            continue;
1896        }
1897        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1898            res->corruptions++;
1899            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1900                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1901                    repair ? "Repairing" : "ERROR", i, l1_entry, refcount);
1902            if (repair) {
1903                s->l1_table[i] = refcount == 1
1904                               ? l1_entry |  QCOW_OFLAG_COPIED
1905                               : l1_entry & ~QCOW_OFLAG_COPIED;
1906                ret = qcow2_write_l1_entry(bs, i);
1907                if (ret < 0) {
1908                    res->check_errors++;
1909                    goto fail;
1910                }
1911                res->corruptions--;
1912                res->corruptions_fixed++;
1913            }
1914        }
1915
1916        ret = bdrv_pread(bs->file, l2_offset, l2_table,
1917                         s->l2_size * l2_entry_size(s));
1918        if (ret < 0) {
1919            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1920                    strerror(-ret));
1921            res->check_errors++;
1922            goto fail;
1923        }
1924
1925        for (j = 0; j < s->l2_size; j++) {
1926            uint64_t l2_entry = get_l2_entry(s, l2_table, j);
1927            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1928            QCow2ClusterType cluster_type = qcow2_get_cluster_type(bs, l2_entry);
1929
1930            if (cluster_type == QCOW2_CLUSTER_NORMAL ||
1931                cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
1932                if (has_data_file(bs)) {
1933                    refcount = 1;
1934                } else {
1935                    ret = qcow2_get_refcount(bs,
1936                                             data_offset >> s->cluster_bits,
1937                                             &refcount);
1938                    if (ret < 0) {
1939                        /* don't print message nor increment check_errors */
1940                        continue;
1941                    }
1942                }
1943                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1944                    res->corruptions++;
1945                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1946                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1947                            repair ? "Repairing" : "ERROR", l2_entry, refcount);
1948                    if (repair) {
1949                        set_l2_entry(s, l2_table, j,
1950                                     refcount == 1 ?
1951                                     l2_entry |  QCOW_OFLAG_COPIED :
1952                                     l2_entry & ~QCOW_OFLAG_COPIED);
1953                        l2_dirty++;
1954                    }
1955                }
1956            }
1957        }
1958
1959        if (l2_dirty > 0) {
1960            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1961                                                l2_offset, s->cluster_size,
1962                                                false);
1963            if (ret < 0) {
1964                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1965                        "overlap check failed: %s\n", strerror(-ret));
1966                res->check_errors++;
1967                goto fail;
1968            }
1969
1970            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
1971                              s->cluster_size);
1972            if (ret < 0) {
1973                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1974                        strerror(-ret));
1975                res->check_errors++;
1976                goto fail;
1977            }
1978            res->corruptions -= l2_dirty;
1979            res->corruptions_fixed += l2_dirty;
1980        }
1981    }
1982
1983    ret = 0;
1984
1985fail:
1986    qemu_vfree(l2_table);
1987    return ret;
1988}
1989
1990/*
1991 * Checks consistency of refblocks and accounts for each refblock in
1992 * *refcount_table.
1993 */
1994static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
1995                           BdrvCheckMode fix, bool *rebuild,
1996                           void **refcount_table, int64_t *nb_clusters)
1997{
1998    BDRVQcow2State *s = bs->opaque;
1999    int64_t i, size;
2000    int ret;
2001
2002    for(i = 0; i < s->refcount_table_size; i++) {
2003        uint64_t offset, cluster;
2004        offset = s->refcount_table[i];
2005        cluster = offset >> s->cluster_bits;
2006
2007        /* Refcount blocks are cluster aligned */
2008        if (offset_into_cluster(s, offset)) {
2009            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
2010                "cluster aligned; refcount table entry corrupted\n", i);
2011            res->corruptions++;
2012            *rebuild = true;
2013            continue;
2014        }
2015
2016        if (cluster >= *nb_clusters) {
2017            res->corruptions++;
2018            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
2019                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
2020
2021            if (fix & BDRV_FIX_ERRORS) {
2022                int64_t new_nb_clusters;
2023                Error *local_err = NULL;
2024
2025                if (offset > INT64_MAX - s->cluster_size) {
2026                    ret = -EINVAL;
2027                    goto resize_fail;
2028                }
2029
2030                ret = bdrv_truncate(bs->file, offset + s->cluster_size, false,
2031                                    PREALLOC_MODE_OFF, 0, &local_err);
2032                if (ret < 0) {
2033                    error_report_err(local_err);
2034                    goto resize_fail;
2035                }
2036                size = bdrv_getlength(bs->file->bs);
2037                if (size < 0) {
2038                    ret = size;
2039                    goto resize_fail;
2040                }
2041
2042                new_nb_clusters = size_to_clusters(s, size);
2043                assert(new_nb_clusters >= *nb_clusters);
2044
2045                ret = realloc_refcount_array(s, refcount_table,
2046                                             nb_clusters, new_nb_clusters);
2047                if (ret < 0) {
2048                    res->check_errors++;
2049                    return ret;
2050                }
2051
2052                if (cluster >= *nb_clusters) {
2053                    ret = -EINVAL;
2054                    goto resize_fail;
2055                }
2056
2057                res->corruptions--;
2058                res->corruptions_fixed++;
2059                ret = qcow2_inc_refcounts_imrt(bs, res,
2060                                               refcount_table, nb_clusters,
2061                                               offset, s->cluster_size);
2062                if (ret < 0) {
2063                    return ret;
2064                }
2065                /* No need to check whether the refcount is now greater than 1:
2066                 * This area was just allocated and zeroed, so it can only be
2067                 * exactly 1 after qcow2_inc_refcounts_imrt() */
2068                continue;
2069
2070resize_fail:
2071                *rebuild = true;
2072                fprintf(stderr, "ERROR could not resize image: %s\n",
2073                        strerror(-ret));
2074            }
2075            continue;
2076        }
2077
2078        if (offset != 0) {
2079            ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2080                                           offset, s->cluster_size);
2081            if (ret < 0) {
2082                return ret;
2083            }
2084            if (s->get_refcount(*refcount_table, cluster) != 1) {
2085                fprintf(stderr, "ERROR refcount block %" PRId64
2086                        " refcount=%" PRIu64 "\n", i,
2087                        s->get_refcount(*refcount_table, cluster));
2088                res->corruptions++;
2089                *rebuild = true;
2090            }
2091        }
2092    }
2093
2094    return 0;
2095}
2096
2097/*
2098 * Calculates an in-memory refcount table.
2099 */
2100static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2101                               BdrvCheckMode fix, bool *rebuild,
2102                               void **refcount_table, int64_t *nb_clusters)
2103{
2104    BDRVQcow2State *s = bs->opaque;
2105    int64_t i;
2106    QCowSnapshot *sn;
2107    int ret;
2108
2109    if (!*refcount_table) {
2110        int64_t old_size = 0;
2111        ret = realloc_refcount_array(s, refcount_table,
2112                                     &old_size, *nb_clusters);
2113        if (ret < 0) {
2114            res->check_errors++;
2115            return ret;
2116        }
2117    }
2118
2119    /* header */
2120    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2121                                   0, s->cluster_size);
2122    if (ret < 0) {
2123        return ret;
2124    }
2125
2126    /* current L1 table */
2127    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2128                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
2129                             fix, true);
2130    if (ret < 0) {
2131        return ret;
2132    }
2133
2134    /* snapshots */
2135    if (has_data_file(bs) && s->nb_snapshots) {
2136        fprintf(stderr, "ERROR %d snapshots in image with data file\n",
2137                s->nb_snapshots);
2138        res->corruptions++;
2139    }
2140
2141    for (i = 0; i < s->nb_snapshots; i++) {
2142        sn = s->snapshots + i;
2143        if (offset_into_cluster(s, sn->l1_table_offset)) {
2144            fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": "
2145                    "L1 table is not cluster aligned; snapshot table entry "
2146                    "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset);
2147            res->corruptions++;
2148            continue;
2149        }
2150        if (sn->l1_size > QCOW_MAX_L1_SIZE / L1E_SIZE) {
2151            fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": "
2152                    "L1 table is too large; snapshot table entry corrupted\n",
2153                    sn->id_str, sn->name, sn->l1_size);
2154            res->corruptions++;
2155            continue;
2156        }
2157        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2158                                 sn->l1_table_offset, sn->l1_size, 0, fix,
2159                                 false);
2160        if (ret < 0) {
2161            return ret;
2162        }
2163    }
2164    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2165                                   s->snapshots_offset, s->snapshots_size);
2166    if (ret < 0) {
2167        return ret;
2168    }
2169
2170    /* refcount data */
2171    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2172                                   s->refcount_table_offset,
2173                                   s->refcount_table_size *
2174                                   REFTABLE_ENTRY_SIZE);
2175    if (ret < 0) {
2176        return ret;
2177    }
2178
2179    /* encryption */
2180    if (s->crypto_header.length) {
2181        ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2182                                       s->crypto_header.offset,
2183                                       s->crypto_header.length);
2184        if (ret < 0) {
2185            return ret;
2186        }
2187    }
2188
2189    /* bitmaps */
2190    ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
2191    if (ret < 0) {
2192        return ret;
2193    }
2194
2195    return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
2196}
2197
2198/*
2199 * Compares the actual reference count for each cluster in the image against the
2200 * refcount as reported by the refcount structures on-disk.
2201 */
2202static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2203                              BdrvCheckMode fix, bool *rebuild,
2204                              int64_t *highest_cluster,
2205                              void *refcount_table, int64_t nb_clusters)
2206{
2207    BDRVQcow2State *s = bs->opaque;
2208    int64_t i;
2209    uint64_t refcount1, refcount2;
2210    int ret;
2211
2212    for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2213        ret = qcow2_get_refcount(bs, i, &refcount1);
2214        if (ret < 0) {
2215            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2216                    i, strerror(-ret));
2217            res->check_errors++;
2218            continue;
2219        }
2220
2221        refcount2 = s->get_refcount(refcount_table, i);
2222
2223        if (refcount1 > 0 || refcount2 > 0) {
2224            *highest_cluster = i;
2225        }
2226
2227        if (refcount1 != refcount2) {
2228            /* Check if we're allowed to fix the mismatch */
2229            int *num_fixed = NULL;
2230            if (refcount1 == 0) {
2231                *rebuild = true;
2232            } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2233                num_fixed = &res->leaks_fixed;
2234            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2235                num_fixed = &res->corruptions_fixed;
2236            }
2237
2238            fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2239                    " reference=%" PRIu64 "\n",
2240                   num_fixed != NULL     ? "Repairing" :
2241                   refcount1 < refcount2 ? "ERROR" :
2242                                           "Leaked",
2243                   i, refcount1, refcount2);
2244
2245            if (num_fixed) {
2246                ret = update_refcount(bs, i << s->cluster_bits, 1,
2247                                      refcount_diff(refcount1, refcount2),
2248                                      refcount1 > refcount2,
2249                                      QCOW2_DISCARD_ALWAYS);
2250                if (ret >= 0) {
2251                    (*num_fixed)++;
2252                    continue;
2253                }
2254            }
2255
2256            /* And if we couldn't, print an error */
2257            if (refcount1 < refcount2) {
2258                res->corruptions++;
2259            } else {
2260                res->leaks++;
2261            }
2262        }
2263    }
2264}
2265
2266/*
2267 * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2268 * the on-disk refcount structures.
2269 *
2270 * On input, *first_free_cluster tells where to start looking, and need not
2271 * actually be a free cluster; the returned offset will not be before that
2272 * cluster.  On output, *first_free_cluster points to the first gap found, even
2273 * if that gap was too small to be used as the returned offset.
2274 *
2275 * Note that *first_free_cluster is a cluster index whereas the return value is
2276 * an offset.
2277 */
2278static int64_t alloc_clusters_imrt(BlockDriverState *bs,
2279                                   int cluster_count,
2280                                   void **refcount_table,
2281                                   int64_t *imrt_nb_clusters,
2282                                   int64_t *first_free_cluster)
2283{
2284    BDRVQcow2State *s = bs->opaque;
2285    int64_t cluster = *first_free_cluster, i;
2286    bool first_gap = true;
2287    int contiguous_free_clusters;
2288    int ret;
2289
2290    /* Starting at *first_free_cluster, find a range of at least cluster_count
2291     * continuously free clusters */
2292    for (contiguous_free_clusters = 0;
2293         cluster < *imrt_nb_clusters &&
2294         contiguous_free_clusters < cluster_count;
2295         cluster++)
2296    {
2297        if (!s->get_refcount(*refcount_table, cluster)) {
2298            contiguous_free_clusters++;
2299            if (first_gap) {
2300                /* If this is the first free cluster found, update
2301                 * *first_free_cluster accordingly */
2302                *first_free_cluster = cluster;
2303                first_gap = false;
2304            }
2305        } else if (contiguous_free_clusters) {
2306            contiguous_free_clusters = 0;
2307        }
2308    }
2309
2310    /* If contiguous_free_clusters is greater than zero, it contains the number
2311     * of continuously free clusters until the current cluster; the first free
2312     * cluster in the current "gap" is therefore
2313     * cluster - contiguous_free_clusters */
2314
2315    /* If no such range could be found, grow the in-memory refcount table
2316     * accordingly to append free clusters at the end of the image */
2317    if (contiguous_free_clusters < cluster_count) {
2318        /* contiguous_free_clusters clusters are already empty at the image end;
2319         * we need cluster_count clusters; therefore, we have to allocate
2320         * cluster_count - contiguous_free_clusters new clusters at the end of
2321         * the image (which is the current value of cluster; note that cluster
2322         * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
2323         * the image end) */
2324        ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
2325                                     cluster + cluster_count
2326                                     - contiguous_free_clusters);
2327        if (ret < 0) {
2328            return ret;
2329        }
2330    }
2331
2332    /* Go back to the first free cluster */
2333    cluster -= contiguous_free_clusters;
2334    for (i = 0; i < cluster_count; i++) {
2335        s->set_refcount(*refcount_table, cluster + i, 1);
2336    }
2337
2338    return cluster << s->cluster_bits;
2339}
2340
2341/*
2342 * Creates a new refcount structure based solely on the in-memory information
2343 * given through *refcount_table. All necessary allocations will be reflected
2344 * in that array.
2345 *
2346 * On success, the old refcount structure is leaked (it will be covered by the
2347 * new refcount structure).
2348 */
2349static int rebuild_refcount_structure(BlockDriverState *bs,
2350                                      BdrvCheckResult *res,
2351                                      void **refcount_table,
2352                                      int64_t *nb_clusters)
2353{
2354    BDRVQcow2State *s = bs->opaque;
2355    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
2356    int64_t refblock_offset, refblock_start, refblock_index;
2357    uint32_t reftable_size = 0;
2358    uint64_t *on_disk_reftable = NULL;
2359    void *on_disk_refblock;
2360    int ret = 0;
2361    struct {
2362        uint64_t reftable_offset;
2363        uint32_t reftable_clusters;
2364    } QEMU_PACKED reftable_offset_and_clusters;
2365
2366    qcow2_cache_empty(bs, s->refcount_block_cache);
2367
2368write_refblocks:
2369    for (; cluster < *nb_clusters; cluster++) {
2370        if (!s->get_refcount(*refcount_table, cluster)) {
2371            continue;
2372        }
2373
2374        refblock_index = cluster >> s->refcount_block_bits;
2375        refblock_start = refblock_index << s->refcount_block_bits;
2376
2377        /* Don't allocate a cluster in a refblock already written to disk */
2378        if (first_free_cluster < refblock_start) {
2379            first_free_cluster = refblock_start;
2380        }
2381        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2382                                              nb_clusters, &first_free_cluster);
2383        if (refblock_offset < 0) {
2384            fprintf(stderr, "ERROR allocating refblock: %s\n",
2385                    strerror(-refblock_offset));
2386            res->check_errors++;
2387            ret = refblock_offset;
2388            goto fail;
2389        }
2390
2391        if (reftable_size <= refblock_index) {
2392            uint32_t old_reftable_size = reftable_size;
2393            uint64_t *new_on_disk_reftable;
2394
2395            reftable_size = ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE,
2396                                     s->cluster_size) / REFTABLE_ENTRY_SIZE;
2397            new_on_disk_reftable = g_try_realloc(on_disk_reftable,
2398                                                 reftable_size *
2399                                                 REFTABLE_ENTRY_SIZE);
2400            if (!new_on_disk_reftable) {
2401                res->check_errors++;
2402                ret = -ENOMEM;
2403                goto fail;
2404            }
2405            on_disk_reftable = new_on_disk_reftable;
2406
2407            memset(on_disk_reftable + old_reftable_size, 0,
2408                   (reftable_size - old_reftable_size) * REFTABLE_ENTRY_SIZE);
2409
2410            /* The offset we have for the reftable is now no longer valid;
2411             * this will leak that range, but we can easily fix that by running
2412             * a leak-fixing check after this rebuild operation */
2413            reftable_offset = -1;
2414        } else {
2415            assert(on_disk_reftable);
2416        }
2417        on_disk_reftable[refblock_index] = refblock_offset;
2418
2419        /* If this is apparently the last refblock (for now), try to squeeze the
2420         * reftable in */
2421        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
2422            reftable_offset < 0)
2423        {
2424            uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
2425                                                          REFTABLE_ENTRY_SIZE);
2426            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2427                                                  refcount_table, nb_clusters,
2428                                                  &first_free_cluster);
2429            if (reftable_offset < 0) {
2430                fprintf(stderr, "ERROR allocating reftable: %s\n",
2431                        strerror(-reftable_offset));
2432                res->check_errors++;
2433                ret = reftable_offset;
2434                goto fail;
2435            }
2436        }
2437
2438        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2439                                            s->cluster_size, false);
2440        if (ret < 0) {
2441            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2442            goto fail;
2443        }
2444
2445        /* The size of *refcount_table is always cluster-aligned, therefore the
2446         * write operation will not overflow */
2447        on_disk_refblock = (void *)((char *) *refcount_table +
2448                                    refblock_index * s->cluster_size);
2449
2450        ret = bdrv_pwrite(bs->file, refblock_offset, on_disk_refblock,
2451                          s->cluster_size);
2452        if (ret < 0) {
2453            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2454            goto fail;
2455        }
2456
2457        /* Go to the end of this refblock */
2458        cluster = refblock_start + s->refcount_block_size - 1;
2459    }
2460
2461    if (reftable_offset < 0) {
2462        uint64_t post_refblock_start, reftable_clusters;
2463
2464        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
2465        reftable_clusters =
2466            size_to_clusters(s, reftable_size * REFTABLE_ENTRY_SIZE);
2467        /* Not pretty but simple */
2468        if (first_free_cluster < post_refblock_start) {
2469            first_free_cluster = post_refblock_start;
2470        }
2471        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2472                                              refcount_table, nb_clusters,
2473                                              &first_free_cluster);
2474        if (reftable_offset < 0) {
2475            fprintf(stderr, "ERROR allocating reftable: %s\n",
2476                    strerror(-reftable_offset));
2477            res->check_errors++;
2478            ret = reftable_offset;
2479            goto fail;
2480        }
2481
2482        goto write_refblocks;
2483    }
2484
2485    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2486        cpu_to_be64s(&on_disk_reftable[refblock_index]);
2487    }
2488
2489    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
2490                                        reftable_size * REFTABLE_ENTRY_SIZE,
2491                                        false);
2492    if (ret < 0) {
2493        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2494        goto fail;
2495    }
2496
2497    assert(reftable_size < INT_MAX / REFTABLE_ENTRY_SIZE);
2498    ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
2499                      reftable_size * REFTABLE_ENTRY_SIZE);
2500    if (ret < 0) {
2501        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2502        goto fail;
2503    }
2504
2505    /* Enter new reftable into the image header */
2506    reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
2507    reftable_offset_and_clusters.reftable_clusters =
2508        cpu_to_be32(size_to_clusters(s, reftable_size * REFTABLE_ENTRY_SIZE));
2509    ret = bdrv_pwrite_sync(bs->file,
2510                           offsetof(QCowHeader, refcount_table_offset),
2511                           &reftable_offset_and_clusters,
2512                           sizeof(reftable_offset_and_clusters));
2513    if (ret < 0) {
2514        fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
2515        goto fail;
2516    }
2517
2518    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2519        be64_to_cpus(&on_disk_reftable[refblock_index]);
2520    }
2521    s->refcount_table = on_disk_reftable;
2522    s->refcount_table_offset = reftable_offset;
2523    s->refcount_table_size = reftable_size;
2524    update_max_refcount_table_index(s);
2525
2526    return 0;
2527
2528fail:
2529    g_free(on_disk_reftable);
2530    return ret;
2531}
2532
2533/*
2534 * Checks an image for refcount consistency.
2535 *
2536 * Returns 0 if no errors are found, the number of errors in case the image is
2537 * detected as corrupted, and -errno when an internal error occurred.
2538 */
2539int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2540                          BdrvCheckMode fix)
2541{
2542    BDRVQcow2State *s = bs->opaque;
2543    BdrvCheckResult pre_compare_res;
2544    int64_t size, highest_cluster, nb_clusters;
2545    void *refcount_table = NULL;
2546    bool rebuild = false;
2547    int ret;
2548
2549    size = bdrv_getlength(bs->file->bs);
2550    if (size < 0) {
2551        res->check_errors++;
2552        return size;
2553    }
2554
2555    nb_clusters = size_to_clusters(s, size);
2556    if (nb_clusters > INT_MAX) {
2557        res->check_errors++;
2558        return -EFBIG;
2559    }
2560
2561    res->bfi.total_clusters =
2562        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2563
2564    ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2565                              &nb_clusters);
2566    if (ret < 0) {
2567        goto fail;
2568    }
2569
2570    /* In case we don't need to rebuild the refcount structure (but want to fix
2571     * something), this function is immediately called again, in which case the
2572     * result should be ignored */
2573    pre_compare_res = *res;
2574    compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2575                      nb_clusters);
2576
2577    if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2578        BdrvCheckResult old_res = *res;
2579        int fresh_leaks = 0;
2580
2581        fprintf(stderr, "Rebuilding refcount structure\n");
2582        ret = rebuild_refcount_structure(bs, res, &refcount_table,
2583                                         &nb_clusters);
2584        if (ret < 0) {
2585            goto fail;
2586        }
2587
2588        res->corruptions = 0;
2589        res->leaks = 0;
2590
2591        /* Because the old reftable has been exchanged for a new one the
2592         * references have to be recalculated */
2593        rebuild = false;
2594        memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2595        ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2596                                  &nb_clusters);
2597        if (ret < 0) {
2598            goto fail;
2599        }
2600
2601        if (fix & BDRV_FIX_LEAKS) {
2602            /* The old refcount structures are now leaked, fix it; the result
2603             * can be ignored, aside from leaks which were introduced by
2604             * rebuild_refcount_structure() that could not be fixed */
2605            BdrvCheckResult saved_res = *res;
2606            *res = (BdrvCheckResult){ 0 };
2607
2608            compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2609                              &highest_cluster, refcount_table, nb_clusters);
2610            if (rebuild) {
2611                fprintf(stderr, "ERROR rebuilt refcount structure is still "
2612                        "broken\n");
2613            }
2614
2615            /* Any leaks accounted for here were introduced by
2616             * rebuild_refcount_structure() because that function has created a
2617             * new refcount structure from scratch */
2618            fresh_leaks = res->leaks;
2619            *res = saved_res;
2620        }
2621
2622        if (res->corruptions < old_res.corruptions) {
2623            res->corruptions_fixed += old_res.corruptions - res->corruptions;
2624        }
2625        if (res->leaks < old_res.leaks) {
2626            res->leaks_fixed += old_res.leaks - res->leaks;
2627        }
2628        res->leaks += fresh_leaks;
2629    } else if (fix) {
2630        if (rebuild) {
2631            fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2632            res->check_errors++;
2633            ret = -EIO;
2634            goto fail;
2635        }
2636
2637        if (res->leaks || res->corruptions) {
2638            *res = pre_compare_res;
2639            compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2640                              refcount_table, nb_clusters);
2641        }
2642    }
2643
2644    /* check OFLAG_COPIED */
2645    ret = check_oflag_copied(bs, res, fix);
2646    if (ret < 0) {
2647        goto fail;
2648    }
2649
2650    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2651    ret = 0;
2652
2653fail:
2654    g_free(refcount_table);
2655
2656    return ret;
2657}
2658
2659#define overlaps_with(ofs, sz) \
2660    ranges_overlap(offset, size, ofs, sz)
2661
2662/*
2663 * Checks if the given offset into the image file is actually free to use by
2664 * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2665 * i.e. a sanity check without relying on the refcount tables.
2666 *
2667 * The ign parameter specifies what checks not to perform (being a bitmask of
2668 * QCow2MetadataOverlap values), i.e., what sections to ignore.
2669 *
2670 * Returns:
2671 * - 0 if writing to this offset will not affect the mentioned metadata
2672 * - a positive QCow2MetadataOverlap value indicating one overlapping section
2673 * - a negative value (-errno) indicating an error while performing a check,
2674 *   e.g. when bdrv_pread failed on QCOW2_OL_INACTIVE_L2
2675 */
2676int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2677                                 int64_t size)
2678{
2679    BDRVQcow2State *s = bs->opaque;
2680    int chk = s->overlap_check & ~ign;
2681    int i, j;
2682
2683    if (!size) {
2684        return 0;
2685    }
2686
2687    if (chk & QCOW2_OL_MAIN_HEADER) {
2688        if (offset < s->cluster_size) {
2689            return QCOW2_OL_MAIN_HEADER;
2690        }
2691    }
2692
2693    /* align range to test to cluster boundaries */
2694    size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
2695    offset = start_of_cluster(s, offset);
2696
2697    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2698        if (overlaps_with(s->l1_table_offset, s->l1_size * L1E_SIZE)) {
2699            return QCOW2_OL_ACTIVE_L1;
2700        }
2701    }
2702
2703    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2704        if (overlaps_with(s->refcount_table_offset,
2705            s->refcount_table_size * REFTABLE_ENTRY_SIZE)) {
2706            return QCOW2_OL_REFCOUNT_TABLE;
2707        }
2708    }
2709
2710    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2711        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2712            return QCOW2_OL_SNAPSHOT_TABLE;
2713        }
2714    }
2715
2716    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2717        for (i = 0; i < s->nb_snapshots; i++) {
2718            if (s->snapshots[i].l1_size &&
2719                overlaps_with(s->snapshots[i].l1_table_offset,
2720                s->snapshots[i].l1_size * L1E_SIZE)) {
2721                return QCOW2_OL_INACTIVE_L1;
2722            }
2723        }
2724    }
2725
2726    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2727        for (i = 0; i < s->l1_size; i++) {
2728            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2729                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2730                s->cluster_size)) {
2731                return QCOW2_OL_ACTIVE_L2;
2732            }
2733        }
2734    }
2735
2736    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2737        unsigned last_entry = s->max_refcount_table_index;
2738        assert(last_entry < s->refcount_table_size);
2739        assert(last_entry + 1 == s->refcount_table_size ||
2740               (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
2741        for (i = 0; i <= last_entry; i++) {
2742            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2743                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2744                s->cluster_size)) {
2745                return QCOW2_OL_REFCOUNT_BLOCK;
2746            }
2747        }
2748    }
2749
2750    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2751        for (i = 0; i < s->nb_snapshots; i++) {
2752            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2753            uint32_t l1_sz  = s->snapshots[i].l1_size;
2754            uint64_t l1_sz2 = l1_sz * L1E_SIZE;
2755            uint64_t *l1;
2756            int ret;
2757
2758            ret = qcow2_validate_table(bs, l1_ofs, l1_sz, L1E_SIZE,
2759                                       QCOW_MAX_L1_SIZE, "", NULL);
2760            if (ret < 0) {
2761                return ret;
2762            }
2763
2764            l1 = g_try_malloc(l1_sz2);
2765
2766            if (l1_sz2 && l1 == NULL) {
2767                return -ENOMEM;
2768            }
2769
2770            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
2771            if (ret < 0) {
2772                g_free(l1);
2773                return ret;
2774            }
2775
2776            for (j = 0; j < l1_sz; j++) {
2777                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2778                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2779                    g_free(l1);
2780                    return QCOW2_OL_INACTIVE_L2;
2781                }
2782            }
2783
2784            g_free(l1);
2785        }
2786    }
2787
2788    if ((chk & QCOW2_OL_BITMAP_DIRECTORY) &&
2789        (s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS))
2790    {
2791        if (overlaps_with(s->bitmap_directory_offset,
2792                          s->bitmap_directory_size))
2793        {
2794            return QCOW2_OL_BITMAP_DIRECTORY;
2795        }
2796    }
2797
2798    return 0;
2799}
2800
2801static const char *metadata_ol_names[] = {
2802    [QCOW2_OL_MAIN_HEADER_BITNR]        = "qcow2_header",
2803    [QCOW2_OL_ACTIVE_L1_BITNR]          = "active L1 table",
2804    [QCOW2_OL_ACTIVE_L2_BITNR]          = "active L2 table",
2805    [QCOW2_OL_REFCOUNT_TABLE_BITNR]     = "refcount table",
2806    [QCOW2_OL_REFCOUNT_BLOCK_BITNR]     = "refcount block",
2807    [QCOW2_OL_SNAPSHOT_TABLE_BITNR]     = "snapshot table",
2808    [QCOW2_OL_INACTIVE_L1_BITNR]        = "inactive L1 table",
2809    [QCOW2_OL_INACTIVE_L2_BITNR]        = "inactive L2 table",
2810    [QCOW2_OL_BITMAP_DIRECTORY_BITNR]   = "bitmap directory",
2811};
2812QEMU_BUILD_BUG_ON(QCOW2_OL_MAX_BITNR != ARRAY_SIZE(metadata_ol_names));
2813
2814/*
2815 * First performs a check for metadata overlaps (through
2816 * qcow2_check_metadata_overlap); if that fails with a negative value (error
2817 * while performing a check), that value is returned. If an impending overlap
2818 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2819 * and -EIO returned.
2820 *
2821 * Returns 0 if there were neither overlaps nor errors while checking for
2822 * overlaps; or a negative value (-errno) on error.
2823 */
2824int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2825                                  int64_t size, bool data_file)
2826{
2827    int ret;
2828
2829    if (data_file && has_data_file(bs)) {
2830        return 0;
2831    }
2832
2833    ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2834    if (ret < 0) {
2835        return ret;
2836    } else if (ret > 0) {
2837        int metadata_ol_bitnr = ctz32(ret);
2838        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2839
2840        qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2841                                "write on metadata (overlaps with %s)",
2842                                metadata_ol_names[metadata_ol_bitnr]);
2843        return -EIO;
2844    }
2845
2846    return 0;
2847}
2848
2849/* A pointer to a function of this type is given to walk_over_reftable(). That
2850 * function will create refblocks and pass them to a RefblockFinishOp once they
2851 * are completed (@refblock). @refblock_empty is set if the refblock is
2852 * completely empty.
2853 *
2854 * Along with the refblock, a corresponding reftable entry is passed, in the
2855 * reftable @reftable (which may be reallocated) at @reftable_index.
2856 *
2857 * @allocated should be set to true if a new cluster has been allocated.
2858 */
2859typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
2860                               uint64_t reftable_index, uint64_t *reftable_size,
2861                               void *refblock, bool refblock_empty,
2862                               bool *allocated, Error **errp);
2863
2864/**
2865 * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2866 * it is not empty) and inserts its offset into the new reftable. The size of
2867 * this new reftable is increased as required.
2868 */
2869static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2870                          uint64_t reftable_index, uint64_t *reftable_size,
2871                          void *refblock, bool refblock_empty, bool *allocated,
2872                          Error **errp)
2873{
2874    BDRVQcow2State *s = bs->opaque;
2875    int64_t offset;
2876
2877    if (!refblock_empty && reftable_index >= *reftable_size) {
2878        uint64_t *new_reftable;
2879        uint64_t new_reftable_size;
2880
2881        new_reftable_size = ROUND_UP(reftable_index + 1,
2882                                     s->cluster_size / REFTABLE_ENTRY_SIZE);
2883        if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / REFTABLE_ENTRY_SIZE) {
2884            error_setg(errp,
2885                       "This operation would make the refcount table grow "
2886                       "beyond the maximum size supported by QEMU, aborting");
2887            return -ENOTSUP;
2888        }
2889
2890        new_reftable = g_try_realloc(*reftable, new_reftable_size *
2891                                                REFTABLE_ENTRY_SIZE);
2892        if (!new_reftable) {
2893            error_setg(errp, "Failed to increase reftable buffer size");
2894            return -ENOMEM;
2895        }
2896
2897        memset(new_reftable + *reftable_size, 0,
2898               (new_reftable_size - *reftable_size) * REFTABLE_ENTRY_SIZE);
2899
2900        *reftable      = new_reftable;
2901        *reftable_size = new_reftable_size;
2902    }
2903
2904    if (!refblock_empty && !(*reftable)[reftable_index]) {
2905        offset = qcow2_alloc_clusters(bs, s->cluster_size);
2906        if (offset < 0) {
2907            error_setg_errno(errp, -offset, "Failed to allocate refblock");
2908            return offset;
2909        }
2910        (*reftable)[reftable_index] = offset;
2911        *allocated = true;
2912    }
2913
2914    return 0;
2915}
2916
2917/**
2918 * This "operation" for walk_over_reftable() writes the refblock to disk at the
2919 * offset specified by the new reftable's entry. It does not modify the new
2920 * reftable or change any refcounts.
2921 */
2922static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2923                          uint64_t reftable_index, uint64_t *reftable_size,
2924                          void *refblock, bool refblock_empty, bool *allocated,
2925                          Error **errp)
2926{
2927    BDRVQcow2State *s = bs->opaque;
2928    int64_t offset;
2929    int ret;
2930
2931    if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2932        offset = (*reftable)[reftable_index];
2933
2934        ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size,
2935                                            false);
2936        if (ret < 0) {
2937            error_setg_errno(errp, -ret, "Overlap check failed");
2938            return ret;
2939        }
2940
2941        ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
2942        if (ret < 0) {
2943            error_setg_errno(errp, -ret, "Failed to write refblock");
2944            return ret;
2945        }
2946    } else {
2947        assert(refblock_empty);
2948    }
2949
2950    return 0;
2951}
2952
2953/**
2954 * This function walks over the existing reftable and every referenced refblock;
2955 * if @new_set_refcount is non-NULL, it is called for every refcount entry to
2956 * create an equal new entry in the passed @new_refblock. Once that
2957 * @new_refblock is completely filled, @operation will be called.
2958 *
2959 * @status_cb and @cb_opaque are used for the amend operation's status callback.
2960 * @index is the index of the walk_over_reftable() calls and @total is the total
2961 * number of walk_over_reftable() calls per amend operation. Both are used for
2962 * calculating the parameters for the status callback.
2963 *
2964 * @allocated is set to true if a new cluster has been allocated.
2965 */
2966static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
2967                              uint64_t *new_reftable_index,
2968                              uint64_t *new_reftable_size,
2969                              void *new_refblock, int new_refblock_size,
2970                              int new_refcount_bits,
2971                              RefblockFinishOp *operation, bool *allocated,
2972                              Qcow2SetRefcountFunc *new_set_refcount,
2973                              BlockDriverAmendStatusCB *status_cb,
2974                              void *cb_opaque, int index, int total,
2975                              Error **errp)
2976{
2977    BDRVQcow2State *s = bs->opaque;
2978    uint64_t reftable_index;
2979    bool new_refblock_empty = true;
2980    int refblock_index;
2981    int new_refblock_index = 0;
2982    int ret;
2983
2984    for (reftable_index = 0; reftable_index < s->refcount_table_size;
2985         reftable_index++)
2986    {
2987        uint64_t refblock_offset = s->refcount_table[reftable_index]
2988                                 & REFT_OFFSET_MASK;
2989
2990        status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
2991                  (uint64_t)total * s->refcount_table_size, cb_opaque);
2992
2993        if (refblock_offset) {
2994            void *refblock;
2995
2996            if (offset_into_cluster(s, refblock_offset)) {
2997                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
2998                                        PRIx64 " unaligned (reftable index: %#"
2999                                        PRIx64 ")", refblock_offset,
3000                                        reftable_index);
3001                error_setg(errp,
3002                           "Image is corrupt (unaligned refblock offset)");
3003                return -EIO;
3004            }
3005
3006            ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
3007                                  &refblock);
3008            if (ret < 0) {
3009                error_setg_errno(errp, -ret, "Failed to retrieve refblock");
3010                return ret;
3011            }
3012
3013            for (refblock_index = 0; refblock_index < s->refcount_block_size;
3014                 refblock_index++)
3015            {
3016                uint64_t refcount;
3017
3018                if (new_refblock_index >= new_refblock_size) {
3019                    /* new_refblock is now complete */
3020                    ret = operation(bs, new_reftable, *new_reftable_index,
3021                                    new_reftable_size, new_refblock,
3022                                    new_refblock_empty, allocated, errp);
3023                    if (ret < 0) {
3024                        qcow2_cache_put(s->refcount_block_cache, &refblock);
3025                        return ret;
3026                    }
3027
3028                    (*new_reftable_index)++;
3029                    new_refblock_index = 0;
3030                    new_refblock_empty = true;
3031                }
3032
3033                refcount = s->get_refcount(refblock, refblock_index);
3034                if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
3035                    uint64_t offset;
3036
3037                    qcow2_cache_put(s->refcount_block_cache, &refblock);
3038
3039                    offset = ((reftable_index << s->refcount_block_bits)
3040                              + refblock_index) << s->cluster_bits;
3041
3042                    error_setg(errp, "Cannot decrease refcount entry width to "
3043                               "%i bits: Cluster at offset %#" PRIx64 " has a "
3044                               "refcount of %" PRIu64, new_refcount_bits,
3045                               offset, refcount);
3046                    return -EINVAL;
3047                }
3048
3049                if (new_set_refcount) {
3050                    new_set_refcount(new_refblock, new_refblock_index++,
3051                                     refcount);
3052                } else {
3053                    new_refblock_index++;
3054                }
3055                new_refblock_empty = new_refblock_empty && refcount == 0;
3056            }
3057
3058            qcow2_cache_put(s->refcount_block_cache, &refblock);
3059        } else {
3060            /* No refblock means every refcount is 0 */
3061            for (refblock_index = 0; refblock_index < s->refcount_block_size;
3062                 refblock_index++)
3063            {
3064                if (new_refblock_index >= new_refblock_size) {
3065                    /* new_refblock is now complete */
3066                    ret = operation(bs, new_reftable, *new_reftable_index,
3067                                    new_reftable_size, new_refblock,
3068                                    new_refblock_empty, allocated, errp);
3069                    if (ret < 0) {
3070                        return ret;
3071                    }
3072
3073                    (*new_reftable_index)++;
3074                    new_refblock_index = 0;
3075                    new_refblock_empty = true;
3076                }
3077
3078                if (new_set_refcount) {
3079                    new_set_refcount(new_refblock, new_refblock_index++, 0);
3080                } else {
3081                    new_refblock_index++;
3082                }
3083            }
3084        }
3085    }
3086
3087    if (new_refblock_index > 0) {
3088        /* Complete the potentially existing partially filled final refblock */
3089        if (new_set_refcount) {
3090            for (; new_refblock_index < new_refblock_size;
3091                 new_refblock_index++)
3092            {
3093                new_set_refcount(new_refblock, new_refblock_index, 0);
3094            }
3095        }
3096
3097        ret = operation(bs, new_reftable, *new_reftable_index,
3098                        new_reftable_size, new_refblock, new_refblock_empty,
3099                        allocated, errp);
3100        if (ret < 0) {
3101            return ret;
3102        }
3103
3104        (*new_reftable_index)++;
3105    }
3106
3107    status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
3108              (uint64_t)total * s->refcount_table_size, cb_opaque);
3109
3110    return 0;
3111}
3112
3113int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
3114                                BlockDriverAmendStatusCB *status_cb,
3115                                void *cb_opaque, Error **errp)
3116{
3117    BDRVQcow2State *s = bs->opaque;
3118    Qcow2GetRefcountFunc *new_get_refcount;
3119    Qcow2SetRefcountFunc *new_set_refcount;
3120    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
3121    uint64_t *new_reftable = NULL, new_reftable_size = 0;
3122    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
3123    uint64_t new_reftable_index = 0;
3124    uint64_t i;
3125    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
3126    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
3127    int old_refcount_order;
3128    int walk_index = 0;
3129    int ret;
3130    bool new_allocation;
3131
3132    assert(s->qcow_version >= 3);
3133    assert(refcount_order >= 0 && refcount_order <= 6);
3134
3135    /* see qcow2_open() */
3136    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
3137
3138    new_get_refcount = get_refcount_funcs[refcount_order];
3139    new_set_refcount = set_refcount_funcs[refcount_order];
3140
3141
3142    do {
3143        int total_walks;
3144
3145        new_allocation = false;
3146
3147        /* At least we have to do this walk and the one which writes the
3148         * refblocks; also, at least we have to do this loop here at least
3149         * twice (normally), first to do the allocations, and second to
3150         * determine that everything is correctly allocated, this then makes
3151         * three walks in total */
3152        total_walks = MAX(walk_index + 2, 3);
3153
3154        /* First, allocate the structures so they are present in the refcount
3155         * structures */
3156        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3157                                 &new_reftable_size, NULL, new_refblock_size,
3158                                 new_refcount_bits, &alloc_refblock,
3159                                 &new_allocation, NULL, status_cb, cb_opaque,
3160                                 walk_index++, total_walks, errp);
3161        if (ret < 0) {
3162            goto done;
3163        }
3164
3165        new_reftable_index = 0;
3166
3167        if (new_allocation) {
3168            if (new_reftable_offset) {
3169                qcow2_free_clusters(
3170                    bs, new_reftable_offset,
3171                    allocated_reftable_size * REFTABLE_ENTRY_SIZE,
3172                    QCOW2_DISCARD_NEVER);
3173            }
3174
3175            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
3176                                                           REFTABLE_ENTRY_SIZE);
3177            if (new_reftable_offset < 0) {
3178                error_setg_errno(errp, -new_reftable_offset,
3179                                 "Failed to allocate the new reftable");
3180                ret = new_reftable_offset;
3181                goto done;
3182            }
3183            allocated_reftable_size = new_reftable_size;
3184        }
3185    } while (new_allocation);
3186
3187    /* Second, write the new refblocks */
3188    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3189                             &new_reftable_size, new_refblock,
3190                             new_refblock_size, new_refcount_bits,
3191                             &flush_refblock, &new_allocation, new_set_refcount,
3192                             status_cb, cb_opaque, walk_index, walk_index + 1,
3193                             errp);
3194    if (ret < 0) {
3195        goto done;
3196    }
3197    assert(!new_allocation);
3198
3199
3200    /* Write the new reftable */
3201    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
3202                                        new_reftable_size * REFTABLE_ENTRY_SIZE,
3203                                        false);
3204    if (ret < 0) {
3205        error_setg_errno(errp, -ret, "Overlap check failed");
3206        goto done;
3207    }
3208
3209    for (i = 0; i < new_reftable_size; i++) {
3210        cpu_to_be64s(&new_reftable[i]);
3211    }
3212
3213    ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
3214                      new_reftable_size * REFTABLE_ENTRY_SIZE);
3215
3216    for (i = 0; i < new_reftable_size; i++) {
3217        be64_to_cpus(&new_reftable[i]);
3218    }
3219
3220    if (ret < 0) {
3221        error_setg_errno(errp, -ret, "Failed to write the new reftable");
3222        goto done;
3223    }
3224
3225
3226    /* Empty the refcount cache */
3227    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
3228    if (ret < 0) {
3229        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
3230        goto done;
3231    }
3232
3233    /* Update the image header to point to the new reftable; this only updates
3234     * the fields which are relevant to qcow2_update_header(); other fields
3235     * such as s->refcount_table or s->refcount_bits stay stale for now
3236     * (because we have to restore everything if qcow2_update_header() fails) */
3237    old_refcount_order  = s->refcount_order;
3238    old_reftable_size   = s->refcount_table_size;
3239    old_reftable_offset = s->refcount_table_offset;
3240
3241    s->refcount_order        = refcount_order;
3242    s->refcount_table_size   = new_reftable_size;
3243    s->refcount_table_offset = new_reftable_offset;
3244
3245    ret = qcow2_update_header(bs);
3246    if (ret < 0) {
3247        s->refcount_order        = old_refcount_order;
3248        s->refcount_table_size   = old_reftable_size;
3249        s->refcount_table_offset = old_reftable_offset;
3250        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
3251        goto done;
3252    }
3253
3254    /* Now update the rest of the in-memory information */
3255    old_reftable = s->refcount_table;
3256    s->refcount_table = new_reftable;
3257    update_max_refcount_table_index(s);
3258
3259    s->refcount_bits = 1 << refcount_order;
3260    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
3261    s->refcount_max += s->refcount_max - 1;
3262
3263    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
3264    s->refcount_block_size = 1 << s->refcount_block_bits;
3265
3266    s->get_refcount = new_get_refcount;
3267    s->set_refcount = new_set_refcount;
3268
3269    /* For cleaning up all old refblocks and the old reftable below the "done"
3270     * label */
3271    new_reftable        = old_reftable;
3272    new_reftable_size   = old_reftable_size;
3273    new_reftable_offset = old_reftable_offset;
3274
3275done:
3276    if (new_reftable) {
3277        /* On success, new_reftable actually points to the old reftable (and
3278         * new_reftable_size is the old reftable's size); but that is just
3279         * fine */
3280        for (i = 0; i < new_reftable_size; i++) {
3281            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
3282            if (offset) {
3283                qcow2_free_clusters(bs, offset, s->cluster_size,
3284                                    QCOW2_DISCARD_OTHER);
3285            }
3286        }
3287        g_free(new_reftable);
3288
3289        if (new_reftable_offset > 0) {
3290            qcow2_free_clusters(bs, new_reftable_offset,
3291                                new_reftable_size * REFTABLE_ENTRY_SIZE,
3292                                QCOW2_DISCARD_OTHER);
3293        }
3294    }
3295
3296    qemu_vfree(new_refblock);
3297    return ret;
3298}
3299
3300static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
3301{
3302    BDRVQcow2State *s = bs->opaque;
3303    uint32_t index = offset_to_reftable_index(s, offset);
3304    int64_t covering_refblock_offset = 0;
3305
3306    if (index < s->refcount_table_size) {
3307        covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3308    }
3309    if (!covering_refblock_offset) {
3310        qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3311                                "not covered by the refcount structures",
3312                                offset);
3313        return -EIO;
3314    }
3315
3316    return covering_refblock_offset;
3317}
3318
3319static int qcow2_discard_refcount_block(BlockDriverState *bs,
3320                                        uint64_t discard_block_offs)
3321{
3322    BDRVQcow2State *s = bs->opaque;
3323    int64_t refblock_offs;
3324    uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
3325    uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
3326    void *refblock;
3327    int ret;
3328
3329    refblock_offs = get_refblock_offset(bs, discard_block_offs);
3330    if (refblock_offs < 0) {
3331        return refblock_offs;
3332    }
3333
3334    assert(discard_block_offs != 0);
3335
3336    ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3337                          &refblock);
3338    if (ret < 0) {
3339        return ret;
3340    }
3341
3342    if (s->get_refcount(refblock, block_index) != 1) {
3343        qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
3344                                " refblock offset %#" PRIx64
3345                                ", reftable index %u"
3346                                ", block offset %#" PRIx64
3347                                ", refcount %#" PRIx64,
3348                                refblock_offs,
3349                                offset_to_reftable_index(s, discard_block_offs),
3350                                discard_block_offs,
3351                                s->get_refcount(refblock, block_index));
3352        qcow2_cache_put(s->refcount_block_cache, &refblock);
3353        return -EINVAL;
3354    }
3355    s->set_refcount(refblock, block_index, 0);
3356
3357    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
3358
3359    qcow2_cache_put(s->refcount_block_cache, &refblock);
3360
3361    if (cluster_index < s->free_cluster_index) {
3362        s->free_cluster_index = cluster_index;
3363    }
3364
3365    refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
3366                                           discard_block_offs);
3367    if (refblock) {
3368        /* discard refblock from the cache if refblock is cached */
3369        qcow2_cache_discard(s->refcount_block_cache, refblock);
3370    }
3371    update_refcount_discard(bs, discard_block_offs, s->cluster_size);
3372
3373    return 0;
3374}
3375
3376int qcow2_shrink_reftable(BlockDriverState *bs)
3377{
3378    BDRVQcow2State *s = bs->opaque;
3379    uint64_t *reftable_tmp =
3380        g_malloc(s->refcount_table_size * REFTABLE_ENTRY_SIZE);
3381    int i, ret;
3382
3383    for (i = 0; i < s->refcount_table_size; i++) {
3384        int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
3385        void *refblock;
3386        bool unused_block;
3387
3388        if (refblock_offs == 0) {
3389            reftable_tmp[i] = 0;
3390            continue;
3391        }
3392        ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3393                              &refblock);
3394        if (ret < 0) {
3395            goto out;
3396        }
3397
3398        /* the refblock has own reference */
3399        if (i == offset_to_reftable_index(s, refblock_offs)) {
3400            uint64_t block_index = (refblock_offs >> s->cluster_bits) &
3401                                   (s->refcount_block_size - 1);
3402            uint64_t refcount = s->get_refcount(refblock, block_index);
3403
3404            s->set_refcount(refblock, block_index, 0);
3405
3406            unused_block = buffer_is_zero(refblock, s->cluster_size);
3407
3408            s->set_refcount(refblock, block_index, refcount);
3409        } else {
3410            unused_block = buffer_is_zero(refblock, s->cluster_size);
3411        }
3412        qcow2_cache_put(s->refcount_block_cache, &refblock);
3413
3414        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
3415    }
3416
3417    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
3418                           s->refcount_table_size * REFTABLE_ENTRY_SIZE);
3419    /*
3420     * If the write in the reftable failed the image may contain a partially
3421     * overwritten reftable. In this case it would be better to clear the
3422     * reftable in memory to avoid possible image corruption.
3423     */
3424    for (i = 0; i < s->refcount_table_size; i++) {
3425        if (s->refcount_table[i] && !reftable_tmp[i]) {
3426            if (ret == 0) {
3427                ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
3428                                                       REFT_OFFSET_MASK);
3429            }
3430            s->refcount_table[i] = 0;
3431        }
3432    }
3433
3434    if (!s->cache_discards) {
3435        qcow2_process_discards(bs, ret);
3436    }
3437
3438out:
3439    g_free(reftable_tmp);
3440    return ret;
3441}
3442
3443int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3444{
3445    BDRVQcow2State *s = bs->opaque;
3446    int64_t i;
3447
3448    for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3449        uint64_t refcount;
3450        int ret = qcow2_get_refcount(bs, i, &refcount);
3451        if (ret < 0) {
3452            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3453                    i, strerror(-ret));
3454            return ret;
3455        }
3456        if (refcount > 0) {
3457            return i;
3458        }
3459    }
3460    qcow2_signal_corruption(bs, true, -1, -1,
3461                            "There are no references in the refcount table.");
3462    return -EIO;
3463}
3464
3465int qcow2_detect_metadata_preallocation(BlockDriverState *bs)
3466{
3467    BDRVQcow2State *s = bs->opaque;
3468    int64_t i, end_cluster, cluster_count = 0, threshold;
3469    int64_t file_length, real_allocation, real_clusters;
3470
3471    qemu_co_mutex_assert_locked(&s->lock);
3472
3473    file_length = bdrv_getlength(bs->file->bs);
3474    if (file_length < 0) {
3475        return file_length;
3476    }
3477
3478    real_allocation = bdrv_get_allocated_file_size(bs->file->bs);
3479    if (real_allocation < 0) {
3480        return real_allocation;
3481    }
3482
3483    real_clusters = real_allocation / s->cluster_size;
3484    threshold = MAX(real_clusters * 10 / 9, real_clusters + 2);
3485
3486    end_cluster = size_to_clusters(s, file_length);
3487    for (i = 0; i < end_cluster && cluster_count < threshold; i++) {
3488        uint64_t refcount;
3489        int ret = qcow2_get_refcount(bs, i, &refcount);
3490        if (ret < 0) {
3491            return ret;
3492        }
3493        cluster_count += !!refcount;
3494    }
3495
3496    return cluster_count >= threshold;
3497}
3498