qemu/block/qcow2-refcount.c
<<
>>
Prefs
   1/*
   2 * Block driver for the QCOW version 2 format
   3 *
   4 * Copyright (c) 2004-2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu-common.h"
  26#include "block/block_int.h"
  27#include "block/qcow2.h"
  28#include "qemu/range.h"
  29
  30static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
  31static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
  32                            int64_t offset, int64_t length, uint64_t addend,
  33                            bool decrease, enum qcow2_discard_type type);
  34
  35static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
  36static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
  37static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
  38static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
  39static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
  40static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
  41static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
  42
  43static void set_refcount_ro0(void *refcount_array, uint64_t index,
  44                             uint64_t value);
  45static void set_refcount_ro1(void *refcount_array, uint64_t index,
  46                             uint64_t value);
  47static void set_refcount_ro2(void *refcount_array, uint64_t index,
  48                             uint64_t value);
  49static void set_refcount_ro3(void *refcount_array, uint64_t index,
  50                             uint64_t value);
  51static void set_refcount_ro4(void *refcount_array, uint64_t index,
  52                             uint64_t value);
  53static void set_refcount_ro5(void *refcount_array, uint64_t index,
  54                             uint64_t value);
  55static void set_refcount_ro6(void *refcount_array, uint64_t index,
  56                             uint64_t value);
  57
  58
  59static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
  60    &get_refcount_ro0,
  61    &get_refcount_ro1,
  62    &get_refcount_ro2,
  63    &get_refcount_ro3,
  64    &get_refcount_ro4,
  65    &get_refcount_ro5,
  66    &get_refcount_ro6
  67};
  68
  69static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
  70    &set_refcount_ro0,
  71    &set_refcount_ro1,
  72    &set_refcount_ro2,
  73    &set_refcount_ro3,
  74    &set_refcount_ro4,
  75    &set_refcount_ro5,
  76    &set_refcount_ro6
  77};
  78
  79
  80/*********************************************************/
  81/* refcount handling */
  82
  83int qcow2_refcount_init(BlockDriverState *bs)
  84{
  85    BDRVQcowState *s = bs->opaque;
  86    unsigned int refcount_table_size2, i;
  87    int ret;
  88
  89    assert(s->refcount_order >= 0 && s->refcount_order <= 6);
  90
  91    s->get_refcount = get_refcount_funcs[s->refcount_order];
  92    s->set_refcount = set_refcount_funcs[s->refcount_order];
  93
  94    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
  95    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
  96    s->refcount_table = g_try_malloc(refcount_table_size2);
  97
  98    if (s->refcount_table_size > 0) {
  99        if (s->refcount_table == NULL) {
 100            ret = -ENOMEM;
 101            goto fail;
 102        }
 103        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
 104        ret = bdrv_pread(bs->file, s->refcount_table_offset,
 105                         s->refcount_table, refcount_table_size2);
 106        if (ret < 0) {
 107            goto fail;
 108        }
 109        for(i = 0; i < s->refcount_table_size; i++)
 110            be64_to_cpus(&s->refcount_table[i]);
 111    }
 112    return 0;
 113 fail:
 114    return ret;
 115}
 116
 117void qcow2_refcount_close(BlockDriverState *bs)
 118{
 119    BDRVQcowState *s = bs->opaque;
 120    g_free(s->refcount_table);
 121}
 122
 123
 124static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
 125{
 126    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
 127}
 128
 129static void set_refcount_ro0(void *refcount_array, uint64_t index,
 130                             uint64_t value)
 131{
 132    assert(!(value >> 1));
 133    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
 134    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
 135}
 136
 137static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
 138{
 139    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
 140           & 0x3;
 141}
 142
 143static void set_refcount_ro1(void *refcount_array, uint64_t index,
 144                             uint64_t value)
 145{
 146    assert(!(value >> 2));
 147    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
 148    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
 149}
 150
 151static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
 152{
 153    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
 154           & 0xf;
 155}
 156
 157static void set_refcount_ro2(void *refcount_array, uint64_t index,
 158                             uint64_t value)
 159{
 160    assert(!(value >> 4));
 161    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
 162    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
 163}
 164
 165static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
 166{
 167    return ((const uint8_t *)refcount_array)[index];
 168}
 169
 170static void set_refcount_ro3(void *refcount_array, uint64_t index,
 171                             uint64_t value)
 172{
 173    assert(!(value >> 8));
 174    ((uint8_t *)refcount_array)[index] = value;
 175}
 176
 177static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
 178{
 179    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
 180}
 181
 182static void set_refcount_ro4(void *refcount_array, uint64_t index,
 183                             uint64_t value)
 184{
 185    assert(!(value >> 16));
 186    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
 187}
 188
 189static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
 190{
 191    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
 192}
 193
 194static void set_refcount_ro5(void *refcount_array, uint64_t index,
 195                             uint64_t value)
 196{
 197    assert(!(value >> 32));
 198    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
 199}
 200
 201static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
 202{
 203    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
 204}
 205
 206static void set_refcount_ro6(void *refcount_array, uint64_t index,
 207                             uint64_t value)
 208{
 209    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
 210}
 211
 212
 213static int load_refcount_block(BlockDriverState *bs,
 214                               int64_t refcount_block_offset,
 215                               void **refcount_block)
 216{
 217    BDRVQcowState *s = bs->opaque;
 218    int ret;
 219
 220    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
 221    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
 222        refcount_block);
 223
 224    return ret;
 225}
 226
 227/*
 228 * Retrieves the refcount of the cluster given by its index and stores it in
 229 * *refcount. Returns 0 on success and -errno on failure.
 230 */
 231int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
 232                       uint64_t *refcount)
 233{
 234    BDRVQcowState *s = bs->opaque;
 235    uint64_t refcount_table_index, block_index;
 236    int64_t refcount_block_offset;
 237    int ret;
 238    void *refcount_block;
 239
 240    refcount_table_index = cluster_index >> s->refcount_block_bits;
 241    if (refcount_table_index >= s->refcount_table_size) {
 242        *refcount = 0;
 243        return 0;
 244    }
 245    refcount_block_offset =
 246        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
 247    if (!refcount_block_offset) {
 248        *refcount = 0;
 249        return 0;
 250    }
 251
 252    if (offset_into_cluster(s, refcount_block_offset)) {
 253        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
 254                                " unaligned (reftable index: %#" PRIx64 ")",
 255                                refcount_block_offset, refcount_table_index);
 256        return -EIO;
 257    }
 258
 259    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
 260                          &refcount_block);
 261    if (ret < 0) {
 262        return ret;
 263    }
 264
 265    block_index = cluster_index & (s->refcount_block_size - 1);
 266    *refcount = s->get_refcount(refcount_block, block_index);
 267
 268    ret = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
 269    if (ret < 0) {
 270        return ret;
 271    }
 272
 273    return 0;
 274}
 275
 276/*
 277 * Rounds the refcount table size up to avoid growing the table for each single
 278 * refcount block that is allocated.
 279 */
 280static unsigned int next_refcount_table_size(BDRVQcowState *s,
 281    unsigned int min_size)
 282{
 283    unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
 284    unsigned int refcount_table_clusters =
 285        MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
 286
 287    while (min_clusters > refcount_table_clusters) {
 288        refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
 289    }
 290
 291    return refcount_table_clusters << (s->cluster_bits - 3);
 292}
 293
 294
 295/* Checks if two offsets are described by the same refcount block */
 296static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
 297    uint64_t offset_b)
 298{
 299    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
 300    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
 301
 302    return (block_a == block_b);
 303}
 304
 305/*
 306 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 307 * (including growing the refcount table if needed).
 308 *
 309 * Returns 0 on success or -errno in error case
 310 */
 311static int alloc_refcount_block(BlockDriverState *bs,
 312                                int64_t cluster_index, void **refcount_block)
 313{
 314    BDRVQcowState *s = bs->opaque;
 315    unsigned int refcount_table_index;
 316    int ret;
 317
 318    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
 319
 320    /* Find the refcount block for the given cluster */
 321    refcount_table_index = cluster_index >> s->refcount_block_bits;
 322
 323    if (refcount_table_index < s->refcount_table_size) {
 324
 325        uint64_t refcount_block_offset =
 326            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
 327
 328        /* If it's already there, we're done */
 329        if (refcount_block_offset) {
 330            if (offset_into_cluster(s, refcount_block_offset)) {
 331                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
 332                                        PRIx64 " unaligned (reftable index: "
 333                                        "%#x)", refcount_block_offset,
 334                                        refcount_table_index);
 335                return -EIO;
 336            }
 337
 338             return load_refcount_block(bs, refcount_block_offset,
 339                                        refcount_block);
 340        }
 341    }
 342
 343    /*
 344     * If we came here, we need to allocate something. Something is at least
 345     * a cluster for the new refcount block. It may also include a new refcount
 346     * table if the old refcount table is too small.
 347     *
 348     * Note that allocating clusters here needs some special care:
 349     *
 350     * - We can't use the normal qcow2_alloc_clusters(), it would try to
 351     *   increase the refcount and very likely we would end up with an endless
 352     *   recursion. Instead we must place the refcount blocks in a way that
 353     *   they can describe them themselves.
 354     *
 355     * - We need to consider that at this point we are inside update_refcounts
 356     *   and potentially doing an initial refcount increase. This means that
 357     *   some clusters have already been allocated by the caller, but their
 358     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
 359     *   need to return -EAGAIN to signal the caller that it needs to restart
 360     *   the search for free clusters.
 361     *
 362     * - alloc_clusters_noref and qcow2_free_clusters may load a different
 363     *   refcount block into the cache
 364     */
 365
 366    *refcount_block = NULL;
 367
 368    /* We write to the refcount table, so we might depend on L2 tables */
 369    ret = qcow2_cache_flush(bs, s->l2_table_cache);
 370    if (ret < 0) {
 371        return ret;
 372    }
 373
 374    /* Allocate the refcount block itself and mark it as used */
 375    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
 376    if (new_block < 0) {
 377        return new_block;
 378    }
 379
 380#ifdef DEBUG_ALLOC2
 381    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
 382        " at %" PRIx64 "\n",
 383        refcount_table_index, cluster_index << s->cluster_bits, new_block);
 384#endif
 385
 386    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
 387        /* Zero the new refcount block before updating it */
 388        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
 389                                    refcount_block);
 390        if (ret < 0) {
 391            goto fail_block;
 392        }
 393
 394        memset(*refcount_block, 0, s->cluster_size);
 395
 396        /* The block describes itself, need to update the cache */
 397        int block_index = (new_block >> s->cluster_bits) &
 398            (s->refcount_block_size - 1);
 399        s->set_refcount(*refcount_block, block_index, 1);
 400    } else {
 401        /* Described somewhere else. This can recurse at most twice before we
 402         * arrive at a block that describes itself. */
 403        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
 404                              QCOW2_DISCARD_NEVER);
 405        if (ret < 0) {
 406            goto fail_block;
 407        }
 408
 409        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 410        if (ret < 0) {
 411            goto fail_block;
 412        }
 413
 414        /* Initialize the new refcount block only after updating its refcount,
 415         * update_refcount uses the refcount cache itself */
 416        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
 417                                    refcount_block);
 418        if (ret < 0) {
 419            goto fail_block;
 420        }
 421
 422        memset(*refcount_block, 0, s->cluster_size);
 423    }
 424
 425    /* Now the new refcount block needs to be written to disk */
 426    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
 427    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
 428    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 429    if (ret < 0) {
 430        goto fail_block;
 431    }
 432
 433    /* If the refcount table is big enough, just hook the block up there */
 434    if (refcount_table_index < s->refcount_table_size) {
 435        uint64_t data64 = cpu_to_be64(new_block);
 436        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
 437        ret = bdrv_pwrite_sync(bs->file,
 438            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
 439            &data64, sizeof(data64));
 440        if (ret < 0) {
 441            goto fail_block;
 442        }
 443
 444        s->refcount_table[refcount_table_index] = new_block;
 445
 446        /* The new refcount block may be where the caller intended to put its
 447         * data, so let it restart the search. */
 448        return -EAGAIN;
 449    }
 450
 451    ret = qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
 452    if (ret < 0) {
 453        goto fail_block;
 454    }
 455
 456    /*
 457     * If we come here, we need to grow the refcount table. Again, a new
 458     * refcount table needs some space and we can't simply allocate to avoid
 459     * endless recursion.
 460     *
 461     * Therefore let's grab new refcount blocks at the end of the image, which
 462     * will describe themselves and the new refcount table. This way we can
 463     * reference them only in the new table and do the switch to the new
 464     * refcount table at once without producing an inconsistent state in
 465     * between.
 466     */
 467    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
 468
 469    /* Calculate the number of refcount blocks needed so far; this will be the
 470     * basis for calculating the index of the first cluster used for the
 471     * self-describing refcount structures which we are about to create.
 472     *
 473     * Because we reached this point, there cannot be any refcount entries for
 474     * cluster_index or higher indices yet. However, because new_block has been
 475     * allocated to describe that cluster (and it will assume this role later
 476     * on), we cannot use that index; also, new_block may actually have a higher
 477     * cluster index than cluster_index, so it needs to be taken into account
 478     * here (and 1 needs to be added to its value because that cluster is used).
 479     */
 480    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
 481                                            (new_block >> s->cluster_bits) + 1),
 482                                        s->refcount_block_size);
 483
 484    if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
 485        return -EFBIG;
 486    }
 487
 488    /* And now we need at least one block more for the new metadata */
 489    uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
 490    uint64_t last_table_size;
 491    uint64_t blocks_clusters;
 492    do {
 493        uint64_t table_clusters =
 494            size_to_clusters(s, table_size * sizeof(uint64_t));
 495        blocks_clusters = 1 +
 496            ((table_clusters + s->refcount_block_size - 1)
 497            / s->refcount_block_size);
 498        uint64_t meta_clusters = table_clusters + blocks_clusters;
 499
 500        last_table_size = table_size;
 501        table_size = next_refcount_table_size(s, blocks_used +
 502            ((meta_clusters + s->refcount_block_size - 1)
 503            / s->refcount_block_size));
 504
 505    } while (last_table_size != table_size);
 506
 507#ifdef DEBUG_ALLOC2
 508    fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
 509        s->refcount_table_size, table_size);
 510#endif
 511
 512    /* Create the new refcount table and blocks */
 513    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
 514        s->cluster_size;
 515    uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
 516    uint64_t *new_table = g_try_new0(uint64_t, table_size);
 517    void *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size);
 518
 519    assert(table_size > 0 && blocks_clusters > 0);
 520    if (new_table == NULL || new_blocks == NULL) {
 521        ret = -ENOMEM;
 522        goto fail_table;
 523    }
 524
 525    /* Fill the new refcount table */
 526    memcpy(new_table, s->refcount_table,
 527        s->refcount_table_size * sizeof(uint64_t));
 528    new_table[refcount_table_index] = new_block;
 529
 530    int i;
 531    for (i = 0; i < blocks_clusters; i++) {
 532        new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
 533    }
 534
 535    /* Fill the refcount blocks */
 536    uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
 537    int block = 0;
 538    for (i = 0; i < table_clusters + blocks_clusters; i++) {
 539        s->set_refcount(new_blocks, block++, 1);
 540    }
 541
 542    /* Write refcount blocks to disk */
 543    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
 544    ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
 545        blocks_clusters * s->cluster_size);
 546    g_free(new_blocks);
 547    new_blocks = NULL;
 548    if (ret < 0) {
 549        goto fail_table;
 550    }
 551
 552    /* Write refcount table to disk */
 553    for(i = 0; i < table_size; i++) {
 554        cpu_to_be64s(&new_table[i]);
 555    }
 556
 557    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
 558    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
 559        table_size * sizeof(uint64_t));
 560    if (ret < 0) {
 561        goto fail_table;
 562    }
 563
 564    for(i = 0; i < table_size; i++) {
 565        be64_to_cpus(&new_table[i]);
 566    }
 567
 568    /* Hook up the new refcount table in the qcow2 header */
 569    uint8_t data[12];
 570    cpu_to_be64w((uint64_t*)data, table_offset);
 571    cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
 572    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
 573    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset),
 574        data, sizeof(data));
 575    if (ret < 0) {
 576        goto fail_table;
 577    }
 578
 579    /* And switch it in memory */
 580    uint64_t old_table_offset = s->refcount_table_offset;
 581    uint64_t old_table_size = s->refcount_table_size;
 582
 583    g_free(s->refcount_table);
 584    s->refcount_table = new_table;
 585    s->refcount_table_size = table_size;
 586    s->refcount_table_offset = table_offset;
 587
 588    /* Free old table. */
 589    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
 590                        QCOW2_DISCARD_OTHER);
 591
 592    ret = load_refcount_block(bs, new_block, refcount_block);
 593    if (ret < 0) {
 594        return ret;
 595    }
 596
 597    /* If we were trying to do the initial refcount update for some cluster
 598     * allocation, we might have used the same clusters to store newly
 599     * allocated metadata. Make the caller search some new space. */
 600    return -EAGAIN;
 601
 602fail_table:
 603    g_free(new_blocks);
 604    g_free(new_table);
 605fail_block:
 606    if (*refcount_block != NULL) {
 607        qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
 608    }
 609    return ret;
 610}
 611
 612void qcow2_process_discards(BlockDriverState *bs, int ret)
 613{
 614    BDRVQcowState *s = bs->opaque;
 615    Qcow2DiscardRegion *d, *next;
 616
 617    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
 618        QTAILQ_REMOVE(&s->discards, d, next);
 619
 620        /* Discard is optional, ignore the return value */
 621        if (ret >= 0) {
 622            bdrv_discard(bs->file,
 623                         d->offset >> BDRV_SECTOR_BITS,
 624                         d->bytes >> BDRV_SECTOR_BITS);
 625        }
 626
 627        g_free(d);
 628    }
 629}
 630
 631static void update_refcount_discard(BlockDriverState *bs,
 632                                    uint64_t offset, uint64_t length)
 633{
 634    BDRVQcowState *s = bs->opaque;
 635    Qcow2DiscardRegion *d, *p, *next;
 636
 637    QTAILQ_FOREACH(d, &s->discards, next) {
 638        uint64_t new_start = MIN(offset, d->offset);
 639        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
 640
 641        if (new_end - new_start <= length + d->bytes) {
 642            /* There can't be any overlap, areas ending up here have no
 643             * references any more and therefore shouldn't get freed another
 644             * time. */
 645            assert(d->bytes + length == new_end - new_start);
 646            d->offset = new_start;
 647            d->bytes = new_end - new_start;
 648            goto found;
 649        }
 650    }
 651
 652    d = g_malloc(sizeof(*d));
 653    *d = (Qcow2DiscardRegion) {
 654        .bs     = bs,
 655        .offset = offset,
 656        .bytes  = length,
 657    };
 658    QTAILQ_INSERT_TAIL(&s->discards, d, next);
 659
 660found:
 661    /* Merge discard requests if they are adjacent now */
 662    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
 663        if (p == d
 664            || p->offset > d->offset + d->bytes
 665            || d->offset > p->offset + p->bytes)
 666        {
 667            continue;
 668        }
 669
 670        /* Still no overlap possible */
 671        assert(p->offset == d->offset + d->bytes
 672            || d->offset == p->offset + p->bytes);
 673
 674        QTAILQ_REMOVE(&s->discards, p, next);
 675        d->offset = MIN(d->offset, p->offset);
 676        d->bytes += p->bytes;
 677        g_free(p);
 678    }
 679}
 680
 681/* XXX: cache several refcount block clusters ? */
 682/* @addend is the absolute value of the addend; if @decrease is set, @addend
 683 * will be subtracted from the current refcount, otherwise it will be added */
 684static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
 685                                                   int64_t offset,
 686                                                   int64_t length,
 687                                                   uint64_t addend,
 688                                                   bool decrease,
 689                                                   enum qcow2_discard_type type)
 690{
 691    BDRVQcowState *s = bs->opaque;
 692    int64_t start, last, cluster_offset;
 693    void *refcount_block = NULL;
 694    int64_t old_table_index = -1;
 695    int ret;
 696
 697#ifdef DEBUG_ALLOC2
 698    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
 699            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
 700            addend);
 701#endif
 702    if (length < 0) {
 703        return -EINVAL;
 704    } else if (length == 0) {
 705        return 0;
 706    }
 707
 708    if (decrease) {
 709        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
 710            s->l2_table_cache);
 711    }
 712
 713    start = start_of_cluster(s, offset);
 714    last = start_of_cluster(s, offset + length - 1);
 715    for(cluster_offset = start; cluster_offset <= last;
 716        cluster_offset += s->cluster_size)
 717    {
 718        int block_index;
 719        uint64_t refcount;
 720        int64_t cluster_index = cluster_offset >> s->cluster_bits;
 721        int64_t table_index = cluster_index >> s->refcount_block_bits;
 722
 723        /* Load the refcount block and allocate it if needed */
 724        if (table_index != old_table_index) {
 725            if (refcount_block) {
 726                ret = qcow2_cache_put(bs, s->refcount_block_cache,
 727                                      &refcount_block);
 728                if (ret < 0) {
 729                    goto fail;
 730                }
 731            }
 732
 733            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
 734            if (ret < 0) {
 735                goto fail;
 736            }
 737        }
 738        old_table_index = table_index;
 739
 740        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
 741
 742        /* we can update the count and save it */
 743        block_index = cluster_index & (s->refcount_block_size - 1);
 744
 745        refcount = s->get_refcount(refcount_block, block_index);
 746        if (decrease ? (refcount - addend > refcount)
 747                     : (refcount + addend < refcount ||
 748                        refcount + addend > s->refcount_max))
 749        {
 750            ret = -EINVAL;
 751            goto fail;
 752        }
 753        if (decrease) {
 754            refcount -= addend;
 755        } else {
 756            refcount += addend;
 757        }
 758        if (refcount == 0 && cluster_index < s->free_cluster_index) {
 759            s->free_cluster_index = cluster_index;
 760        }
 761        s->set_refcount(refcount_block, block_index, refcount);
 762
 763        if (refcount == 0 && s->discard_passthrough[type]) {
 764            update_refcount_discard(bs, cluster_offset, s->cluster_size);
 765        }
 766    }
 767
 768    ret = 0;
 769fail:
 770    if (!s->cache_discards) {
 771        qcow2_process_discards(bs, ret);
 772    }
 773
 774    /* Write last changed block to disk */
 775    if (refcount_block) {
 776        int wret;
 777        wret = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
 778        if (wret < 0) {
 779            return ret < 0 ? ret : wret;
 780        }
 781    }
 782
 783    /*
 784     * Try do undo any updates if an error is returned (This may succeed in
 785     * some cases like ENOSPC for allocating a new refcount block)
 786     */
 787    if (ret < 0) {
 788        int dummy;
 789        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
 790                                !decrease, QCOW2_DISCARD_NEVER);
 791        (void)dummy;
 792    }
 793
 794    return ret;
 795}
 796
 797/*
 798 * Increases or decreases the refcount of a given cluster.
 799 *
 800 * @addend is the absolute value of the addend; if @decrease is set, @addend
 801 * will be subtracted from the current refcount, otherwise it will be added.
 802 *
 803 * On success 0 is returned; on failure -errno is returned.
 804 */
 805int qcow2_update_cluster_refcount(BlockDriverState *bs,
 806                                  int64_t cluster_index,
 807                                  uint64_t addend, bool decrease,
 808                                  enum qcow2_discard_type type)
 809{
 810    BDRVQcowState *s = bs->opaque;
 811    int ret;
 812
 813    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
 814                          decrease, type);
 815    if (ret < 0) {
 816        return ret;
 817    }
 818
 819    return 0;
 820}
 821
 822
 823
 824/*********************************************************/
 825/* cluster allocation functions */
 826
 827
 828
 829/* return < 0 if error */
 830static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
 831{
 832    BDRVQcowState *s = bs->opaque;
 833    uint64_t i, nb_clusters, refcount;
 834    int ret;
 835
 836    /* We can't allocate clusters if they may still be queued for discard. */
 837    if (s->cache_discards) {
 838        qcow2_process_discards(bs, 0);
 839    }
 840
 841    nb_clusters = size_to_clusters(s, size);
 842retry:
 843    for(i = 0; i < nb_clusters; i++) {
 844        uint64_t next_cluster_index = s->free_cluster_index++;
 845        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
 846
 847        if (ret < 0) {
 848            return ret;
 849        } else if (refcount != 0) {
 850            goto retry;
 851        }
 852    }
 853
 854    /* Make sure that all offsets in the "allocated" range are representable
 855     * in an int64_t */
 856    if (s->free_cluster_index > 0 &&
 857        s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
 858    {
 859        return -EFBIG;
 860    }
 861
 862#ifdef DEBUG_ALLOC2
 863    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
 864            size,
 865            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
 866#endif
 867    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
 868}
 869
 870int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
 871{
 872    int64_t offset;
 873    int ret;
 874
 875    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
 876    do {
 877        offset = alloc_clusters_noref(bs, size);
 878        if (offset < 0) {
 879            return offset;
 880        }
 881
 882        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
 883    } while (ret == -EAGAIN);
 884
 885    if (ret < 0) {
 886        return ret;
 887    }
 888
 889    return offset;
 890}
 891
 892int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
 893    int nb_clusters)
 894{
 895    BDRVQcowState *s = bs->opaque;
 896    uint64_t cluster_index, refcount;
 897    uint64_t i;
 898    int ret;
 899
 900    assert(nb_clusters >= 0);
 901    if (nb_clusters == 0) {
 902        return 0;
 903    }
 904
 905    do {
 906        /* Check how many clusters there are free */
 907        cluster_index = offset >> s->cluster_bits;
 908        for(i = 0; i < nb_clusters; i++) {
 909            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
 910            if (ret < 0) {
 911                return ret;
 912            } else if (refcount != 0) {
 913                break;
 914            }
 915        }
 916
 917        /* And then allocate them */
 918        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
 919                              QCOW2_DISCARD_NEVER);
 920    } while (ret == -EAGAIN);
 921
 922    if (ret < 0) {
 923        return ret;
 924    }
 925
 926    return i;
 927}
 928
 929/* only used to allocate compressed sectors. We try to allocate
 930   contiguous sectors. size must be <= cluster_size */
 931int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
 932{
 933    BDRVQcowState *s = bs->opaque;
 934    int64_t offset;
 935    size_t free_in_cluster;
 936    int ret;
 937
 938    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
 939    assert(size > 0 && size <= s->cluster_size);
 940    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));
 941
 942    offset = s->free_byte_offset;
 943
 944    if (offset) {
 945        uint64_t refcount;
 946        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
 947        if (ret < 0) {
 948            return ret;
 949        }
 950
 951        if (refcount == s->refcount_max) {
 952            offset = 0;
 953        }
 954    }
 955
 956    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
 957    if (!offset || free_in_cluster < size) {
 958        int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
 959        if (new_cluster < 0) {
 960            return new_cluster;
 961        }
 962
 963        if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
 964            offset = new_cluster;
 965        }
 966    }
 967
 968    assert(offset);
 969    ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
 970    if (ret < 0) {
 971        return ret;
 972    }
 973
 974    /* The cluster refcount was incremented; refcount blocks must be flushed
 975     * before the caller's L2 table updates. */
 976    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
 977
 978    s->free_byte_offset = offset + size;
 979    if (!offset_into_cluster(s, s->free_byte_offset)) {
 980        s->free_byte_offset = 0;
 981    }
 982
 983    return offset;
 984}
 985
 986void qcow2_free_clusters(BlockDriverState *bs,
 987                          int64_t offset, int64_t size,
 988                          enum qcow2_discard_type type)
 989{
 990    int ret;
 991
 992    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
 993    ret = update_refcount(bs, offset, size, 1, true, type);
 994    if (ret < 0) {
 995        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
 996        /* TODO Remember the clusters to free them later and avoid leaking */
 997    }
 998}
 999
1000/*
1001 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1002 * normal cluster, compressed cluster, etc.)
1003 */
1004void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
1005                             int nb_clusters, enum qcow2_discard_type type)
1006{
1007    BDRVQcowState *s = bs->opaque;
1008
1009    switch (qcow2_get_cluster_type(l2_entry)) {
1010    case QCOW2_CLUSTER_COMPRESSED:
1011        {
1012            int nb_csectors;
1013            nb_csectors = ((l2_entry >> s->csize_shift) &
1014                           s->csize_mask) + 1;
1015            qcow2_free_clusters(bs,
1016                (l2_entry & s->cluster_offset_mask) & ~511,
1017                nb_csectors * 512, type);
1018        }
1019        break;
1020    case QCOW2_CLUSTER_NORMAL:
1021    case QCOW2_CLUSTER_ZERO:
1022        if (l2_entry & L2E_OFFSET_MASK) {
1023            if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1024                qcow2_signal_corruption(bs, false, -1, -1,
1025                                        "Cannot free unaligned cluster %#llx",
1026                                        l2_entry & L2E_OFFSET_MASK);
1027            } else {
1028                qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1029                                    nb_clusters << s->cluster_bits, type);
1030            }
1031        }
1032        break;
1033    case QCOW2_CLUSTER_UNALLOCATED:
1034        break;
1035    default:
1036        abort();
1037    }
1038}
1039
1040
1041
1042/*********************************************************/
1043/* snapshots and image creation */
1044
1045
1046
1047/* update the refcounts of snapshots and the copied flag */
1048int qcow2_update_snapshot_refcount(BlockDriverState *bs,
1049    int64_t l1_table_offset, int l1_size, int addend)
1050{
1051    BDRVQcowState *s = bs->opaque;
1052    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount;
1053    bool l1_allocated = false;
1054    int64_t old_offset, old_l2_offset;
1055    int i, j, l1_modified = 0, nb_csectors;
1056    int ret;
1057
1058    assert(addend >= -1 && addend <= 1);
1059
1060    l2_table = NULL;
1061    l1_table = NULL;
1062    l1_size2 = l1_size * sizeof(uint64_t);
1063
1064    s->cache_discards = true;
1065
1066    /* WARNING: qcow2_snapshot_goto relies on this function not using the
1067     * l1_table_offset when it is the current s->l1_table_offset! Be careful
1068     * when changing this! */
1069    if (l1_table_offset != s->l1_table_offset) {
1070        l1_table = g_try_malloc0(align_offset(l1_size2, 512));
1071        if (l1_size2 && l1_table == NULL) {
1072            ret = -ENOMEM;
1073            goto fail;
1074        }
1075        l1_allocated = true;
1076
1077        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1078        if (ret < 0) {
1079            goto fail;
1080        }
1081
1082        for(i = 0;i < l1_size; i++)
1083            be64_to_cpus(&l1_table[i]);
1084    } else {
1085        assert(l1_size == s->l1_size);
1086        l1_table = s->l1_table;
1087        l1_allocated = false;
1088    }
1089
1090    for(i = 0; i < l1_size; i++) {
1091        l2_offset = l1_table[i];
1092        if (l2_offset) {
1093            old_l2_offset = l2_offset;
1094            l2_offset &= L1E_OFFSET_MASK;
1095
1096            if (offset_into_cluster(s, l2_offset)) {
1097                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
1098                                        PRIx64 " unaligned (L1 index: %#x)",
1099                                        l2_offset, i);
1100                ret = -EIO;
1101                goto fail;
1102            }
1103
1104            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
1105                (void**) &l2_table);
1106            if (ret < 0) {
1107                goto fail;
1108            }
1109
1110            for(j = 0; j < s->l2_size; j++) {
1111                uint64_t cluster_index;
1112
1113                offset = be64_to_cpu(l2_table[j]);
1114                old_offset = offset;
1115                offset &= ~QCOW_OFLAG_COPIED;
1116
1117                switch (qcow2_get_cluster_type(offset)) {
1118                    case QCOW2_CLUSTER_COMPRESSED:
1119                        nb_csectors = ((offset >> s->csize_shift) &
1120                                       s->csize_mask) + 1;
1121                        if (addend != 0) {
1122                            ret = update_refcount(bs,
1123                                (offset & s->cluster_offset_mask) & ~511,
1124                                nb_csectors * 512, abs(addend), addend < 0,
1125                                QCOW2_DISCARD_SNAPSHOT);
1126                            if (ret < 0) {
1127                                goto fail;
1128                            }
1129                        }
1130                        /* compressed clusters are never modified */
1131                        refcount = 2;
1132                        break;
1133
1134                    case QCOW2_CLUSTER_NORMAL:
1135                    case QCOW2_CLUSTER_ZERO:
1136                        if (offset_into_cluster(s, offset & L2E_OFFSET_MASK)) {
1137                            qcow2_signal_corruption(bs, true, -1, -1, "Data "
1138                                                    "cluster offset %#llx "
1139                                                    "unaligned (L2 offset: %#"
1140                                                    PRIx64 ", L2 index: %#x)",
1141                                                    offset & L2E_OFFSET_MASK,
1142                                                    l2_offset, j);
1143                            ret = -EIO;
1144                            goto fail;
1145                        }
1146
1147                        cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
1148                        if (!cluster_index) {
1149                            /* unallocated */
1150                            refcount = 0;
1151                            break;
1152                        }
1153                        if (addend != 0) {
1154                            ret = qcow2_update_cluster_refcount(bs,
1155                                    cluster_index, abs(addend), addend < 0,
1156                                    QCOW2_DISCARD_SNAPSHOT);
1157                            if (ret < 0) {
1158                                goto fail;
1159                            }
1160                        }
1161
1162                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
1163                        if (ret < 0) {
1164                            goto fail;
1165                        }
1166                        break;
1167
1168                    case QCOW2_CLUSTER_UNALLOCATED:
1169                        refcount = 0;
1170                        break;
1171
1172                    default:
1173                        abort();
1174                }
1175
1176                if (refcount == 1) {
1177                    offset |= QCOW_OFLAG_COPIED;
1178                }
1179                if (offset != old_offset) {
1180                    if (addend > 0) {
1181                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
1182                            s->refcount_block_cache);
1183                    }
1184                    l2_table[j] = cpu_to_be64(offset);
1185                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
1186                }
1187            }
1188
1189            ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
1190            if (ret < 0) {
1191                goto fail;
1192            }
1193
1194
1195            if (addend != 0) {
1196                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
1197                                                        s->cluster_bits,
1198                                                    abs(addend), addend < 0,
1199                                                    QCOW2_DISCARD_SNAPSHOT);
1200                if (ret < 0) {
1201                    goto fail;
1202                }
1203            }
1204            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1205                                     &refcount);
1206            if (ret < 0) {
1207                goto fail;
1208            } else if (refcount == 1) {
1209                l2_offset |= QCOW_OFLAG_COPIED;
1210            }
1211            if (l2_offset != old_l2_offset) {
1212                l1_table[i] = l2_offset;
1213                l1_modified = 1;
1214            }
1215        }
1216    }
1217
1218    ret = bdrv_flush(bs);
1219fail:
1220    if (l2_table) {
1221        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
1222    }
1223
1224    s->cache_discards = false;
1225    qcow2_process_discards(bs, ret);
1226
1227    /* Update L1 only if it isn't deleted anyway (addend = -1) */
1228    if (ret == 0 && addend >= 0 && l1_modified) {
1229        for (i = 0; i < l1_size; i++) {
1230            cpu_to_be64s(&l1_table[i]);
1231        }
1232
1233        ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2);
1234
1235        for (i = 0; i < l1_size; i++) {
1236            be64_to_cpus(&l1_table[i]);
1237        }
1238    }
1239    if (l1_allocated)
1240        g_free(l1_table);
1241    return ret;
1242}
1243
1244
1245
1246
1247/*********************************************************/
1248/* refcount checking functions */
1249
1250
1251static size_t refcount_array_byte_size(BDRVQcowState *s, uint64_t entries)
1252{
1253    /* This assertion holds because there is no way we can address more than
1254     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1255     * offsets have to be representable in bytes); due to every cluster
1256     * corresponding to one refcount entry, we are well below that limit */
1257    assert(entries < (UINT64_C(1) << (64 - 9)));
1258
1259    /* Thanks to the assertion this will not overflow, because
1260     * s->refcount_order < 7.
1261     * (note: x << s->refcount_order == x * s->refcount_bits) */
1262    return DIV_ROUND_UP(entries << s->refcount_order, 8);
1263}
1264
1265/**
1266 * Reallocates *array so that it can hold new_size entries. *size must contain
1267 * the current number of entries in *array. If the reallocation fails, *array
1268 * and *size will not be modified and -errno will be returned. If the
1269 * reallocation is successful, *array will be set to the new buffer, *size
1270 * will be set to new_size and 0 will be returned. The size of the reallocated
1271 * refcount array buffer will be aligned to a cluster boundary, and the newly
1272 * allocated area will be zeroed.
1273 */
1274static int realloc_refcount_array(BDRVQcowState *s, void **array,
1275                                  int64_t *size, int64_t new_size)
1276{
1277    size_t old_byte_size, new_byte_size;
1278    void *new_ptr;
1279
1280    /* Round to clusters so the array can be directly written to disk */
1281    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1282                    * s->cluster_size;
1283    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1284                    * s->cluster_size;
1285
1286    if (new_byte_size == old_byte_size) {
1287        *size = new_size;
1288        return 0;
1289    }
1290
1291    assert(new_byte_size > 0);
1292
1293    new_ptr = g_try_realloc(*array, new_byte_size);
1294    if (!new_ptr) {
1295        return -ENOMEM;
1296    }
1297
1298    if (new_byte_size > old_byte_size) {
1299        memset((void *)((uintptr_t)new_ptr + old_byte_size), 0,
1300               new_byte_size - old_byte_size);
1301    }
1302
1303    *array = new_ptr;
1304    *size  = new_size;
1305
1306    return 0;
1307}
1308
1309/*
1310 * Increases the refcount for a range of clusters in a given refcount table.
1311 * This is used to construct a temporary refcount table out of L1 and L2 tables
1312 * which can be compared the the refcount table saved in the image.
1313 *
1314 * Modifies the number of errors in res.
1315 */
1316static int inc_refcounts(BlockDriverState *bs,
1317                         BdrvCheckResult *res,
1318                         void **refcount_table,
1319                         int64_t *refcount_table_size,
1320                         int64_t offset, int64_t size)
1321{
1322    BDRVQcowState *s = bs->opaque;
1323    uint64_t start, last, cluster_offset, k, refcount;
1324    int ret;
1325
1326    if (size <= 0) {
1327        return 0;
1328    }
1329
1330    start = start_of_cluster(s, offset);
1331    last = start_of_cluster(s, offset + size - 1);
1332    for(cluster_offset = start; cluster_offset <= last;
1333        cluster_offset += s->cluster_size) {
1334        k = cluster_offset >> s->cluster_bits;
1335        if (k >= *refcount_table_size) {
1336            ret = realloc_refcount_array(s, refcount_table,
1337                                         refcount_table_size, k + 1);
1338            if (ret < 0) {
1339                res->check_errors++;
1340                return ret;
1341            }
1342        }
1343
1344        refcount = s->get_refcount(*refcount_table, k);
1345        if (refcount == s->refcount_max) {
1346            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1347                    "\n", cluster_offset);
1348            res->corruptions++;
1349            continue;
1350        }
1351        s->set_refcount(*refcount_table, k, refcount + 1);
1352    }
1353
1354    return 0;
1355}
1356
1357/* Flags for check_refcounts_l1() and check_refcounts_l2() */
1358enum {
1359    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1360};
1361
1362/*
1363 * Increases the refcount in the given refcount table for the all clusters
1364 * referenced in the L2 table. While doing so, performs some checks on L2
1365 * entries.
1366 *
1367 * Returns the number of errors found by the checks or -errno if an internal
1368 * error occurred.
1369 */
1370static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1371                              void **refcount_table,
1372                              int64_t *refcount_table_size, int64_t l2_offset,
1373                              int flags)
1374{
1375    BDRVQcowState *s = bs->opaque;
1376    uint64_t *l2_table, l2_entry;
1377    uint64_t next_contiguous_offset = 0;
1378    int i, l2_size, nb_csectors, ret;
1379
1380    /* Read L2 table from disk */
1381    l2_size = s->l2_size * sizeof(uint64_t);
1382    l2_table = g_malloc(l2_size);
1383
1384    ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
1385    if (ret < 0) {
1386        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1387        res->check_errors++;
1388        goto fail;
1389    }
1390
1391    /* Do the actual checks */
1392    for(i = 0; i < s->l2_size; i++) {
1393        l2_entry = be64_to_cpu(l2_table[i]);
1394
1395        switch (qcow2_get_cluster_type(l2_entry)) {
1396        case QCOW2_CLUSTER_COMPRESSED:
1397            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1398            if (l2_entry & QCOW_OFLAG_COPIED) {
1399                fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1400                    "copied flag must never be set for compressed "
1401                    "clusters\n", l2_entry >> s->cluster_bits);
1402                l2_entry &= ~QCOW_OFLAG_COPIED;
1403                res->corruptions++;
1404            }
1405
1406            /* Mark cluster as used */
1407            nb_csectors = ((l2_entry >> s->csize_shift) &
1408                           s->csize_mask) + 1;
1409            l2_entry &= s->cluster_offset_mask;
1410            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1411                                l2_entry & ~511, nb_csectors * 512);
1412            if (ret < 0) {
1413                goto fail;
1414            }
1415
1416            if (flags & CHECK_FRAG_INFO) {
1417                res->bfi.allocated_clusters++;
1418                res->bfi.compressed_clusters++;
1419
1420                /* Compressed clusters are fragmented by nature.  Since they
1421                 * take up sub-sector space but we only have sector granularity
1422                 * I/O we need to re-read the same sectors even for adjacent
1423                 * compressed clusters.
1424                 */
1425                res->bfi.fragmented_clusters++;
1426            }
1427            break;
1428
1429        case QCOW2_CLUSTER_ZERO:
1430            if ((l2_entry & L2E_OFFSET_MASK) == 0) {
1431                break;
1432            }
1433            /* fall through */
1434
1435        case QCOW2_CLUSTER_NORMAL:
1436        {
1437            uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1438
1439            if (flags & CHECK_FRAG_INFO) {
1440                res->bfi.allocated_clusters++;
1441                if (next_contiguous_offset &&
1442                    offset != next_contiguous_offset) {
1443                    res->bfi.fragmented_clusters++;
1444                }
1445                next_contiguous_offset = offset + s->cluster_size;
1446            }
1447
1448            /* Mark cluster as used */
1449            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1450                                offset, s->cluster_size);
1451            if (ret < 0) {
1452                goto fail;
1453            }
1454
1455            /* Correct offsets are cluster aligned */
1456            if (offset_into_cluster(s, offset)) {
1457                fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
1458                    "properly aligned; L2 entry corrupted.\n", offset);
1459                res->corruptions++;
1460            }
1461            break;
1462        }
1463
1464        case QCOW2_CLUSTER_UNALLOCATED:
1465            break;
1466
1467        default:
1468            abort();
1469        }
1470    }
1471
1472    g_free(l2_table);
1473    return 0;
1474
1475fail:
1476    g_free(l2_table);
1477    return ret;
1478}
1479
1480/*
1481 * Increases the refcount for the L1 table, its L2 tables and all referenced
1482 * clusters in the given refcount table. While doing so, performs some checks
1483 * on L1 and L2 entries.
1484 *
1485 * Returns the number of errors found by the checks or -errno if an internal
1486 * error occurred.
1487 */
1488static int check_refcounts_l1(BlockDriverState *bs,
1489                              BdrvCheckResult *res,
1490                              void **refcount_table,
1491                              int64_t *refcount_table_size,
1492                              int64_t l1_table_offset, int l1_size,
1493                              int flags)
1494{
1495    BDRVQcowState *s = bs->opaque;
1496    uint64_t *l1_table = NULL, l2_offset, l1_size2;
1497    int i, ret;
1498
1499    l1_size2 = l1_size * sizeof(uint64_t);
1500
1501    /* Mark L1 table as used */
1502    ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1503                        l1_table_offset, l1_size2);
1504    if (ret < 0) {
1505        goto fail;
1506    }
1507
1508    /* Read L1 table entries from disk */
1509    if (l1_size2 > 0) {
1510        l1_table = g_try_malloc(l1_size2);
1511        if (l1_table == NULL) {
1512            ret = -ENOMEM;
1513            res->check_errors++;
1514            goto fail;
1515        }
1516        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1517        if (ret < 0) {
1518            fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1519            res->check_errors++;
1520            goto fail;
1521        }
1522        for(i = 0;i < l1_size; i++)
1523            be64_to_cpus(&l1_table[i]);
1524    }
1525
1526    /* Do the actual checks */
1527    for(i = 0; i < l1_size; i++) {
1528        l2_offset = l1_table[i];
1529        if (l2_offset) {
1530            /* Mark L2 table as used */
1531            l2_offset &= L1E_OFFSET_MASK;
1532            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1533                                l2_offset, s->cluster_size);
1534            if (ret < 0) {
1535                goto fail;
1536            }
1537
1538            /* L2 tables are cluster aligned */
1539            if (offset_into_cluster(s, l2_offset)) {
1540                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1541                    "cluster aligned; L1 entry corrupted\n", l2_offset);
1542                res->corruptions++;
1543            }
1544
1545            /* Process and check L2 entries */
1546            ret = check_refcounts_l2(bs, res, refcount_table,
1547                                     refcount_table_size, l2_offset, flags);
1548            if (ret < 0) {
1549                goto fail;
1550            }
1551        }
1552    }
1553    g_free(l1_table);
1554    return 0;
1555
1556fail:
1557    g_free(l1_table);
1558    return ret;
1559}
1560
1561/*
1562 * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1563 *
1564 * This function does not print an error message nor does it increment
1565 * check_errors if qcow2_get_refcount fails (this is because such an error will
1566 * have been already detected and sufficiently signaled by the calling function
1567 * (qcow2_check_refcounts) by the time this function is called).
1568 */
1569static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1570                              BdrvCheckMode fix)
1571{
1572    BDRVQcowState *s = bs->opaque;
1573    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1574    int ret;
1575    uint64_t refcount;
1576    int i, j;
1577
1578    for (i = 0; i < s->l1_size; i++) {
1579        uint64_t l1_entry = s->l1_table[i];
1580        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1581        bool l2_dirty = false;
1582
1583        if (!l2_offset) {
1584            continue;
1585        }
1586
1587        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1588                                 &refcount);
1589        if (ret < 0) {
1590            /* don't print message nor increment check_errors */
1591            continue;
1592        }
1593        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1594            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1595                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1596                    fix & BDRV_FIX_ERRORS ? "Repairing" :
1597                                            "ERROR",
1598                    i, l1_entry, refcount);
1599            if (fix & BDRV_FIX_ERRORS) {
1600                s->l1_table[i] = refcount == 1
1601                               ? l1_entry |  QCOW_OFLAG_COPIED
1602                               : l1_entry & ~QCOW_OFLAG_COPIED;
1603                ret = qcow2_write_l1_entry(bs, i);
1604                if (ret < 0) {
1605                    res->check_errors++;
1606                    goto fail;
1607                }
1608                res->corruptions_fixed++;
1609            } else {
1610                res->corruptions++;
1611            }
1612        }
1613
1614        ret = bdrv_pread(bs->file, l2_offset, l2_table,
1615                         s->l2_size * sizeof(uint64_t));
1616        if (ret < 0) {
1617            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1618                    strerror(-ret));
1619            res->check_errors++;
1620            goto fail;
1621        }
1622
1623        for (j = 0; j < s->l2_size; j++) {
1624            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1625            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1626            int cluster_type = qcow2_get_cluster_type(l2_entry);
1627
1628            if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
1629                ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
1630                ret = qcow2_get_refcount(bs,
1631                                         data_offset >> s->cluster_bits,
1632                                         &refcount);
1633                if (ret < 0) {
1634                    /* don't print message nor increment check_errors */
1635                    continue;
1636                }
1637                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1638                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1639                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1640                            fix & BDRV_FIX_ERRORS ? "Repairing" :
1641                                                    "ERROR",
1642                            l2_entry, refcount);
1643                    if (fix & BDRV_FIX_ERRORS) {
1644                        l2_table[j] = cpu_to_be64(refcount == 1
1645                                    ? l2_entry |  QCOW_OFLAG_COPIED
1646                                    : l2_entry & ~QCOW_OFLAG_COPIED);
1647                        l2_dirty = true;
1648                        res->corruptions_fixed++;
1649                    } else {
1650                        res->corruptions++;
1651                    }
1652                }
1653            }
1654        }
1655
1656        if (l2_dirty) {
1657            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1658                                                l2_offset, s->cluster_size);
1659            if (ret < 0) {
1660                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1661                        "overlap check failed: %s\n", strerror(-ret));
1662                res->check_errors++;
1663                goto fail;
1664            }
1665
1666            ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size);
1667            if (ret < 0) {
1668                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1669                        strerror(-ret));
1670                res->check_errors++;
1671                goto fail;
1672            }
1673        }
1674    }
1675
1676    ret = 0;
1677
1678fail:
1679    qemu_vfree(l2_table);
1680    return ret;
1681}
1682
1683/*
1684 * Checks consistency of refblocks and accounts for each refblock in
1685 * *refcount_table.
1686 */
1687static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
1688                           BdrvCheckMode fix, bool *rebuild,
1689                           void **refcount_table, int64_t *nb_clusters)
1690{
1691    BDRVQcowState *s = bs->opaque;
1692    int64_t i, size;
1693    int ret;
1694
1695    for(i = 0; i < s->refcount_table_size; i++) {
1696        uint64_t offset, cluster;
1697        offset = s->refcount_table[i];
1698        cluster = offset >> s->cluster_bits;
1699
1700        /* Refcount blocks are cluster aligned */
1701        if (offset_into_cluster(s, offset)) {
1702            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1703                "cluster aligned; refcount table entry corrupted\n", i);
1704            res->corruptions++;
1705            *rebuild = true;
1706            continue;
1707        }
1708
1709        if (cluster >= *nb_clusters) {
1710            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
1711                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
1712
1713            if (fix & BDRV_FIX_ERRORS) {
1714                int64_t new_nb_clusters;
1715
1716                if (offset > INT64_MAX - s->cluster_size) {
1717                    ret = -EINVAL;
1718                    goto resize_fail;
1719                }
1720
1721                ret = bdrv_truncate(bs->file, offset + s->cluster_size);
1722                if (ret < 0) {
1723                    goto resize_fail;
1724                }
1725                size = bdrv_getlength(bs->file);
1726                if (size < 0) {
1727                    ret = size;
1728                    goto resize_fail;
1729                }
1730
1731                new_nb_clusters = size_to_clusters(s, size);
1732                assert(new_nb_clusters >= *nb_clusters);
1733
1734                ret = realloc_refcount_array(s, refcount_table,
1735                                             nb_clusters, new_nb_clusters);
1736                if (ret < 0) {
1737                    res->check_errors++;
1738                    return ret;
1739                }
1740
1741                if (cluster >= *nb_clusters) {
1742                    ret = -EINVAL;
1743                    goto resize_fail;
1744                }
1745
1746                res->corruptions_fixed++;
1747                ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1748                                    offset, s->cluster_size);
1749                if (ret < 0) {
1750                    return ret;
1751                }
1752                /* No need to check whether the refcount is now greater than 1:
1753                 * This area was just allocated and zeroed, so it can only be
1754                 * exactly 1 after inc_refcounts() */
1755                continue;
1756
1757resize_fail:
1758                res->corruptions++;
1759                *rebuild = true;
1760                fprintf(stderr, "ERROR could not resize image: %s\n",
1761                        strerror(-ret));
1762            } else {
1763                res->corruptions++;
1764            }
1765            continue;
1766        }
1767
1768        if (offset != 0) {
1769            ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1770                                offset, s->cluster_size);
1771            if (ret < 0) {
1772                return ret;
1773            }
1774            if (s->get_refcount(*refcount_table, cluster) != 1) {
1775                fprintf(stderr, "ERROR refcount block %" PRId64
1776                        " refcount=%" PRIu64 "\n", i,
1777                        s->get_refcount(*refcount_table, cluster));
1778                res->corruptions++;
1779                *rebuild = true;
1780            }
1781        }
1782    }
1783
1784    return 0;
1785}
1786
1787/*
1788 * Calculates an in-memory refcount table.
1789 */
1790static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1791                               BdrvCheckMode fix, bool *rebuild,
1792                               void **refcount_table, int64_t *nb_clusters)
1793{
1794    BDRVQcowState *s = bs->opaque;
1795    int64_t i;
1796    QCowSnapshot *sn;
1797    int ret;
1798
1799    if (!*refcount_table) {
1800        int64_t old_size = 0;
1801        ret = realloc_refcount_array(s, refcount_table,
1802                                     &old_size, *nb_clusters);
1803        if (ret < 0) {
1804            res->check_errors++;
1805            return ret;
1806        }
1807    }
1808
1809    /* header */
1810    ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1811                        0, s->cluster_size);
1812    if (ret < 0) {
1813        return ret;
1814    }
1815
1816    /* current L1 table */
1817    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1818                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1819    if (ret < 0) {
1820        return ret;
1821    }
1822
1823    /* snapshots */
1824    for (i = 0; i < s->nb_snapshots; i++) {
1825        sn = s->snapshots + i;
1826        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1827                                 sn->l1_table_offset, sn->l1_size, 0);
1828        if (ret < 0) {
1829            return ret;
1830        }
1831    }
1832    ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1833                        s->snapshots_offset, s->snapshots_size);
1834    if (ret < 0) {
1835        return ret;
1836    }
1837
1838    /* refcount data */
1839    ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1840                        s->refcount_table_offset,
1841                        s->refcount_table_size * sizeof(uint64_t));
1842    if (ret < 0) {
1843        return ret;
1844    }
1845
1846    return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
1847}
1848
1849/*
1850 * Compares the actual reference count for each cluster in the image against the
1851 * refcount as reported by the refcount structures on-disk.
1852 */
1853static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1854                              BdrvCheckMode fix, bool *rebuild,
1855                              int64_t *highest_cluster,
1856                              void *refcount_table, int64_t nb_clusters)
1857{
1858    BDRVQcowState *s = bs->opaque;
1859    int64_t i;
1860    uint64_t refcount1, refcount2;
1861    int ret;
1862
1863    for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
1864        ret = qcow2_get_refcount(bs, i, &refcount1);
1865        if (ret < 0) {
1866            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
1867                    i, strerror(-ret));
1868            res->check_errors++;
1869            continue;
1870        }
1871
1872        refcount2 = s->get_refcount(refcount_table, i);
1873
1874        if (refcount1 > 0 || refcount2 > 0) {
1875            *highest_cluster = i;
1876        }
1877
1878        if (refcount1 != refcount2) {
1879            /* Check if we're allowed to fix the mismatch */
1880            int *num_fixed = NULL;
1881            if (refcount1 == 0) {
1882                *rebuild = true;
1883            } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
1884                num_fixed = &res->leaks_fixed;
1885            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
1886                num_fixed = &res->corruptions_fixed;
1887            }
1888
1889            fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
1890                    " reference=%" PRIu64 "\n",
1891                   num_fixed != NULL     ? "Repairing" :
1892                   refcount1 < refcount2 ? "ERROR" :
1893                                           "Leaked",
1894                   i, refcount1, refcount2);
1895
1896            if (num_fixed) {
1897                ret = update_refcount(bs, i << s->cluster_bits, 1,
1898                                      refcount_diff(refcount1, refcount2),
1899                                      refcount1 > refcount2,
1900                                      QCOW2_DISCARD_ALWAYS);
1901                if (ret >= 0) {
1902                    (*num_fixed)++;
1903                    continue;
1904                }
1905            }
1906
1907            /* And if we couldn't, print an error */
1908            if (refcount1 < refcount2) {
1909                res->corruptions++;
1910            } else {
1911                res->leaks++;
1912            }
1913        }
1914    }
1915}
1916
1917/*
1918 * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
1919 * the on-disk refcount structures.
1920 *
1921 * On input, *first_free_cluster tells where to start looking, and need not
1922 * actually be a free cluster; the returned offset will not be before that
1923 * cluster.  On output, *first_free_cluster points to the first gap found, even
1924 * if that gap was too small to be used as the returned offset.
1925 *
1926 * Note that *first_free_cluster is a cluster index whereas the return value is
1927 * an offset.
1928 */
1929static int64_t alloc_clusters_imrt(BlockDriverState *bs,
1930                                   int cluster_count,
1931                                   void **refcount_table,
1932                                   int64_t *imrt_nb_clusters,
1933                                   int64_t *first_free_cluster)
1934{
1935    BDRVQcowState *s = bs->opaque;
1936    int64_t cluster = *first_free_cluster, i;
1937    bool first_gap = true;
1938    int contiguous_free_clusters;
1939    int ret;
1940
1941    /* Starting at *first_free_cluster, find a range of at least cluster_count
1942     * continuously free clusters */
1943    for (contiguous_free_clusters = 0;
1944         cluster < *imrt_nb_clusters &&
1945         contiguous_free_clusters < cluster_count;
1946         cluster++)
1947    {
1948        if (!s->get_refcount(*refcount_table, cluster)) {
1949            contiguous_free_clusters++;
1950            if (first_gap) {
1951                /* If this is the first free cluster found, update
1952                 * *first_free_cluster accordingly */
1953                *first_free_cluster = cluster;
1954                first_gap = false;
1955            }
1956        } else if (contiguous_free_clusters) {
1957            contiguous_free_clusters = 0;
1958        }
1959    }
1960
1961    /* If contiguous_free_clusters is greater than zero, it contains the number
1962     * of continuously free clusters until the current cluster; the first free
1963     * cluster in the current "gap" is therefore
1964     * cluster - contiguous_free_clusters */
1965
1966    /* If no such range could be found, grow the in-memory refcount table
1967     * accordingly to append free clusters at the end of the image */
1968    if (contiguous_free_clusters < cluster_count) {
1969        /* contiguous_free_clusters clusters are already empty at the image end;
1970         * we need cluster_count clusters; therefore, we have to allocate
1971         * cluster_count - contiguous_free_clusters new clusters at the end of
1972         * the image (which is the current value of cluster; note that cluster
1973         * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
1974         * the image end) */
1975        ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
1976                                     cluster + cluster_count
1977                                     - contiguous_free_clusters);
1978        if (ret < 0) {
1979            return ret;
1980        }
1981    }
1982
1983    /* Go back to the first free cluster */
1984    cluster -= contiguous_free_clusters;
1985    for (i = 0; i < cluster_count; i++) {
1986        s->set_refcount(*refcount_table, cluster + i, 1);
1987    }
1988
1989    return cluster << s->cluster_bits;
1990}
1991
1992/*
1993 * Creates a new refcount structure based solely on the in-memory information
1994 * given through *refcount_table. All necessary allocations will be reflected
1995 * in that array.
1996 *
1997 * On success, the old refcount structure is leaked (it will be covered by the
1998 * new refcount structure).
1999 */
2000static int rebuild_refcount_structure(BlockDriverState *bs,
2001                                      BdrvCheckResult *res,
2002                                      void **refcount_table,
2003                                      int64_t *nb_clusters)
2004{
2005    BDRVQcowState *s = bs->opaque;
2006    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
2007    int64_t refblock_offset, refblock_start, refblock_index;
2008    uint32_t reftable_size = 0;
2009    uint64_t *on_disk_reftable = NULL;
2010    void *on_disk_refblock;
2011    int ret = 0;
2012    struct {
2013        uint64_t reftable_offset;
2014        uint32_t reftable_clusters;
2015    } QEMU_PACKED reftable_offset_and_clusters;
2016
2017    qcow2_cache_empty(bs, s->refcount_block_cache);
2018
2019write_refblocks:
2020    for (; cluster < *nb_clusters; cluster++) {
2021        if (!s->get_refcount(*refcount_table, cluster)) {
2022            continue;
2023        }
2024
2025        refblock_index = cluster >> s->refcount_block_bits;
2026        refblock_start = refblock_index << s->refcount_block_bits;
2027
2028        /* Don't allocate a cluster in a refblock already written to disk */
2029        if (first_free_cluster < refblock_start) {
2030            first_free_cluster = refblock_start;
2031        }
2032        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2033                                              nb_clusters, &first_free_cluster);
2034        if (refblock_offset < 0) {
2035            fprintf(stderr, "ERROR allocating refblock: %s\n",
2036                    strerror(-refblock_offset));
2037            res->check_errors++;
2038            ret = refblock_offset;
2039            goto fail;
2040        }
2041
2042        if (reftable_size <= refblock_index) {
2043            uint32_t old_reftable_size = reftable_size;
2044            uint64_t *new_on_disk_reftable;
2045
2046            reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
2047                                     s->cluster_size) / sizeof(uint64_t);
2048            new_on_disk_reftable = g_try_realloc(on_disk_reftable,
2049                                                 reftable_size *
2050                                                 sizeof(uint64_t));
2051            if (!new_on_disk_reftable) {
2052                res->check_errors++;
2053                ret = -ENOMEM;
2054                goto fail;
2055            }
2056            on_disk_reftable = new_on_disk_reftable;
2057
2058            memset(on_disk_reftable + old_reftable_size, 0,
2059                   (reftable_size - old_reftable_size) * sizeof(uint64_t));
2060
2061            /* The offset we have for the reftable is now no longer valid;
2062             * this will leak that range, but we can easily fix that by running
2063             * a leak-fixing check after this rebuild operation */
2064            reftable_offset = -1;
2065        }
2066        on_disk_reftable[refblock_index] = refblock_offset;
2067
2068        /* If this is apparently the last refblock (for now), try to squeeze the
2069         * reftable in */
2070        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
2071            reftable_offset < 0)
2072        {
2073            uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
2074                                                          sizeof(uint64_t));
2075            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2076                                                  refcount_table, nb_clusters,
2077                                                  &first_free_cluster);
2078            if (reftable_offset < 0) {
2079                fprintf(stderr, "ERROR allocating reftable: %s\n",
2080                        strerror(-reftable_offset));
2081                res->check_errors++;
2082                ret = reftable_offset;
2083                goto fail;
2084            }
2085        }
2086
2087        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2088                                            s->cluster_size);
2089        if (ret < 0) {
2090            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2091            goto fail;
2092        }
2093
2094        /* The size of *refcount_table is always cluster-aligned, therefore the
2095         * write operation will not overflow */
2096        on_disk_refblock = (void *)((char *) *refcount_table +
2097                                    refblock_index * s->cluster_size);
2098
2099        ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE,
2100                         on_disk_refblock, s->cluster_sectors);
2101        if (ret < 0) {
2102            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2103            goto fail;
2104        }
2105
2106        /* Go to the end of this refblock */
2107        cluster = refblock_start + s->refcount_block_size - 1;
2108    }
2109
2110    if (reftable_offset < 0) {
2111        uint64_t post_refblock_start, reftable_clusters;
2112
2113        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
2114        reftable_clusters = size_to_clusters(s,
2115                                             reftable_size * sizeof(uint64_t));
2116        /* Not pretty but simple */
2117        if (first_free_cluster < post_refblock_start) {
2118            first_free_cluster = post_refblock_start;
2119        }
2120        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2121                                              refcount_table, nb_clusters,
2122                                              &first_free_cluster);
2123        if (reftable_offset < 0) {
2124            fprintf(stderr, "ERROR allocating reftable: %s\n",
2125                    strerror(-reftable_offset));
2126            res->check_errors++;
2127            ret = reftable_offset;
2128            goto fail;
2129        }
2130
2131        goto write_refblocks;
2132    }
2133
2134    assert(on_disk_reftable);
2135
2136    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2137        cpu_to_be64s(&on_disk_reftable[refblock_index]);
2138    }
2139
2140    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
2141                                        reftable_size * sizeof(uint64_t));
2142    if (ret < 0) {
2143        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2144        goto fail;
2145    }
2146
2147    assert(reftable_size < INT_MAX / sizeof(uint64_t));
2148    ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
2149                      reftable_size * sizeof(uint64_t));
2150    if (ret < 0) {
2151        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2152        goto fail;
2153    }
2154
2155    /* Enter new reftable into the image header */
2156    cpu_to_be64w(&reftable_offset_and_clusters.reftable_offset,
2157                 reftable_offset);
2158    cpu_to_be32w(&reftable_offset_and_clusters.reftable_clusters,
2159                 size_to_clusters(s, reftable_size * sizeof(uint64_t)));
2160    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader,
2161                                              refcount_table_offset),
2162                           &reftable_offset_and_clusters,
2163                           sizeof(reftable_offset_and_clusters));
2164    if (ret < 0) {
2165        fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
2166        goto fail;
2167    }
2168
2169    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2170        be64_to_cpus(&on_disk_reftable[refblock_index]);
2171    }
2172    s->refcount_table = on_disk_reftable;
2173    s->refcount_table_offset = reftable_offset;
2174    s->refcount_table_size = reftable_size;
2175
2176    return 0;
2177
2178fail:
2179    g_free(on_disk_reftable);
2180    return ret;
2181}
2182
2183/*
2184 * Checks an image for refcount consistency.
2185 *
2186 * Returns 0 if no errors are found, the number of errors in case the image is
2187 * detected as corrupted, and -errno when an internal error occurred.
2188 */
2189int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2190                          BdrvCheckMode fix)
2191{
2192    BDRVQcowState *s = bs->opaque;
2193    BdrvCheckResult pre_compare_res;
2194    int64_t size, highest_cluster, nb_clusters;
2195    void *refcount_table = NULL;
2196    bool rebuild = false;
2197    int ret;
2198
2199    size = bdrv_getlength(bs->file);
2200    if (size < 0) {
2201        res->check_errors++;
2202        return size;
2203    }
2204
2205    nb_clusters = size_to_clusters(s, size);
2206    if (nb_clusters > INT_MAX) {
2207        res->check_errors++;
2208        return -EFBIG;
2209    }
2210
2211    res->bfi.total_clusters =
2212        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2213
2214    ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2215                              &nb_clusters);
2216    if (ret < 0) {
2217        goto fail;
2218    }
2219
2220    /* In case we don't need to rebuild the refcount structure (but want to fix
2221     * something), this function is immediately called again, in which case the
2222     * result should be ignored */
2223    pre_compare_res = *res;
2224    compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2225                      nb_clusters);
2226
2227    if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2228        BdrvCheckResult old_res = *res;
2229        int fresh_leaks = 0;
2230
2231        fprintf(stderr, "Rebuilding refcount structure\n");
2232        ret = rebuild_refcount_structure(bs, res, &refcount_table,
2233                                         &nb_clusters);
2234        if (ret < 0) {
2235            goto fail;
2236        }
2237
2238        res->corruptions = 0;
2239        res->leaks = 0;
2240
2241        /* Because the old reftable has been exchanged for a new one the
2242         * references have to be recalculated */
2243        rebuild = false;
2244        memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2245        ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2246                                  &nb_clusters);
2247        if (ret < 0) {
2248            goto fail;
2249        }
2250
2251        if (fix & BDRV_FIX_LEAKS) {
2252            /* The old refcount structures are now leaked, fix it; the result
2253             * can be ignored, aside from leaks which were introduced by
2254             * rebuild_refcount_structure() that could not be fixed */
2255            BdrvCheckResult saved_res = *res;
2256            *res = (BdrvCheckResult){ 0 };
2257
2258            compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2259                              &highest_cluster, refcount_table, nb_clusters);
2260            if (rebuild) {
2261                fprintf(stderr, "ERROR rebuilt refcount structure is still "
2262                        "broken\n");
2263            }
2264
2265            /* Any leaks accounted for here were introduced by
2266             * rebuild_refcount_structure() because that function has created a
2267             * new refcount structure from scratch */
2268            fresh_leaks = res->leaks;
2269            *res = saved_res;
2270        }
2271
2272        if (res->corruptions < old_res.corruptions) {
2273            res->corruptions_fixed += old_res.corruptions - res->corruptions;
2274        }
2275        if (res->leaks < old_res.leaks) {
2276            res->leaks_fixed += old_res.leaks - res->leaks;
2277        }
2278        res->leaks += fresh_leaks;
2279    } else if (fix) {
2280        if (rebuild) {
2281            fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2282            res->check_errors++;
2283            ret = -EIO;
2284            goto fail;
2285        }
2286
2287        if (res->leaks || res->corruptions) {
2288            *res = pre_compare_res;
2289            compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2290                              refcount_table, nb_clusters);
2291        }
2292    }
2293
2294    /* check OFLAG_COPIED */
2295    ret = check_oflag_copied(bs, res, fix);
2296    if (ret < 0) {
2297        goto fail;
2298    }
2299
2300    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2301    ret = 0;
2302
2303fail:
2304    g_free(refcount_table);
2305
2306    return ret;
2307}
2308
2309#define overlaps_with(ofs, sz) \
2310    ranges_overlap(offset, size, ofs, sz)
2311
2312/*
2313 * Checks if the given offset into the image file is actually free to use by
2314 * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2315 * i.e. a sanity check without relying on the refcount tables.
2316 *
2317 * The ign parameter specifies what checks not to perform (being a bitmask of
2318 * QCow2MetadataOverlap values), i.e., what sections to ignore.
2319 *
2320 * Returns:
2321 * - 0 if writing to this offset will not affect the mentioned metadata
2322 * - a positive QCow2MetadataOverlap value indicating one overlapping section
2323 * - a negative value (-errno) indicating an error while performing a check,
2324 *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
2325 */
2326int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2327                                 int64_t size)
2328{
2329    BDRVQcowState *s = bs->opaque;
2330    int chk = s->overlap_check & ~ign;
2331    int i, j;
2332
2333    if (!size) {
2334        return 0;
2335    }
2336
2337    if (chk & QCOW2_OL_MAIN_HEADER) {
2338        if (offset < s->cluster_size) {
2339            return QCOW2_OL_MAIN_HEADER;
2340        }
2341    }
2342
2343    /* align range to test to cluster boundaries */
2344    size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
2345    offset = start_of_cluster(s, offset);
2346
2347    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2348        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
2349            return QCOW2_OL_ACTIVE_L1;
2350        }
2351    }
2352
2353    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2354        if (overlaps_with(s->refcount_table_offset,
2355            s->refcount_table_size * sizeof(uint64_t))) {
2356            return QCOW2_OL_REFCOUNT_TABLE;
2357        }
2358    }
2359
2360    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2361        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2362            return QCOW2_OL_SNAPSHOT_TABLE;
2363        }
2364    }
2365
2366    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2367        for (i = 0; i < s->nb_snapshots; i++) {
2368            if (s->snapshots[i].l1_size &&
2369                overlaps_with(s->snapshots[i].l1_table_offset,
2370                s->snapshots[i].l1_size * sizeof(uint64_t))) {
2371                return QCOW2_OL_INACTIVE_L1;
2372            }
2373        }
2374    }
2375
2376    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2377        for (i = 0; i < s->l1_size; i++) {
2378            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2379                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2380                s->cluster_size)) {
2381                return QCOW2_OL_ACTIVE_L2;
2382            }
2383        }
2384    }
2385
2386    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2387        for (i = 0; i < s->refcount_table_size; i++) {
2388            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2389                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2390                s->cluster_size)) {
2391                return QCOW2_OL_REFCOUNT_BLOCK;
2392            }
2393        }
2394    }
2395
2396    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2397        for (i = 0; i < s->nb_snapshots; i++) {
2398            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2399            uint32_t l1_sz  = s->snapshots[i].l1_size;
2400            uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
2401            uint64_t *l1 = g_try_malloc(l1_sz2);
2402            int ret;
2403
2404            if (l1_sz2 && l1 == NULL) {
2405                return -ENOMEM;
2406            }
2407
2408            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
2409            if (ret < 0) {
2410                g_free(l1);
2411                return ret;
2412            }
2413
2414            for (j = 0; j < l1_sz; j++) {
2415                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2416                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2417                    g_free(l1);
2418                    return QCOW2_OL_INACTIVE_L2;
2419                }
2420            }
2421
2422            g_free(l1);
2423        }
2424    }
2425
2426    return 0;
2427}
2428
2429static const char *metadata_ol_names[] = {
2430    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
2431    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
2432    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
2433    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
2434    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
2435    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
2436    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
2437    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
2438};
2439
2440/*
2441 * First performs a check for metadata overlaps (through
2442 * qcow2_check_metadata_overlap); if that fails with a negative value (error
2443 * while performing a check), that value is returned. If an impending overlap
2444 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2445 * and -EIO returned.
2446 *
2447 * Returns 0 if there were neither overlaps nor errors while checking for
2448 * overlaps; or a negative value (-errno) on error.
2449 */
2450int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2451                                  int64_t size)
2452{
2453    int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2454
2455    if (ret < 0) {
2456        return ret;
2457    } else if (ret > 0) {
2458        int metadata_ol_bitnr = ffs(ret) - 1;
2459        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2460
2461        qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2462                                "write on metadata (overlaps with %s)",
2463                                metadata_ol_names[metadata_ol_bitnr]);
2464        return -EIO;
2465    }
2466
2467    return 0;
2468}
2469