qemu/block/qcow2-refcount.c
/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "block/block_int.h"
#include "block/qcow2.h"
#include "qemu/range.h"
#include "qemu/bswap.h"

static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length, uint64_t addend,
                            bool decrease, enum qcow2_discard_type type);

static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value);


static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};

static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};
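
/*
 * A refcount entry is 1 << refcount_order bits wide, so orders 0..6 map to
 * entry widths of 1, 2, 4, 8, 16, 32 and 64 bits. s->refcount_order indexes
 * these two tables directly to select the matching accessor pair (see
 * qcow2_refcount_init() below).
 */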


/*********************************************************/
/* refcount handling */

int qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    assert(s->refcount_order >= 0 && s->refcount_order <= 6);

    s->get_refcount = get_refcount_funcs[s->refcount_order];
    s->set_refcount = set_refcount_funcs[s->refcount_order];

    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_pread(bs->file, s->refcount_table_offset,
                         s->refcount_table, refcount_table_size2);
        if (ret < 0) {
            goto fail;
        }
        for (i = 0; i < s->refcount_table_size; i++) {
            be64_to_cpus(&s->refcount_table[i]);
        }
    }
    return 0;
 fail:
    return ret;
}

void qcow2_refcount_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    g_free(s->refcount_table);
}


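/*
 * Sub-byte refcount widths (orders 0 to 2) pack several entries into each
 * byte, least significant bits first. For example, with refcount_order == 0,
 * entry 11 lives in byte 11 / 8 == 1 at bit 11 % 8 == 3; the set_ variants
 * therefore mask out the old bits before OR-ing in the new value.
 */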
static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
}

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 1));
    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
}

static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
           & 0x3;
}

static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 2));
    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
}

static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
           & 0xf;
}

static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 4));
    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
}

static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    return ((const uint8_t *)refcount_array)[index];
}

static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 8));
    ((uint8_t *)refcount_array)[index] = value;
}

static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
}

static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 16));
    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
}

static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
}

static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 32));
    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
}

static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
}

static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
}


static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset,
                               void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                           refcount_block);
}

/*
 * Retrieves the refcount of the cluster given by its index and stores it in
 * *refcount. Returns 0 on success and -errno on failure.
 */
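/*
 * The lookup is two-level: the top bits of the cluster index select the
 * refcount table entry (which points to a refcount block), and the low
 * s->refcount_block_bits bits select the entry within that block.
 */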
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
                       uint64_t *refcount)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    void *refcount_block;

    refcount_table_index = cluster_index >> s->refcount_block_bits;
    if (refcount_table_index >= s->refcount_table_size) {
        *refcount = 0;
        return 0;
    }
    refcount_block_offset =
        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    if (!refcount_block_offset) {
        *refcount = 0;
        return 0;
    }

    if (offset_into_cluster(s, refcount_block_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
                                " unaligned (reftable index: %#" PRIx64 ")",
                                refcount_block_offset, refcount_table_index);
        return -EIO;
    }

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                          &refcount_block);
    if (ret < 0) {
        return ret;
    }

    block_index = cluster_index & (s->refcount_block_size - 1);
    *refcount = s->get_refcount(refcount_block, block_index);

    qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);

    return 0;
}

/*
 * Rounds the refcount table size up to avoid growing the table for each single
 * refcount block that is allocated.
 */
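/*
 * The table grows by roughly a factor of 1.5 per step; starting from one
 * cluster, the cluster counts go 1, 2, 3, 5, 8, 12, ... until min_size fits.
 */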
static unsigned int next_refcount_table_size(BDRVQcow2State *s,
    unsigned int min_size)
{
    unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
    unsigned int refcount_table_clusters =
        MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));

    while (min_clusters > refcount_table_clusters) {
        refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
    }

    return refcount_table_clusters << (s->cluster_bits - 3);
}


/* Checks if two offsets are described by the same refcount block */
static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
    uint64_t offset_b)
{
    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);

    return (block_a == block_b);
}

/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case
 */
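 *
 * May also return -EAGAIN after successfully allocating new metadata, in
 * which case the caller must restart its search for free clusters (this is
 * noted again at the -EAGAIN return statements below).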
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

            return load_refcount_block(bs, refcount_block_offset,
                                       refcount_block);
        }
    }

    /*
     * If we come here, we need to allocate something. At a minimum that is
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), because it would try
     *   to increase the refcount and we would very likely end up in endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they describe themselves.
     *
     * - We need to consider that at this point we are inside update_refcount()
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref() and qcow2_free_clusters() may load a different
     *   refcount block into the cache.
     */

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
    if (new_block < 0) {
        return new_block;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail_block;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail_block;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail_block;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail_block;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail_block;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail_block;
        }

        s->refcount_table[refcount_table_index] = new_block;

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
        return -EFBIG;
    }

    /* And now we need at least one block more for the new metadata */
    uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
    uint64_t last_table_size;
    uint64_t blocks_clusters;
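    /* Iterate to a fixed point: a larger table may itself need more refcount
     * blocks to describe it, which in turn may enlarge the table again. */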
    do {
        uint64_t table_clusters =
            size_to_clusters(s, table_size * sizeof(uint64_t));
        blocks_clusters = 1 +
            DIV_ROUND_UP(table_clusters, s->refcount_block_size);
        uint64_t meta_clusters = table_clusters + blocks_clusters;

        last_table_size = table_size;
        table_size = next_refcount_table_size(s, blocks_used +
            DIV_ROUND_UP(meta_clusters, s->refcount_block_size));

    } while (last_table_size != table_size);

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Grow refcount table %" PRIu32 " => %" PRIu64 "\n",
        s->refcount_table_size, table_size);
#endif

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
        s->cluster_size;
    uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
    uint64_t *new_table = g_try_new0(uint64_t, table_size);
    void *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size);

    assert(table_size > 0 && blocks_clusters > 0);
    if (new_table == NULL || new_blocks == NULL) {
        ret = -ENOMEM;
        goto fail_table;
    }

    /* Fill the new refcount table */
    memcpy(new_table, s->refcount_table,
        s->refcount_table_size * sizeof(uint64_t));
    new_table[refcount_table_index] = new_block;

    int i;
    for (i = 0; i < blocks_clusters; i++) {
        new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
    }

    /* Fill the refcount blocks */
    uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
    int block = 0;
    for (i = 0; i < table_clusters + blocks_clusters; i++) {
        s->set_refcount(new_blocks, block++, 1);
    }

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
        blocks_clusters * s->cluster_size);
    g_free(new_blocks);
    new_blocks = NULL;
    if (ret < 0) {
        goto fail_table;
    }

    /* Write refcount table to disk */
    for (i = 0; i < table_size; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
        table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail_table;
    }

    for (i = 0; i < table_size; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &data, sizeof(data));
    if (ret < 0) {
        goto fail_table;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail_table:
    g_free(new_blocks);
    g_free(new_table);
fail_block:
    if (*refcount_block != NULL) {
        qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
    }
    return ret;
}

void qcow2_process_discards(BlockDriverState *bs, int ret)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *next;

    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
        QTAILQ_REMOVE(&s->discards, d, next);

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
            bdrv_pdiscard(bs->file->bs, d->offset, d->bytes);
        }

        g_free(d);
    }
}

static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs     = bs,
        .offset = offset,
        .bytes  = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}

/* XXX: cache several refcount block clusters ? */
/* @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added */
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                                                   int64_t offset,
                                                   int64_t length,
                                                   uint64_t addend,
                                                   bool decrease,
                                                   enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t start, last, cluster_offset;
    void *refcount_block = NULL;
    int64_t old_table_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
            addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    if (decrease) {
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
            s->l2_table_cache);
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
    for (cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size)
    {
        int block_index;
        uint64_t refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t table_index = cluster_index >> s->refcount_block_bits;

        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            if (refcount_block) {
                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
            }
            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            if (ret < 0) {
                goto fail;
            }
        }
        old_table_index = table_index;

        qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
                                     refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index & (s->refcount_block_size - 1);

        refcount = s->get_refcount(refcount_block, block_index);
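        /* Refcounts are unsigned; catch wraparound on decrease, and both
         * wraparound and exceeding the maximum representable value
         * (s->refcount_max) on increase. */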
        if (decrease ? (refcount - addend > refcount)
                     : (refcount + addend < refcount ||
                        refcount + addend > s->refcount_max))
        {
            ret = -EINVAL;
            goto fail;
        }
        if (decrease) {
            refcount -= addend;
        } else {
            refcount += addend;
        }
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->set_refcount(refcount_block, block_index, refcount);

        if (refcount == 0 && s->discard_passthrough[type]) {
            update_refcount_discard(bs, cluster_offset, s->cluster_size);
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
    }

    /*
     * Try to undo any updates if an error is returned (this may succeed in
     * some cases, like ENOSPC for allocating a new refcount block).
     */
    if (ret < 0) {
        int dummy;
        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
                                !decrease, QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}

/*
 * Increases or decreases the refcount of a given cluster.
 *
 * @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added.
 *
 * On success 0 is returned; on failure -errno is returned.
 */
int qcow2_update_cluster_refcount(BlockDriverState *bs,
                                  int64_t cluster_index,
                                  uint64_t addend, bool decrease,
                                  enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
                          decrease, type);
    if (ret < 0) {
        return ret;
    }

    return 0;
}



/*********************************************************/
/* cluster allocation functions */



/* return < 0 if error */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t i, nb_clusters, refcount;
    int ret;

    /* We can't allocate clusters if they may still be queued for discard. */
    if (s->cache_discards) {
        qcow2_process_discards(bs, 0);
    }

    nb_clusters = size_to_clusters(s, size);
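    /* First-fit scan: advance free_cluster_index until nb_clusters
     * consecutive clusters with refcount 0 have been seen; any used cluster
     * in the candidate range restarts the scan just past it. */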
retry:
    for (i = 0; i < nb_clusters; i++) {
        uint64_t next_cluster_index = s->free_cluster_index++;
        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);

        if (ret < 0) {
            return ret;
        } else if (refcount != 0) {
            goto retry;
        }
    }

    /* Make sure that all offsets in the "allocated" range are representable
     * in an int64_t */
    if (s->free_cluster_index > 0 &&
        s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
    {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
            size,
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
#endif
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
}

int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
{
    int64_t offset;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
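    /* update_refcount() may itself have to allocate refcount blocks in the
     * range we just picked; it then returns -EAGAIN and we pick a new range. */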
    do {
        offset = alloc_clusters_noref(bs, size);
        if (offset < 0) {
            return offset;
        }

        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return offset;
}

int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
                                int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_index, refcount;
    uint64_t i;
    int ret;

    assert(nb_clusters >= 0);
    if (nb_clusters == 0) {
        return 0;
    }

    do {
        /* Check how many clusters there are free */
        cluster_index = offset >> s->cluster_bits;
        for (i = 0; i < nb_clusters; i++) {
            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
            if (ret < 0) {
                return ret;
            } else if (refcount != 0) {
                break;
            }
        }

        /* And then allocate them */
        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
                              QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return i;
}

/* Only used to allocate compressed sectors. We try to allocate contiguous
 * sectors. size must be <= cluster_size */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

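        /* Each byte allocation below increments the cluster's refcount once
         * more; if it is already saturated, start a fresh cluster instead. */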
        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}

void qcow2_free_clusters(BlockDriverState *bs,
                          int64_t offset, int64_t size,
                          enum qcow2_discard_type type)
{
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
    ret = update_refcount(bs, offset, size, 1, true, type);
    if (ret < 0) {
        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
        /* TODO Remember the clusters to free them later and avoid leaking */
    }
}

/*
 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
 * normal cluster, compressed cluster, etc.)
 */
void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
                             int nb_clusters, enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;

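    /* Compressed clusters store their length in the L2 entry as a count of
     * 512-byte sectors, so they are freed with sector granularity; normal and
     * zero clusters are freed as whole clusters via their offset field. */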
    switch (qcow2_get_cluster_type(l2_entry)) {
    case QCOW2_CLUSTER_COMPRESSED:
        {
            int nb_csectors;
            nb_csectors = ((l2_entry >> s->csize_shift) &
                           s->csize_mask) + 1;
            qcow2_free_clusters(bs,
                (l2_entry & s->cluster_offset_mask) & ~511,
                nb_csectors * 512, type);
        }
        break;
    case QCOW2_CLUSTER_NORMAL:
    case QCOW2_CLUSTER_ZERO:
        if (l2_entry & L2E_OFFSET_MASK) {
            if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
                qcow2_signal_corruption(bs, false, -1, -1,
                                        "Cannot free unaligned cluster %#"
                                        PRIx64, l2_entry & L2E_OFFSET_MASK);
            } else {
                qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
                                    nb_clusters << s->cluster_bits, type);
            }
        }
        break;
    case QCOW2_CLUSTER_UNALLOCATED:
        break;
    default:
        abort();
    }
}



/*********************************************************/
/* snapshots and image creation */



/* update the refcounts of snapshots and the copied flag */
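/*
 * @addend must be -1, 0 or 1: +1 adds one reference to every cluster reached
 * through the given L1 table, -1 drops one, and 0 leaves the refcounts alone
 * and only recomputes the QCOW_OFLAG_COPIED flags from the current refcounts.
 */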
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_offset, old_l2_offset;
    int i, j, l1_modified = 0, nb_csectors;
    int ret;

    assert(addend >= -1 && addend <= 1);

    l2_table = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);

    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        l1_table = g_try_malloc0(align_offset(l1_size2, 512));
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                (void **) &l2_table);
            if (ret < 0) {
                goto fail;
            }

            for (j = 0; j < s->l2_size; j++) {
                uint64_t cluster_index;

                offset = be64_to_cpu(l2_table[j]);
                old_offset = offset;
                offset &= ~QCOW_OFLAG_COPIED;

                switch (qcow2_get_cluster_type(offset)) {
                    case QCOW2_CLUSTER_COMPRESSED:
                        nb_csectors = ((offset >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        if (addend != 0) {
                            ret = update_refcount(bs,
                                (offset & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                        break;

                    case QCOW2_CLUSTER_NORMAL:
                    case QCOW2_CLUSTER_ZERO:
                        if (offset_into_cluster(s, offset & L2E_OFFSET_MASK)) {
                            qcow2_signal_corruption(bs, true, -1, -1, "Data "
                                                    "cluster offset %#" PRIx64
                                                    " unaligned (L2 offset: %#"
                                                    PRIx64 ", L2 index: %#x)",
                                                    offset & L2E_OFFSET_MASK,
                                                    l2_offset, j);
                            ret = -EIO;
                            goto fail;
                        }

                        cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
                        if (!cluster_index) {
                            /* unallocated */
                            refcount = 0;
                            break;
                        }
                        if (addend != 0) {
                            ret = qcow2_update_cluster_refcount(bs,
                                    cluster_index, abs(addend), addend < 0,
                                    QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }

                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                        if (ret < 0) {
                            goto fail;
                        }
                        break;

                    case QCOW2_CLUSTER_UNALLOCATED:
                        refcount = 0;
                        break;

                    default:
                        abort();
                }

                if (refcount == 1) {
                    offset |= QCOW_OFLAG_COPIED;
                }
                if (offset != old_offset) {
                    if (addend > 0) {
                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                            s->refcount_block_cache);
                    }
                    l2_table[j] = cpu_to_be64(offset);
                    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
                                                 l2_table);
                }
            }

            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_table) {
        qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
    }

    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
                               l1_table, l1_size2);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated) {
        g_free(l1_table);
    }
    return ret;
}




/*********************************************************/
/* refcount checking functions */


static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
{
    /* This assertion holds because there is no way we can address more than
     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
     * offsets have to be representable in bytes); due to every cluster
     * corresponding to one refcount entry, we are well below that limit */
    assert(entries < (UINT64_C(1) << (64 - 9)));

    /* Thanks to the assertion this will not overflow, because
     * s->refcount_order < 7.
     * (note: x << s->refcount_order == x * s->refcount_bits) */
    return DIV_ROUND_UP(entries << s->refcount_order, 8);
}

/**
 * Reallocates *array so that it can hold new_size entries. *size must contain
 * the current number of entries in *array. If the reallocation fails, *array
 * and *size will not be modified and -errno will be returned. If the
 * reallocation is successful, *array will be set to the new buffer, *size
 * will be set to new_size and 0 will be returned. The size of the reallocated
 * refcount array buffer will be aligned to a cluster boundary, and the newly
 * allocated area will be zeroed.
 */
static int realloc_refcount_array(BDRVQcow2State *s, void **array,
                                  int64_t *size, int64_t new_size)
{
    int64_t old_byte_size, new_byte_size;
    void *new_ptr;

    /* Round to clusters so the array can be directly written to disk */
    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
                    * s->cluster_size;
    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
                    * s->cluster_size;

    if (new_byte_size == old_byte_size) {
        *size = new_size;
        return 0;
    }

    assert(new_byte_size > 0);

    if (new_byte_size > SIZE_MAX) {
        return -ENOMEM;
    }

    new_ptr = g_try_realloc(*array, new_byte_size);
    if (!new_ptr) {
        return -ENOMEM;
    }

    if (new_byte_size > old_byte_size) {
        memset((char *)new_ptr + old_byte_size, 0,
               new_byte_size - old_byte_size);
    }

    *array = new_ptr;
    *size  = new_size;

    return 0;
}

/*
 * Increases the refcount for a range of clusters in a given refcount table.
 * This is used to construct a temporary refcount table out of L1 and L2 tables
 * which can be compared to the refcount table saved in the image.
 *
 * Modifies the number of errors in res.
 */
static int inc_refcounts(BlockDriverState *bs,
                         BdrvCheckResult *res,
                         void **refcount_table,
                         int64_t *refcount_table_size,
                         int64_t offset, int64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t start, last, cluster_offset, k, refcount;
    int ret;

    if (size <= 0) {
        return 0;
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + size - 1);
    for (cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size) {
        k = cluster_offset >> s->cluster_bits;
        if (k >= *refcount_table_size) {
            ret = realloc_refcount_array(s, refcount_table,
                                         refcount_table_size, k + 1);
            if (ret < 0) {
                res->check_errors++;
                return ret;
            }
        }

        refcount = s->get_refcount(*refcount_table, k);
        if (refcount == s->refcount_max) {
            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
                    "\n", cluster_offset);
            fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
                    "width or qemu-img convert to create a clean copy if the "
                    "image cannot be opened for writing\n");
            res->corruptions++;
            continue;
        }
        s->set_refcount(*refcount_table, k, refcount + 1);
    }

    return 0;
}

/* Flags for check_refcounts_l1() and check_refcounts_l2() */
enum {
    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
};

/*
 * Increases the refcount in the given refcount table for all clusters
 * referenced in the L2 table. While doing so, performs some checks on L2
 * entries.
 *
 * Returns the number of errors found by the checks or -errno if an internal
 * error occurred.
 */
static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
                              void **refcount_table,
                              int64_t *refcount_table_size, int64_t l2_offset,
                              int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table, l2_entry;
    uint64_t next_contiguous_offset = 0;
    int i, l2_size, nb_csectors, ret;

    /* Read L2 table from disk */
    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = g_malloc(l2_size);

    ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
    if (ret < 0) {
        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
        res->check_errors++;
        goto fail;
    }

    /* Do the actual checks */
    for (i = 0; i < s->l2_size; i++) {
        l2_entry = be64_to_cpu(l2_table[i]);

        switch (qcow2_get_cluster_type(l2_entry)) {
        case QCOW2_CLUSTER_COMPRESSED:
            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
            if (l2_entry & QCOW_OFLAG_COPIED) {
                fprintf(stderr, "ERROR: cluster %" PRId64 ": "
                    "copied flag must never be set for compressed "
                    "clusters\n", l2_entry >> s->cluster_bits);
                l2_entry &= ~QCOW_OFLAG_COPIED;
                res->corruptions++;
            }

            /* Mark cluster as used */
            nb_csectors = ((l2_entry >> s->csize_shift) &
                           s->csize_mask) + 1;
            l2_entry &= s->cluster_offset_mask;
            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
                                l2_entry & ~511, nb_csectors * 512);
            if (ret < 0) {
                goto fail;
            }

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                res->bfi.compressed_clusters++;

                /* Compressed clusters are fragmented by nature.  Since they
                 * take up sub-sector space but we only have sector granularity
                 * I/O we need to re-read the same sectors even for adjacent
                 * compressed clusters.
                 */
                res->bfi.fragmented_clusters++;
            }
            break;

        case QCOW2_CLUSTER_ZERO:
            if ((l2_entry & L2E_OFFSET_MASK) == 0) {
                break;
            }
            /* fall through */

        case QCOW2_CLUSTER_NORMAL:
        {
            uint64_t offset = l2_entry & L2E_OFFSET_MASK;

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                if (next_contiguous_offset &&
                    offset != next_contiguous_offset) {
                    res->bfi.fragmented_clusters++;
                }
                next_contiguous_offset = offset + s->cluster_size;
            }

            /* Mark cluster as used */
            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
                                offset, s->cluster_size);
            if (ret < 0) {
                goto fail;
            }

            /* Correct offsets are cluster aligned */
            if (offset_into_cluster(s, offset)) {
                fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
                    "properly aligned; L2 entry corrupted.\n", offset);
                res->corruptions++;
            }
            break;
        }

        case QCOW2_CLUSTER_UNALLOCATED:
            break;

        default:
            abort();
        }
    }

    g_free(l2_table);
    return 0;

fail:
    g_free(l2_table);
    return ret;
}
1478
1479/*
1480 * Increases the refcount for the L1 table, its L2 tables and all referenced
1481 * clusters in the given refcount table. While doing so, performs some checks
1482 * on L1 and L2 entries.
1483 *
1484 * Returns the number of errors found by the checks or -errno if an internal
1485 * error occurred.
1486 */
1487static int check_refcounts_l1(BlockDriverState *bs,
1488                              BdrvCheckResult *res,
1489                              void **refcount_table,
1490                              int64_t *refcount_table_size,
1491                              int64_t l1_table_offset, int l1_size,
1492                              int flags)
1493{
1494    BDRVQcow2State *s = bs->opaque;
1495    uint64_t *l1_table = NULL, l2_offset, l1_size2;
1496    int i, ret;
1497
1498    l1_size2 = l1_size * sizeof(uint64_t);
1499
1500    /* Mark L1 table as used */
1501    ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1502                        l1_table_offset, l1_size2);
1503    if (ret < 0) {
1504        goto fail;
1505    }
1506
1507    /* Read L1 table entries from disk */
1508    if (l1_size2 > 0) {
1509        l1_table = g_try_malloc(l1_size2);
1510        if (l1_table == NULL) {
1511            ret = -ENOMEM;
1512            res->check_errors++;
1513            goto fail;
1514        }
1515        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1516        if (ret < 0) {
1517            fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1518            res->check_errors++;
1519            goto fail;
1520        }
1521        for (i = 0; i < l1_size; i++)
1522            be64_to_cpus(&l1_table[i]);
1523    }
1524
1525    /* Do the actual checks */
1526    for (i = 0; i < l1_size; i++) {
1527        l2_offset = l1_table[i];
1528        if (l2_offset) {
1529            /* Mark L2 table as used */
1530            l2_offset &= L1E_OFFSET_MASK;
1531            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1532                                l2_offset, s->cluster_size);
1533            if (ret < 0) {
1534                goto fail;
1535            }
1536
1537            /* L2 tables are cluster aligned */
1538            if (offset_into_cluster(s, l2_offset)) {
1539                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1540                    "cluster aligned; L1 entry corrupted\n", l2_offset);
1541                res->corruptions++;
1542            }
1543
1544            /* Process and check L2 entries */
1545            ret = check_refcounts_l2(bs, res, refcount_table,
1546                                     refcount_table_size, l2_offset, flags);
1547            if (ret < 0) {
1548                goto fail;
1549            }
1550        }
1551    }
1552    g_free(l1_table);
1553    return 0;
1554
1555fail:
1556    g_free(l1_table);
1557    return ret;
1558}
1559
1560/*
1561 * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1562 *
1563 * This function does not print an error message nor does it increment
1564 * check_errors if qcow2_get_refcount fails (this is because such an error will
1565 * already have been detected and sufficiently signaled by the calling function
1566 * (qcow2_check_refcounts) by the time this function is called).
1567 */
1568static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1569                              BdrvCheckMode fix)
1570{
1571    BDRVQcow2State *s = bs->opaque;
1572    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1573    int ret;
1574    uint64_t refcount;
1575    int i, j;
1576
1577    for (i = 0; i < s->l1_size; i++) {
1578        uint64_t l1_entry = s->l1_table[i];
1579        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1580        bool l2_dirty = false;
1581
1582        if (!l2_offset) {
1583            continue;
1584        }
1585
1586        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1587                                 &refcount);
1588        if (ret < 0) {
1589            /* don't print message nor increment check_errors */
1590            continue;
1591        }
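        /* QCOW_OFLAG_COPIED must be set in an L1 entry exactly when the
         * refcount of the referenced L2 table is 1, i.e. when the table is
         * not shared with any snapshot and may be written to in place; the
         * check below repairs the flag when it disagrees with the refcount
         * (the same invariant is checked for data clusters further down). */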
1592        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1593            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1594                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1595                    fix & BDRV_FIX_ERRORS ? "Repairing" :
1596                                            "ERROR",
1597                    i, l1_entry, refcount);
1598            if (fix & BDRV_FIX_ERRORS) {
1599                s->l1_table[i] = refcount == 1
1600                               ? l1_entry |  QCOW_OFLAG_COPIED
1601                               : l1_entry & ~QCOW_OFLAG_COPIED;
1602                ret = qcow2_write_l1_entry(bs, i);
1603                if (ret < 0) {
1604                    res->check_errors++;
1605                    goto fail;
1606                }
1607                res->corruptions_fixed++;
1608            } else {
1609                res->corruptions++;
1610            }
1611        }
1612
1613        ret = bdrv_pread(bs->file, l2_offset, l2_table,
1614                         s->l2_size * sizeof(uint64_t));
1615        if (ret < 0) {
1616            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1617                    strerror(-ret));
1618            res->check_errors++;
1619            goto fail;
1620        }
1621
1622        for (j = 0; j < s->l2_size; j++) {
1623            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1624            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1625            int cluster_type = qcow2_get_cluster_type(l2_entry);
1626
1627            if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
1628                ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
1629                ret = qcow2_get_refcount(bs,
1630                                         data_offset >> s->cluster_bits,
1631                                         &refcount);
1632                if (ret < 0) {
1633                    /* don't print message nor increment check_errors */
1634                    continue;
1635                }
1636                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1637                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1638                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1639                            fix & BDRV_FIX_ERRORS ? "Repairing" :
1640                                                    "ERROR",
1641                            l2_entry, refcount);
1642                    if (fix & BDRV_FIX_ERRORS) {
1643                        l2_table[j] = cpu_to_be64(refcount == 1
1644                                    ? l2_entry |  QCOW_OFLAG_COPIED
1645                                    : l2_entry & ~QCOW_OFLAG_COPIED);
1646                        l2_dirty = true;
1647                        res->corruptions_fixed++;
1648                    } else {
1649                        res->corruptions++;
1650                    }
1651                }
1652            }
1653        }
1654
1655        if (l2_dirty) {
1656            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1657                                                l2_offset, s->cluster_size);
1658            if (ret < 0) {
1659                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1660                        "overlap check failed: %s\n", strerror(-ret));
1661                res->check_errors++;
1662                goto fail;
1663            }
1664
1665            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
1666                              s->cluster_size);
1667            if (ret < 0) {
1668                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1669                        strerror(-ret));
1670                res->check_errors++;
1671                goto fail;
1672            }
1673        }
1674    }
1675
1676    ret = 0;
1677
1678fail:
1679    qemu_vfree(l2_table);
1680    return ret;
1681}
1682
1683/*
1684 * Checks consistency of refblocks and accounts for each refblock in
1685 * *refcount_table.
1686 */
1687static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
1688                           BdrvCheckMode fix, bool *rebuild,
1689                           void **refcount_table, int64_t *nb_clusters)
1690{
1691    BDRVQcow2State *s = bs->opaque;
1692    int64_t i, size;
1693    int ret;
1694
1695    for (i = 0; i < s->refcount_table_size; i++) {
1696        uint64_t offset, cluster;
1697        offset = s->refcount_table[i];
1698        cluster = offset >> s->cluster_bits;
1699
1700        /* Refcount blocks are cluster aligned */
1701        if (offset_into_cluster(s, offset)) {
1702            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1703                "cluster aligned; refcount table entry corrupted\n", i);
1704            res->corruptions++;
1705            *rebuild = true;
1706            continue;
1707        }
1708
1709        if (cluster >= *nb_clusters) {
1710            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
1711                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
1712
1713            if (fix & BDRV_FIX_ERRORS) {
1714                int64_t new_nb_clusters;
1715
1716                if (offset > INT64_MAX - s->cluster_size) {
1717                    ret = -EINVAL;
1718                    goto resize_fail;
1719                }
1720
1721                ret = bdrv_truncate(bs->file->bs, offset + s->cluster_size);
1722                if (ret < 0) {
1723                    goto resize_fail;
1724                }
1725                size = bdrv_getlength(bs->file->bs);
1726                if (size < 0) {
1727                    ret = size;
1728                    goto resize_fail;
1729                }
1730
1731                new_nb_clusters = size_to_clusters(s, size);
1732                assert(new_nb_clusters >= *nb_clusters);
1733
1734                ret = realloc_refcount_array(s, refcount_table,
1735                                             nb_clusters, new_nb_clusters);
1736                if (ret < 0) {
1737                    res->check_errors++;
1738                    return ret;
1739                }
1740
1741                if (cluster >= *nb_clusters) {
1742                    ret = -EINVAL;
1743                    goto resize_fail;
1744                }
1745
1746                res->corruptions_fixed++;
1747                ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1748                                    offset, s->cluster_size);
1749                if (ret < 0) {
1750                    return ret;
1751                }
1752                /* No need to check whether the refcount is now greater than 1:
1753                 * This area was just allocated and zeroed, so it can only be
1754                 * exactly 1 after inc_refcounts() */
1755                continue;
1756
1757resize_fail:
1758                res->corruptions++;
1759                *rebuild = true;
1760                fprintf(stderr, "ERROR could not resize image: %s\n",
1761                        strerror(-ret));
1762            } else {
1763                res->corruptions++;
1764            }
1765            continue;
1766        }
1767
1768        if (offset != 0) {
1769            ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1770                                offset, s->cluster_size);
1771            if (ret < 0) {
1772                return ret;
1773            }
1774            if (s->get_refcount(*refcount_table, cluster) != 1) {
1775                fprintf(stderr, "ERROR refcount block %" PRId64
1776                        " refcount=%" PRIu64 "\n", i,
1777                        s->get_refcount(*refcount_table, cluster));
1778                res->corruptions++;
1779                *rebuild = true;
1780            }
1781        }
1782    }
1783
1784    return 0;
1785}
1786
1787/*
1788 * Calculates an in-memory refcount table.
1789 */
1790static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1791                               BdrvCheckMode fix, bool *rebuild,
1792                               void **refcount_table, int64_t *nb_clusters)
1793{
1794    BDRVQcow2State *s = bs->opaque;
1795    int64_t i;
1796    QCowSnapshot *sn;
1797    int ret;
1798
1799    if (!*refcount_table) {
1800        int64_t old_size = 0;
1801        ret = realloc_refcount_array(s, refcount_table,
1802                                     &old_size, *nb_clusters);
1803        if (ret < 0) {
1804            res->check_errors++;
1805            return ret;
1806        }
1807    }
1808
1809    /* header */
1810    ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1811                        0, s->cluster_size);
1812    if (ret < 0) {
1813        return ret;
1814    }
1815
1816    /* current L1 table */
1817    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1818                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1819    if (ret < 0) {
1820        return ret;
1821    }
1822
1823    /* snapshots */
1824    for (i = 0; i < s->nb_snapshots; i++) {
1825        sn = s->snapshots + i;
1826        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1827                                 sn->l1_table_offset, sn->l1_size, 0);
1828        if (ret < 0) {
1829            return ret;
1830        }
1831    }
1832    ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1833                        s->snapshots_offset, s->snapshots_size);
1834    if (ret < 0) {
1835        return ret;
1836    }
1837
1838    /* refcount data */
1839    ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1840                        s->refcount_table_offset,
1841                        s->refcount_table_size * sizeof(uint64_t));
1842    if (ret < 0) {
1843        return ret;
1844    }
1845
1846    return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
1847}
1848
1849/*
1850 * Compares the on-disk refcount of each cluster against the reference count
1851 * computed in refcount_table, repairing mismatches where fix permits it.
1852 */
1853static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1854                              BdrvCheckMode fix, bool *rebuild,
1855                              int64_t *highest_cluster,
1856                              void *refcount_table, int64_t nb_clusters)
1857{
1858    BDRVQcow2State *s = bs->opaque;
1859    int64_t i;
1860    uint64_t refcount1, refcount2;
1861    int ret;
1862
1863    for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
1864        ret = qcow2_get_refcount(bs, i, &refcount1);
1865        if (ret < 0) {
1866            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
1867                    i, strerror(-ret));
1868            res->check_errors++;
1869            continue;
1870        }
1871
1872        refcount2 = s->get_refcount(refcount_table, i);
1873
1874        if (refcount1 > 0 || refcount2 > 0) {
1875            *highest_cluster = i;
1876        }
1877
1878        if (refcount1 != refcount2) {
1879            /* Check if we're allowed to fix the mismatch */
1880            int *num_fixed = NULL;
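            /* If the on-disk refcount is 0 while the cluster is actually in
             * use, the allocator may hand that cluster out a second time, so
             * a plain refcount update is not a safe repair; flag the image
             * for a full refcount rebuild instead. */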
1881            if (refcount1 == 0) {
1882                *rebuild = true;
1883            } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
1884                num_fixed = &res->leaks_fixed;
1885            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
1886                num_fixed = &res->corruptions_fixed;
1887            }
1888
1889            fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
1890                    " reference=%" PRIu64 "\n",
1891                   num_fixed != NULL     ? "Repairing" :
1892                   refcount1 < refcount2 ? "ERROR" :
1893                                           "Leaked",
1894                   i, refcount1, refcount2);
1895
1896            if (num_fixed) {
1897                ret = update_refcount(bs, i << s->cluster_bits, 1,
1898                                      refcount_diff(refcount1, refcount2),
1899                                      refcount1 > refcount2,
1900                                      QCOW2_DISCARD_ALWAYS);
1901                if (ret >= 0) {
1902                    (*num_fixed)++;
1903                    continue;
1904                }
1905            }
1906
1907            /* And if we couldn't, print an error */
1908            if (refcount1 < refcount2) {
1909                res->corruptions++;
1910            } else {
1911                res->leaks++;
1912            }
1913        }
1914    }
1915}
1916
1917/*
1918 * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
1919 * the on-disk refcount structures.
1920 *
1921 * On input, *first_free_cluster tells where to start looking, and need not
1922 * actually be a free cluster; the returned offset will not be before that
1923 * cluster.  On output, *first_free_cluster points to the first gap found, even
1924 * if that gap was too small to be used as the returned offset.
1925 *
1926 * Note that *first_free_cluster is a cluster index whereas the return value is
1927 * an offset.
1928 */
1929static int64_t alloc_clusters_imrt(BlockDriverState *bs,
1930                                   int cluster_count,
1931                                   void **refcount_table,
1932                                   int64_t *imrt_nb_clusters,
1933                                   int64_t *first_free_cluster)
1934{
1935    BDRVQcow2State *s = bs->opaque;
1936    int64_t cluster = *first_free_cluster, i;
1937    bool first_gap = true;
1938    int contiguous_free_clusters;
1939    int ret;
1940
1941    /* Starting at *first_free_cluster, find a range of at least cluster_count
1942     * continuously free clusters */
1943    for (contiguous_free_clusters = 0;
1944         cluster < *imrt_nb_clusters &&
1945         contiguous_free_clusters < cluster_count;
1946         cluster++)
1947    {
1948        if (!s->get_refcount(*refcount_table, cluster)) {
1949            contiguous_free_clusters++;
1950            if (first_gap) {
1951                /* If this is the first free cluster found, update
1952                 * *first_free_cluster accordingly */
1953                *first_free_cluster = cluster;
1954                first_gap = false;
1955            }
1956        } else if (contiguous_free_clusters) {
1957            contiguous_free_clusters = 0;
1958        }
1959    }
1960
1961    /* If contiguous_free_clusters is greater than zero, it contains the number
1962     * of continuously free clusters until the current cluster; the first free
1963     * cluster in the current "gap" is therefore
1964     * cluster - contiguous_free_clusters */
1965
1966    /* If no such range could be found, grow the in-memory refcount table
1967     * accordingly to append free clusters at the end of the image */
1968    if (contiguous_free_clusters < cluster_count) {
1969        /* contiguous_free_clusters clusters are already empty at the image end;
1970         * we need cluster_count clusters; therefore, we have to allocate
1971         * cluster_count - contiguous_free_clusters new clusters at the end of
1972         * the image (which is the current value of cluster; note that cluster
1973         * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
1974         * the image end) */
1975        ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
1976                                     cluster + cluster_count
1977                                     - contiguous_free_clusters);
1978        if (ret < 0) {
1979            return ret;
1980        }
1981    }
1982
1983    /* Go back to the first free cluster */
1984    cluster -= contiguous_free_clusters;
1985    for (i = 0; i < cluster_count; i++) {
1986        s->set_refcount(*refcount_table, cluster + i, 1);
1987    }
1988
1989    return cluster << s->cluster_bits;
1990}
1991
1992/*
1993 * Creates a new refcount structure based solely on the in-memory information
1994 * given through *refcount_table. All necessary allocations will be reflected
1995 * in that array.
1996 *
1997 * On success, the old refcount structure is leaked (it will be covered by the
1998 * new refcount structure).
1999 */
2000static int rebuild_refcount_structure(BlockDriverState *bs,
2001                                      BdrvCheckResult *res,
2002                                      void **refcount_table,
2003                                      int64_t *nb_clusters)
2004{
2005    BDRVQcow2State *s = bs->opaque;
2006    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
2007    int64_t refblock_offset, refblock_start, refblock_index;
2008    uint32_t reftable_size = 0;
2009    uint64_t *on_disk_reftable = NULL;
2010    void *on_disk_refblock;
2011    int ret = 0;
2012    struct {
2013        uint64_t reftable_offset;
2014        uint32_t reftable_clusters;
2015    } QEMU_PACKED reftable_offset_and_clusters;
2016
2017    qcow2_cache_empty(bs, s->refcount_block_cache);
2018
2019write_refblocks:
2020    for (; cluster < *nb_clusters; cluster++) {
2021        if (!s->get_refcount(*refcount_table, cluster)) {
2022            continue;
2023        }
2024
2025        refblock_index = cluster >> s->refcount_block_bits;
2026        refblock_start = refblock_index << s->refcount_block_bits;
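        /* Each refblock describes s->refcount_block_size clusters; as an
         * illustrative example (numbers not taken from this file), with
         * 64 KiB clusters and 16-bit refcount entries a refblock holds
         * 32768 entries and thus covers 2 GiB of the image file. */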
2027
2028        /* Don't allocate a cluster in a refblock already written to disk */
2029        if (first_free_cluster < refblock_start) {
2030            first_free_cluster = refblock_start;
2031        }
2032        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2033                                              nb_clusters, &first_free_cluster);
2034        if (refblock_offset < 0) {
2035            fprintf(stderr, "ERROR allocating refblock: %s\n",
2036                    strerror(-refblock_offset));
2037            res->check_errors++;
2038            ret = refblock_offset;
2039            goto fail;
2040        }
2041
2042        if (reftable_size <= refblock_index) {
2043            uint32_t old_reftable_size = reftable_size;
2044            uint64_t *new_on_disk_reftable;
2045
2046            reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
2047                                     s->cluster_size) / sizeof(uint64_t);
2048            new_on_disk_reftable = g_try_realloc(on_disk_reftable,
2049                                                 reftable_size *
2050                                                 sizeof(uint64_t));
2051            if (!new_on_disk_reftable) {
2052                res->check_errors++;
2053                ret = -ENOMEM;
2054                goto fail;
2055            }
2056            on_disk_reftable = new_on_disk_reftable;
2057
2058            memset(on_disk_reftable + old_reftable_size, 0,
2059                   (reftable_size - old_reftable_size) * sizeof(uint64_t));
2060
2061            /* The offset we have for the reftable is now no longer valid;
2062             * this will leak that range, but we can easily fix that by running
2063             * a leak-fixing check after this rebuild operation */
2064            reftable_offset = -1;
2065        }
2066        on_disk_reftable[refblock_index] = refblock_offset;
2067
2068        /* If this is apparently the last refblock (for now), try to squeeze the
2069         * reftable in */
2070        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
2071            reftable_offset < 0)
2072        {
2073            uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
2074                                                          sizeof(uint64_t));
2075            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2076                                                  refcount_table, nb_clusters,
2077                                                  &first_free_cluster);
2078            if (reftable_offset < 0) {
2079                fprintf(stderr, "ERROR allocating reftable: %s\n",
2080                        strerror(-reftable_offset));
2081                res->check_errors++;
2082                ret = reftable_offset;
2083                goto fail;
2084            }
2085        }
2086
2087        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2088                                            s->cluster_size);
2089        if (ret < 0) {
2090            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2091            goto fail;
2092        }
2093
2094        /* The size of *refcount_table is always cluster-aligned, therefore the
2095         * write operation will not overflow */
2096        on_disk_refblock = (void *)((char *) *refcount_table +
2097                                    refblock_index * s->cluster_size);
2098
2099        ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE,
2100                         on_disk_refblock, s->cluster_sectors);
2101        if (ret < 0) {
2102            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2103            goto fail;
2104        }
2105
2106        /* Go to the end of this refblock */
2107        cluster = refblock_start + s->refcount_block_size - 1;
2108    }
2109
2110    if (reftable_offset < 0) {
2111        uint64_t post_refblock_start, reftable_clusters;
2112
2113        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
2114        reftable_clusters = size_to_clusters(s,
2115                                             reftable_size * sizeof(uint64_t));
2116        /* Not pretty but simple */
2117        if (first_free_cluster < post_refblock_start) {
2118            first_free_cluster = post_refblock_start;
2119        }
2120        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2121                                              refcount_table, nb_clusters,
2122                                              &first_free_cluster);
2123        if (reftable_offset < 0) {
2124            fprintf(stderr, "ERROR allocating reftable: %s\n",
2125                    strerror(-reftable_offset));
2126            res->check_errors++;
2127            ret = reftable_offset;
2128            goto fail;
2129        }
2130
2131        goto write_refblocks;
2132    }
2133
2134    assert(on_disk_reftable);
2135
2136    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2137        cpu_to_be64s(&on_disk_reftable[refblock_index]);
2138    }
2139
2140    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
2141                                        reftable_size * sizeof(uint64_t));
2142    if (ret < 0) {
2143        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2144        goto fail;
2145    }
2146
2147    assert(reftable_size < INT_MAX / sizeof(uint64_t));
2148    ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
2149                      reftable_size * sizeof(uint64_t));
2150    if (ret < 0) {
2151        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2152        goto fail;
2153    }
2154
2155    /* Enter new reftable into the image header */
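    /* refcount_table_offset and refcount_table_clusters are adjacent in
     * QCowHeader, so the packed struct below updates both fields with a
     * single write. */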
2156    reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
2157    reftable_offset_and_clusters.reftable_clusters =
2158        cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t)));
2159    ret = bdrv_pwrite_sync(bs->file,
2160                           offsetof(QCowHeader, refcount_table_offset),
2161                           &reftable_offset_and_clusters,
2162                           sizeof(reftable_offset_and_clusters));
2163    if (ret < 0) {
2164        fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
2165        goto fail;
2166    }
2167
2168    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2169        be64_to_cpus(&on_disk_reftable[refblock_index]);
2170    }
2171    s->refcount_table = on_disk_reftable;
2172    s->refcount_table_offset = reftable_offset;
2173    s->refcount_table_size = reftable_size;
2174
2175    return 0;
2176
2177fail:
2178    g_free(on_disk_reftable);
2179    return ret;
2180}
2181
2182/*
2183 * Checks an image for refcount consistency.
2184 *
2185 * Returns 0 if the check completed (any corruptions or leaks found are
2186 * reported in *res), and -errno when an internal error occurred.
2187 */
2188int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2189                          BdrvCheckMode fix)
2190{
2191    BDRVQcow2State *s = bs->opaque;
2192    BdrvCheckResult pre_compare_res;
2193    int64_t size, highest_cluster, nb_clusters;
2194    void *refcount_table = NULL;
2195    bool rebuild = false;
2196    int ret;
2197
2198    size = bdrv_getlength(bs->file->bs);
2199    if (size < 0) {
2200        res->check_errors++;
2201        return size;
2202    }
2203
2204    nb_clusters = size_to_clusters(s, size);
2205    if (nb_clusters > INT_MAX) {
2206        res->check_errors++;
2207        return -EFBIG;
2208    }
2209
2210    res->bfi.total_clusters =
2211        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2212
2213    ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2214                              &nb_clusters);
2215    if (ret < 0) {
2216        goto fail;
2217    }
2218
2219    /* In case we don't need to rebuild the refcount structure (but want to fix
2220     * something), compare_refcounts() is called again further down, in which
2221     * case this first result should be ignored */
2222    pre_compare_res = *res;
2223    compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2224                      nb_clusters);
2225
2226    if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2227        BdrvCheckResult old_res = *res;
2228        int fresh_leaks = 0;
2229
2230        fprintf(stderr, "Rebuilding refcount structure\n");
2231        ret = rebuild_refcount_structure(bs, res, &refcount_table,
2232                                         &nb_clusters);
2233        if (ret < 0) {
2234            goto fail;
2235        }
2236
2237        res->corruptions = 0;
2238        res->leaks = 0;
2239
2240        /* Because the old reftable has been exchanged for a new one the
2241         * references have to be recalculated */
2242        rebuild = false;
2243        memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2244        ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2245                                  &nb_clusters);
2246        if (ret < 0) {
2247            goto fail;
2248        }
2249
2250        if (fix & BDRV_FIX_LEAKS) {
2251            /* The old refcount structures are now leaked, fix it; the result
2252             * can be ignored, aside from leaks which were introduced by
2253             * rebuild_refcount_structure() that could not be fixed */
2254            BdrvCheckResult saved_res = *res;
2255            *res = (BdrvCheckResult){ 0 };
2256
2257            compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2258                              &highest_cluster, refcount_table, nb_clusters);
2259            if (rebuild) {
2260                fprintf(stderr, "ERROR rebuilt refcount structure is still "
2261                        "broken\n");
2262            }
2263
2264            /* Any leaks accounted for here were introduced by
2265             * rebuild_refcount_structure() because that function has created a
2266             * new refcount structure from scratch */
2267            fresh_leaks = res->leaks;
2268            *res = saved_res;
2269        }
2270
2271        if (res->corruptions < old_res.corruptions) {
2272            res->corruptions_fixed += old_res.corruptions - res->corruptions;
2273        }
2274        if (res->leaks < old_res.leaks) {
2275            res->leaks_fixed += old_res.leaks - res->leaks;
2276        }
2277        res->leaks += fresh_leaks;
2278    } else if (fix) {
2279        if (rebuild) {
2280            fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2281            res->check_errors++;
2282            ret = -EIO;
2283            goto fail;
2284        }
2285
2286        if (res->leaks || res->corruptions) {
2287            *res = pre_compare_res;
2288            compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2289                              refcount_table, nb_clusters);
2290        }
2291    }
2292
2293    /* check OFLAG_COPIED */
2294    ret = check_oflag_copied(bs, res, fix);
2295    if (ret < 0) {
2296        goto fail;
2297    }
2298
2299    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2300    ret = 0;
2301
2302fail:
2303    g_free(refcount_table);
2304
2305    return ret;
2306}
2307
2308#define overlaps_with(ofs, sz) \
2309    ranges_overlap(offset, size, ofs, sz)
2310
2311/*
2312 * Checks if the given offset into the image file is actually free to use by
2313 * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2314 * i.e. a sanity check without relying on the refcount tables.
2315 *
2316 * The ign parameter specifies what checks not to perform (being a bitmask of
2317 * QCow2MetadataOverlap values), i.e., what sections to ignore.
2318 *
2319 * Returns:
2320 * - 0 if writing to this offset will not affect the mentioned metadata
2321 * - a positive QCow2MetadataOverlap value indicating one overlapping section
2322 * - a negative value (-errno) indicating an error while performing a check,
2323 *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
2324 */
2325int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2326                                 int64_t size)
2327{
2328    BDRVQcow2State *s = bs->opaque;
2329    int chk = s->overlap_check & ~ign;
2330    int i, j;
2331
2332    if (!size) {
2333        return 0;
2334    }
2335
2336    if (chk & QCOW2_OL_MAIN_HEADER) {
2337        if (offset < s->cluster_size) {
2338            return QCOW2_OL_MAIN_HEADER;
2339        }
2340    }
2341
2342    /* align the range to be tested to cluster boundaries */
2343    size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
2344    offset = start_of_cluster(s, offset);
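    /* Checking at cluster granularity is safe because all qcow2 allocations
     * are cluster-granular: no legitimate write shares a cluster with
     * metadata, so widening the range does not flag valid writes. */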
2345
2346    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2347        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
2348            return QCOW2_OL_ACTIVE_L1;
2349        }
2350    }
2351
2352    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2353        if (overlaps_with(s->refcount_table_offset,
2354            s->refcount_table_size * sizeof(uint64_t))) {
2355            return QCOW2_OL_REFCOUNT_TABLE;
2356        }
2357    }
2358
2359    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2360        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2361            return QCOW2_OL_SNAPSHOT_TABLE;
2362        }
2363    }
2364
2365    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2366        for (i = 0; i < s->nb_snapshots; i++) {
2367            if (s->snapshots[i].l1_size &&
2368                overlaps_with(s->snapshots[i].l1_table_offset,
2369                s->snapshots[i].l1_size * sizeof(uint64_t))) {
2370                return QCOW2_OL_INACTIVE_L1;
2371            }
2372        }
2373    }
2374
2375    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2376        for (i = 0; i < s->l1_size; i++) {
2377            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2378                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2379                s->cluster_size)) {
2380                return QCOW2_OL_ACTIVE_L2;
2381            }
2382        }
2383    }
2384
2385    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2386        for (i = 0; i < s->refcount_table_size; i++) {
2387            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2388                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2389                s->cluster_size)) {
2390                return QCOW2_OL_REFCOUNT_BLOCK;
2391            }
2392        }
2393    }
2394
2395    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2396        for (i = 0; i < s->nb_snapshots; i++) {
2397            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2398            uint32_t l1_sz  = s->snapshots[i].l1_size;
2399            uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
2400            uint64_t *l1 = g_try_malloc(l1_sz2);
2401            int ret;
2402
2403            if (l1_sz2 && l1 == NULL) {
2404                return -ENOMEM;
2405            }
2406
2407            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
2408            if (ret < 0) {
2409                g_free(l1);
2410                return ret;
2411            }
2412
2413            for (j = 0; j < l1_sz; j++) {
2414                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2415                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2416                    g_free(l1);
2417                    return QCOW2_OL_INACTIVE_L2;
2418                }
2419            }
2420
2421            g_free(l1);
2422        }
2423    }
2424
2425    return 0;
2426}
2427
2428static const char *metadata_ol_names[] = {
2429    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
2430    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
2431    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
2432    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
2433    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
2434    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
2435    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
2436    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
2437};
2438
2439/*
2440 * First performs a check for metadata overlaps (through
2441 * qcow2_check_metadata_overlap); if that fails with a negative value (error
2442 * while performing a check), that value is returned. If an impending overlap
2443 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2444 * and -EIO returned.
2445 *
2446 * Returns 0 if there were neither overlaps nor errors while checking for
2447 * overlaps; or a negative value (-errno) on error.
2448 */
2449int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2450                                  int64_t size)
2451{
2452    int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2453
2454    if (ret < 0) {
2455        return ret;
2456    } else if (ret > 0) {
2457        int metadata_ol_bitnr = ctz32(ret);
2458        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2459
2460        qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2461                                "write on metadata (overlaps with %s)",
2462                                metadata_ol_names[metadata_ol_bitnr]);
2463        return -EIO;
2464    }
2465
2466    return 0;
2467}
2468
2469/* A pointer to a function of this type is given to walk_over_reftable(). That
2470 * function will create refblocks and pass them to a RefblockFinishOp once they
2471 * are completed (@refblock). @refblock_empty is set if the refblock is
2472 * completely empty.
2473 *
2474 * Along with the refblock, a corresponding reftable entry is passed, in the
2475 * reftable @reftable (which may be reallocated) at @reftable_index.
2476 *
2477 * @allocated should be set to true if a new cluster has been allocated.
2478 */
2479typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
2480                               uint64_t reftable_index, uint64_t *reftable_size,
2481                               void *refblock, bool refblock_empty,
2482                               bool *allocated, Error **errp);
2483
2484/**
2485 * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2486 * it is not empty) and inserts its offset into the new reftable. The size of
2487 * this new reftable is increased as required.
2488 */
2489static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2490                          uint64_t reftable_index, uint64_t *reftable_size,
2491                          void *refblock, bool refblock_empty, bool *allocated,
2492                          Error **errp)
2493{
2494    BDRVQcow2State *s = bs->opaque;
2495    int64_t offset;
2496
2497    if (!refblock_empty && reftable_index >= *reftable_size) {
2498        uint64_t *new_reftable;
2499        uint64_t new_reftable_size;
2500
2501        new_reftable_size = ROUND_UP(reftable_index + 1,
2502                                     s->cluster_size / sizeof(uint64_t));
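        /* Rounds the entry count up to a whole number of clusters' worth of
         * reftable entries; for instance (example values only), with 64 KiB
         * clusters one reftable cluster holds 8192 entries. */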
2503        if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
2504            error_setg(errp,
2505                       "This operation would make the refcount table grow "
2506                       "beyond the maximum size supported by QEMU, aborting");
2507            return -ENOTSUP;
2508        }
2509
2510        new_reftable = g_try_realloc(*reftable, new_reftable_size *
2511                                                sizeof(uint64_t));
2512        if (!new_reftable) {
2513            error_setg(errp, "Failed to increase reftable buffer size");
2514            return -ENOMEM;
2515        }
2516
2517        memset(new_reftable + *reftable_size, 0,
2518               (new_reftable_size - *reftable_size) * sizeof(uint64_t));
2519
2520        *reftable      = new_reftable;
2521        *reftable_size = new_reftable_size;
2522    }
2523
2524    if (!refblock_empty && !(*reftable)[reftable_index]) {
2525        offset = qcow2_alloc_clusters(bs, s->cluster_size);
2526        if (offset < 0) {
2527            error_setg_errno(errp, -offset, "Failed to allocate refblock");
2528            return offset;
2529        }
2530        (*reftable)[reftable_index] = offset;
2531        *allocated = true;
2532    }
2533
2534    return 0;
2535}
2536
2537/**
2538 * This "operation" for walk_over_reftable() writes the refblock to disk at the
2539 * offset specified by the new reftable's entry. It does not modify the new
2540 * reftable or change any refcounts.
2541 */
2542static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2543                          uint64_t reftable_index, uint64_t *reftable_size,
2544                          void *refblock, bool refblock_empty, bool *allocated,
2545                          Error **errp)
2546{
2547    BDRVQcow2State *s = bs->opaque;
2548    int64_t offset;
2549    int ret;
2550
2551    if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2552        offset = (*reftable)[reftable_index];
2553
2554        ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
2555        if (ret < 0) {
2556            error_setg_errno(errp, -ret, "Overlap check failed");
2557            return ret;
2558        }
2559
2560        ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
2561        if (ret < 0) {
2562            error_setg_errno(errp, -ret, "Failed to write refblock");
2563            return ret;
2564        }
2565    } else {
2566        assert(refblock_empty);
2567    }
2568
2569    return 0;
2570}
2571
2572/**
2573 * This function walks over the existing reftable and every referenced refblock;
2574 * if @new_set_refcount is non-NULL, it is called for every refcount entry so
2575 * that an equivalent entry is stored in the passed @new_refblock. Once that
2576 * @new_refblock is completely filled, @operation will be called.
2577 *
2578 * @status_cb and @cb_opaque are used for the amend operation's status callback.
2579 * @index is the index of the walk_over_reftable() calls and @total is the total
2580 * number of walk_over_reftable() calls per amend operation. Both are used for
2581 * calculating the parameters for the status callback.
2582 *
2583 * @allocated is set to true if a new cluster has been allocated.
2584 */
2585static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
2586                              uint64_t *new_reftable_index,
2587                              uint64_t *new_reftable_size,
2588                              void *new_refblock, int new_refblock_size,
2589                              int new_refcount_bits,
2590                              RefblockFinishOp *operation, bool *allocated,
2591                              Qcow2SetRefcountFunc *new_set_refcount,
2592                              BlockDriverAmendStatusCB *status_cb,
2593                              void *cb_opaque, int index, int total,
2594                              Error **errp)
2595{
2596    BDRVQcow2State *s = bs->opaque;
2597    uint64_t reftable_index;
2598    bool new_refblock_empty = true;
2599    int refblock_index;
2600    int new_refblock_index = 0;
2601    int ret;
2602
2603    for (reftable_index = 0; reftable_index < s->refcount_table_size;
2604         reftable_index++)
2605    {
2606        uint64_t refblock_offset = s->refcount_table[reftable_index]
2607                                 & REFT_OFFSET_MASK;
2608
2609        status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
2610                  (uint64_t)total * s->refcount_table_size, cb_opaque);
2611
2612        if (refblock_offset) {
2613            void *refblock;
2614
2615            if (offset_into_cluster(s, refblock_offset)) {
2616                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
2617                                        PRIx64 " unaligned (reftable index: %#"
2618                                        PRIx64 ")", refblock_offset,
2619                                        reftable_index);
2620                error_setg(errp,
2621                           "Image is corrupt (unaligned refblock offset)");
2622                return -EIO;
2623            }
2624
2625            ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
2626                                  &refblock);
2627            if (ret < 0) {
2628                error_setg_errno(errp, -ret, "Failed to retrieve refblock");
2629                return ret;
2630            }
2631
2632            for (refblock_index = 0; refblock_index < s->refcount_block_size;
2633                 refblock_index++)
2634            {
2635                uint64_t refcount;
2636
2637                if (new_refblock_index >= new_refblock_size) {
2638                    /* new_refblock is now complete */
2639                    ret = operation(bs, new_reftable, *new_reftable_index,
2640                                    new_reftable_size, new_refblock,
2641                                    new_refblock_empty, allocated, errp);
2642                    if (ret < 0) {
2643                        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2644                        return ret;
2645                    }
2646
2647                    (*new_reftable_index)++;
2648                    new_refblock_index = 0;
2649                    new_refblock_empty = true;
2650                }
2651
2652                refcount = s->get_refcount(refblock, refblock_index);
2653                if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
2654                    uint64_t offset;
2655
2656                    qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2657
2658                    offset = ((reftable_index << s->refcount_block_bits)
2659                              + refblock_index) << s->cluster_bits;
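                    /* The entry at (reftable_index, refblock_index) describes
                     * cluster number reftable_index * refcount_block_size +
                     * refblock_index; shifting by cluster_bits turns that
                     * cluster index back into a host offset for the error
                     * message. */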
2660
2661                    error_setg(errp, "Cannot decrease refcount entry width to "
2662                               "%i bits: Cluster at offset %#" PRIx64 " has a "
2663                               "refcount of %" PRIu64, new_refcount_bits,
2664                               offset, refcount);
2665                    return -EINVAL;
2666                }
2667
2668                if (new_set_refcount) {
2669                    new_set_refcount(new_refblock, new_refblock_index++,
2670                                     refcount);
2671                } else {
2672                    new_refblock_index++;
2673                }
2674                new_refblock_empty = new_refblock_empty && refcount == 0;
2675            }
2676
2677            qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2678        } else {
2679            /* No refblock means every refcount is 0 */
2680            for (refblock_index = 0; refblock_index < s->refcount_block_size;
2681                 refblock_index++)
2682            {
2683                if (new_refblock_index >= new_refblock_size) {
2684                    /* new_refblock is now complete */
2685                    ret = operation(bs, new_reftable, *new_reftable_index,
2686                                    new_reftable_size, new_refblock,
2687                                    new_refblock_empty, allocated, errp);
2688                    if (ret < 0) {
2689                        return ret;
2690                    }
2691
2692                    (*new_reftable_index)++;
2693                    new_refblock_index = 0;
2694                    new_refblock_empty = true;
2695                }
2696
2697                if (new_set_refcount) {
2698                    new_set_refcount(new_refblock, new_refblock_index++, 0);
2699                } else {
2700                    new_refblock_index++;
2701                }
2702            }
2703        }
2704    }
2705
2706    if (new_refblock_index > 0) {
2707        /* Complete the potentially existing partially filled final refblock */
2708        if (new_set_refcount) {
2709            for (; new_refblock_index < new_refblock_size;
2710                 new_refblock_index++)
2711            {
2712                new_set_refcount(new_refblock, new_refblock_index, 0);
2713            }
2714        }
2715
2716        ret = operation(bs, new_reftable, *new_reftable_index,
2717                        new_reftable_size, new_refblock, new_refblock_empty,
2718                        allocated, errp);
2719        if (ret < 0) {
2720            return ret;
2721        }
2722
2723        (*new_reftable_index)++;
2724    }
2725
2726    status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
2727              (uint64_t)total * s->refcount_table_size, cb_opaque);
2728
2729    return 0;
2730}
2731
2732int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
2733                                BlockDriverAmendStatusCB *status_cb,
2734                                void *cb_opaque, Error **errp)
2735{
2736    BDRVQcow2State *s = bs->opaque;
2737    Qcow2GetRefcountFunc *new_get_refcount;
2738    Qcow2SetRefcountFunc *new_set_refcount;
2739    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
2740    uint64_t *new_reftable = NULL, new_reftable_size = 0;
2741    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
2742    uint64_t new_reftable_index = 0;
2743    uint64_t i;
2744    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
2745    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
2746    int old_refcount_order;
2747    int walk_index = 0;
2748    int ret;
2749    bool new_allocation;
2750
2751    assert(s->qcow_version >= 3);
2752    assert(refcount_order >= 0 && refcount_order <= 6);
2753
2754    /* see qcow2_open() */
2755    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
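    /* A refblock always spans one cluster, so it holds
     * cluster_size * 8 / refcount_bits entries; e.g. (example values only)
     * 64 KiB clusters with refcount_order = 4 (16-bit entries) give
     * 1 << (16 - 1) = 32768 entries per refblock. */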
2756
2757    new_get_refcount = get_refcount_funcs[refcount_order];
2758    new_set_refcount = set_refcount_funcs[refcount_order];
2759
2760
2761    do {
2762        int total_walks;
2763
2764        new_allocation = false;
2765
2766        /* We have to do at least this walk and the one that writes the
2767         * refblocks; in addition, this loop normally runs at least twice,
2768         * first to perform the allocations and then to verify that
2769         * everything has been allocated correctly, which makes three walks
2770         * in total */
2771        total_walks = MAX(walk_index + 2, 3);
2772
2773        /* First, allocate the structures so they are present in the refcount
2774         * structures */
2775        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
2776                                 &new_reftable_size, NULL, new_refblock_size,
2777                                 new_refcount_bits, &alloc_refblock,
2778                                 &new_allocation, NULL, status_cb, cb_opaque,
2779                                 walk_index++, total_walks, errp);
2780        if (ret < 0) {
2781            goto done;
2782        }
2783
2784        new_reftable_index = 0;
2785
2786        if (new_allocation) {
2787            if (new_reftable_offset) {
2788                qcow2_free_clusters(bs, new_reftable_offset,
2789                                    allocated_reftable_size * sizeof(uint64_t),
2790                                    QCOW2_DISCARD_NEVER);
2791            }
2792
2793            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
2794                                                           sizeof(uint64_t));
2795            if (new_reftable_offset < 0) {
2796                error_setg_errno(errp, -new_reftable_offset,
2797                                 "Failed to allocate the new reftable");
2798                ret = new_reftable_offset;
2799                goto done;
2800            }
2801            allocated_reftable_size = new_reftable_size;
2802        }
2803    } while (new_allocation);
2804
2805    /* Second, write the new refblocks */
2806    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
2807                             &new_reftable_size, new_refblock,
2808                             new_refblock_size, new_refcount_bits,
2809                             &flush_refblock, &new_allocation, new_set_refcount,
2810                             status_cb, cb_opaque, walk_index, walk_index + 1,
2811                             errp);
2812    if (ret < 0) {
2813        goto done;
2814    }
2815    assert(!new_allocation);
2816
2817
2818    /* Write the new reftable */
2819    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
2820                                        new_reftable_size * sizeof(uint64_t));
2821    if (ret < 0) {
2822        error_setg_errno(errp, -ret, "Overlap check failed");
2823        goto done;
2824    }
2825
2826    for (i = 0; i < new_reftable_size; i++) {
2827        cpu_to_be64s(&new_reftable[i]);
2828    }
2829
2830    ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
2831                      new_reftable_size * sizeof(uint64_t));
2832
2833    for (i = 0; i < new_reftable_size; i++) {
2834        be64_to_cpus(&new_reftable[i]);
2835    }
2836
2837    if (ret < 0) {
2838        error_setg_errno(errp, -ret, "Failed to write the new reftable");
2839        goto done;
2840    }
2841
2842
2843    /* Empty the refcount cache */
2844    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
2845    if (ret < 0) {
2846        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
2847        goto done;
2848    }
2849
2850    /* Update the image header to point to the new reftable; this only updates
2851     * the fields which are relevant to qcow2_update_header(); other fields
2852     * such as s->refcount_table or s->refcount_bits stay stale for now
2853     * (because we have to restore everything if qcow2_update_header() fails) */
2854    old_refcount_order  = s->refcount_order;
2855    old_reftable_size   = s->refcount_table_size;
2856    old_reftable_offset = s->refcount_table_offset;
2857
2858    s->refcount_order        = refcount_order;
2859    s->refcount_table_size   = new_reftable_size;
2860    s->refcount_table_offset = new_reftable_offset;
2861
2862    ret = qcow2_update_header(bs);
2863    if (ret < 0) {
2864        s->refcount_order        = old_refcount_order;
2865        s->refcount_table_size   = old_reftable_size;
2866        s->refcount_table_offset = old_reftable_offset;
2867        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
2868        goto done;
2869    }
2870
2871    /* Now update the rest of the in-memory information */
2872    old_reftable = s->refcount_table;
2873    s->refcount_table = new_reftable;
2874
2875    s->refcount_bits = 1 << refcount_order;
2876    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
2877    s->refcount_max += s->refcount_max - 1;
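    /* The two steps above compute (1 << refcount_bits) - 1 without relying
     * on a left shift by 64, which would be undefined behaviour for
     * refcount_bits == 64; e.g. for 16-bit refcounts this yields
     * 0x8000 + 0x7fff = 0xffff. */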
2878
2879    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
2880    s->refcount_block_size = 1 << s->refcount_block_bits;
2881
2882    s->get_refcount = new_get_refcount;
2883    s->set_refcount = new_set_refcount;
2884
2885    /* For cleaning up all old refblocks and the old reftable below the "done"
2886     * label */
2887    new_reftable        = old_reftable;
2888    new_reftable_size   = old_reftable_size;
2889    new_reftable_offset = old_reftable_offset;
2890
2891done:
2892    if (new_reftable) {
2893        /* On success, new_reftable actually points to the old reftable (and
2894         * new_reftable_size is the old reftable's size); but that is just
2895         * fine */
2896        for (i = 0; i < new_reftable_size; i++) {
2897            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
2898            if (offset) {
2899                qcow2_free_clusters(bs, offset, s->cluster_size,
2900                                    QCOW2_DISCARD_OTHER);
2901            }
2902        }
2903        g_free(new_reftable);
2904
2905        if (new_reftable_offset > 0) {
2906            qcow2_free_clusters(bs, new_reftable_offset,
2907                                new_reftable_size * sizeof(uint64_t),
2908                                QCOW2_DISCARD_OTHER);
2909        }
2910    }
2911
2912    qemu_vfree(new_refblock);
2913    return ret;
2914}
2915