qemu/block/qcow2-refcount.c
<<
>>
Prefs
   1/*
   2 * Block driver for the QCOW version 2 format
   3 *
   4 * Copyright (c) 2004-2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qapi/error.h"
  27#include "qemu-common.h"
  28#include "block/block_int.h"
  29#include "block/qcow2.h"
  30#include "qemu/range.h"
  31
  32static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
  33static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
  34                            int64_t offset, int64_t length, uint64_t addend,
  35                            bool decrease, enum qcow2_discard_type type);
  36
  37static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
  38static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
  39static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
  40static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
  41static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
  42static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
  43static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
  44
  45static void set_refcount_ro0(void *refcount_array, uint64_t index,
  46                             uint64_t value);
  47static void set_refcount_ro1(void *refcount_array, uint64_t index,
  48                             uint64_t value);
  49static void set_refcount_ro2(void *refcount_array, uint64_t index,
  50                             uint64_t value);
  51static void set_refcount_ro3(void *refcount_array, uint64_t index,
  52                             uint64_t value);
  53static void set_refcount_ro4(void *refcount_array, uint64_t index,
  54                             uint64_t value);
  55static void set_refcount_ro5(void *refcount_array, uint64_t index,
  56                             uint64_t value);
  57static void set_refcount_ro6(void *refcount_array, uint64_t index,
  58                             uint64_t value);
  59
  60
  61static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
  62    &get_refcount_ro0,
  63    &get_refcount_ro1,
  64    &get_refcount_ro2,
  65    &get_refcount_ro3,
  66    &get_refcount_ro4,
  67    &get_refcount_ro5,
  68    &get_refcount_ro6
  69};
  70
  71static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
  72    &set_refcount_ro0,
  73    &set_refcount_ro1,
  74    &set_refcount_ro2,
  75    &set_refcount_ro3,
  76    &set_refcount_ro4,
  77    &set_refcount_ro5,
  78    &set_refcount_ro6
  79};
  80
  81
  82/*********************************************************/
  83/* refcount handling */
  84
  85int qcow2_refcount_init(BlockDriverState *bs)
  86{
  87    BDRVQcow2State *s = bs->opaque;
  88    unsigned int refcount_table_size2, i;
  89    int ret;
  90
  91    assert(s->refcount_order >= 0 && s->refcount_order <= 6);
  92
  93    s->get_refcount = get_refcount_funcs[s->refcount_order];
  94    s->set_refcount = set_refcount_funcs[s->refcount_order];
  95
  96    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
  97    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
  98    s->refcount_table = g_try_malloc(refcount_table_size2);
  99
 100    if (s->refcount_table_size > 0) {
 101        if (s->refcount_table == NULL) {
 102            ret = -ENOMEM;
 103            goto fail;
 104        }
 105        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
 106        ret = bdrv_pread(bs->file->bs, s->refcount_table_offset,
 107                         s->refcount_table, refcount_table_size2);
 108        if (ret < 0) {
 109            goto fail;
 110        }
 111        for(i = 0; i < s->refcount_table_size; i++)
 112            be64_to_cpus(&s->refcount_table[i]);
 113    }
 114    return 0;
 115 fail:
 116    return ret;
 117}
 118
 119void qcow2_refcount_close(BlockDriverState *bs)
 120{
 121    BDRVQcow2State *s = bs->opaque;
 122    g_free(s->refcount_table);
 123}
 124
 125
 126static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
 127{
 128    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
 129}
 130
 131static void set_refcount_ro0(void *refcount_array, uint64_t index,
 132                             uint64_t value)
 133{
 134    assert(!(value >> 1));
 135    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
 136    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
 137}
 138
 139static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
 140{
 141    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
 142           & 0x3;
 143}
 144
 145static void set_refcount_ro1(void *refcount_array, uint64_t index,
 146                             uint64_t value)
 147{
 148    assert(!(value >> 2));
 149    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
 150    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
 151}
 152
 153static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
 154{
 155    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
 156           & 0xf;
 157}
 158
 159static void set_refcount_ro2(void *refcount_array, uint64_t index,
 160                             uint64_t value)
 161{
 162    assert(!(value >> 4));
 163    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
 164    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
 165}
 166
 167static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
 168{
 169    return ((const uint8_t *)refcount_array)[index];
 170}
 171
 172static void set_refcount_ro3(void *refcount_array, uint64_t index,
 173                             uint64_t value)
 174{
 175    assert(!(value >> 8));
 176    ((uint8_t *)refcount_array)[index] = value;
 177}
 178
 179static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
 180{
 181    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
 182}
 183
 184static void set_refcount_ro4(void *refcount_array, uint64_t index,
 185                             uint64_t value)
 186{
 187    assert(!(value >> 16));
 188    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
 189}
 190
 191static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
 192{
 193    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
 194}
 195
 196static void set_refcount_ro5(void *refcount_array, uint64_t index,
 197                             uint64_t value)
 198{
 199    assert(!(value >> 32));
 200    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
 201}
 202
 203static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
 204{
 205    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
 206}
 207
 208static void set_refcount_ro6(void *refcount_array, uint64_t index,
 209                             uint64_t value)
 210{
 211    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
 212}
 213
 214
 215static int load_refcount_block(BlockDriverState *bs,
 216                               int64_t refcount_block_offset,
 217                               void **refcount_block)
 218{
 219    BDRVQcow2State *s = bs->opaque;
 220    int ret;
 221
 222    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
 223    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
 224        refcount_block);
 225
 226    return ret;
 227}
 228
 229/*
 230 * Retrieves the refcount of the cluster given by its index and stores it in
 231 * *refcount. Returns 0 on success and -errno on failure.
 232 */
 233int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
 234                       uint64_t *refcount)
 235{
 236    BDRVQcow2State *s = bs->opaque;
 237    uint64_t refcount_table_index, block_index;
 238    int64_t refcount_block_offset;
 239    int ret;
 240    void *refcount_block;
 241
 242    refcount_table_index = cluster_index >> s->refcount_block_bits;
 243    if (refcount_table_index >= s->refcount_table_size) {
 244        *refcount = 0;
 245        return 0;
 246    }
 247    refcount_block_offset =
 248        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
 249    if (!refcount_block_offset) {
 250        *refcount = 0;
 251        return 0;
 252    }
 253
 254    if (offset_into_cluster(s, refcount_block_offset)) {
 255        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
 256                                " unaligned (reftable index: %#" PRIx64 ")",
 257                                refcount_block_offset, refcount_table_index);
 258        return -EIO;
 259    }
 260
 261    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
 262                          &refcount_block);
 263    if (ret < 0) {
 264        return ret;
 265    }
 266
 267    block_index = cluster_index & (s->refcount_block_size - 1);
 268    *refcount = s->get_refcount(refcount_block, block_index);
 269
 270    qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
 271
 272    return 0;
 273}
 274
 275/*
 276 * Rounds the refcount table size up to avoid growing the table for each single
 277 * refcount block that is allocated.
 278 */
 279static unsigned int next_refcount_table_size(BDRVQcow2State *s,
 280    unsigned int min_size)
 281{
 282    unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
 283    unsigned int refcount_table_clusters =
 284        MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
 285
 286    while (min_clusters > refcount_table_clusters) {
 287        refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
 288    }
 289
 290    return refcount_table_clusters << (s->cluster_bits - 3);
 291}
 292
 293
 294/* Checks if two offsets are described by the same refcount block */
 295static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
 296    uint64_t offset_b)
 297{
 298    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
 299    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
 300
 301    return (block_a == block_b);
 302}
 303
 304/*
 305 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 306 * (including growing the refcount table if needed).
 307 *
 308 * Returns 0 on success or -errno in error case
 309 */
 310static int alloc_refcount_block(BlockDriverState *bs,
 311                                int64_t cluster_index, void **refcount_block)
 312{
 313    BDRVQcow2State *s = bs->opaque;
 314    unsigned int refcount_table_index;
 315    int ret;
 316
 317    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
 318
 319    /* Find the refcount block for the given cluster */
 320    refcount_table_index = cluster_index >> s->refcount_block_bits;
 321
 322    if (refcount_table_index < s->refcount_table_size) {
 323
 324        uint64_t refcount_block_offset =
 325            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
 326
 327        /* If it's already there, we're done */
 328        if (refcount_block_offset) {
 329            if (offset_into_cluster(s, refcount_block_offset)) {
 330                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
 331                                        PRIx64 " unaligned (reftable index: "
 332                                        "%#x)", refcount_block_offset,
 333                                        refcount_table_index);
 334                return -EIO;
 335            }
 336
 337             return load_refcount_block(bs, refcount_block_offset,
 338                                        refcount_block);
 339        }
 340    }
 341
 342    /*
 343     * If we came here, we need to allocate something. Something is at least
 344     * a cluster for the new refcount block. It may also include a new refcount
 345     * table if the old refcount table is too small.
 346     *
 347     * Note that allocating clusters here needs some special care:
 348     *
 349     * - We can't use the normal qcow2_alloc_clusters(), it would try to
 350     *   increase the refcount and very likely we would end up with an endless
 351     *   recursion. Instead we must place the refcount blocks in a way that
 352     *   they can describe them themselves.
 353     *
 354     * - We need to consider that at this point we are inside update_refcounts
 355     *   and potentially doing an initial refcount increase. This means that
 356     *   some clusters have already been allocated by the caller, but their
 357     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
 358     *   need to return -EAGAIN to signal the caller that it needs to restart
 359     *   the search for free clusters.
 360     *
 361     * - alloc_clusters_noref and qcow2_free_clusters may load a different
 362     *   refcount block into the cache
 363     */
 364
 365    *refcount_block = NULL;
 366
 367    /* We write to the refcount table, so we might depend on L2 tables */
 368    ret = qcow2_cache_flush(bs, s->l2_table_cache);
 369    if (ret < 0) {
 370        return ret;
 371    }
 372
 373    /* Allocate the refcount block itself and mark it as used */
 374    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
 375    if (new_block < 0) {
 376        return new_block;
 377    }
 378
 379#ifdef DEBUG_ALLOC2
 380    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
 381        " at %" PRIx64 "\n",
 382        refcount_table_index, cluster_index << s->cluster_bits, new_block);
 383#endif
 384
 385    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
 386        /* Zero the new refcount block before updating it */
 387        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
 388                                    refcount_block);
 389        if (ret < 0) {
 390            goto fail_block;
 391        }
 392
 393        memset(*refcount_block, 0, s->cluster_size);
 394
 395        /* The block describes itself, need to update the cache */
 396        int block_index = (new_block >> s->cluster_bits) &
 397            (s->refcount_block_size - 1);
 398        s->set_refcount(*refcount_block, block_index, 1);
 399    } else {
 400        /* Described somewhere else. This can recurse at most twice before we
 401         * arrive at a block that describes itself. */
 402        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
 403                              QCOW2_DISCARD_NEVER);
 404        if (ret < 0) {
 405            goto fail_block;
 406        }
 407
 408        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 409        if (ret < 0) {
 410            goto fail_block;
 411        }
 412
 413        /* Initialize the new refcount block only after updating its refcount,
 414         * update_refcount uses the refcount cache itself */
 415        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
 416                                    refcount_block);
 417        if (ret < 0) {
 418            goto fail_block;
 419        }
 420
 421        memset(*refcount_block, 0, s->cluster_size);
 422    }
 423
 424    /* Now the new refcount block needs to be written to disk */
 425    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
 426    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block);
 427    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 428    if (ret < 0) {
 429        goto fail_block;
 430    }
 431
 432    /* If the refcount table is big enough, just hook the block up there */
 433    if (refcount_table_index < s->refcount_table_size) {
 434        uint64_t data64 = cpu_to_be64(new_block);
 435        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
 436        ret = bdrv_pwrite_sync(bs->file->bs,
 437            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
 438            &data64, sizeof(data64));
 439        if (ret < 0) {
 440            goto fail_block;
 441        }
 442
 443        s->refcount_table[refcount_table_index] = new_block;
 444
 445        /* The new refcount block may be where the caller intended to put its
 446         * data, so let it restart the search. */
 447        return -EAGAIN;
 448    }
 449
 450    qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
 451
 452    /*
 453     * If we come here, we need to grow the refcount table. Again, a new
 454     * refcount table needs some space and we can't simply allocate to avoid
 455     * endless recursion.
 456     *
 457     * Therefore let's grab new refcount blocks at the end of the image, which
 458     * will describe themselves and the new refcount table. This way we can
 459     * reference them only in the new table and do the switch to the new
 460     * refcount table at once without producing an inconsistent state in
 461     * between.
 462     */
 463    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
 464
 465    /* Calculate the number of refcount blocks needed so far; this will be the
 466     * basis for calculating the index of the first cluster used for the
 467     * self-describing refcount structures which we are about to create.
 468     *
 469     * Because we reached this point, there cannot be any refcount entries for
 470     * cluster_index or higher indices yet. However, because new_block has been
 471     * allocated to describe that cluster (and it will assume this role later
 472     * on), we cannot use that index; also, new_block may actually have a higher
 473     * cluster index than cluster_index, so it needs to be taken into account
 474     * here (and 1 needs to be added to its value because that cluster is used).
 475     */
 476    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
 477                                            (new_block >> s->cluster_bits) + 1),
 478                                        s->refcount_block_size);
 479
 480    if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
 481        return -EFBIG;
 482    }
 483
 484    /* And now we need at least one block more for the new metadata */
 485    uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
 486    uint64_t last_table_size;
 487    uint64_t blocks_clusters;
 488    do {
 489        uint64_t table_clusters =
 490            size_to_clusters(s, table_size * sizeof(uint64_t));
 491        blocks_clusters = 1 +
 492            ((table_clusters + s->refcount_block_size - 1)
 493            / s->refcount_block_size);
 494        uint64_t meta_clusters = table_clusters + blocks_clusters;
 495
 496        last_table_size = table_size;
 497        table_size = next_refcount_table_size(s, blocks_used +
 498            ((meta_clusters + s->refcount_block_size - 1)
 499            / s->refcount_block_size));
 500
 501    } while (last_table_size != table_size);
 502
 503#ifdef DEBUG_ALLOC2
 504    fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
 505        s->refcount_table_size, table_size);
 506#endif
 507
 508    /* Create the new refcount table and blocks */
 509    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
 510        s->cluster_size;
 511    uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
 512    uint64_t *new_table = g_try_new0(uint64_t, table_size);
 513    void *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size);
 514
 515    assert(table_size > 0 && blocks_clusters > 0);
 516    if (new_table == NULL || new_blocks == NULL) {
 517        ret = -ENOMEM;
 518        goto fail_table;
 519    }
 520
 521    /* Fill the new refcount table */
 522    memcpy(new_table, s->refcount_table,
 523        s->refcount_table_size * sizeof(uint64_t));
 524    new_table[refcount_table_index] = new_block;
 525
 526    int i;
 527    for (i = 0; i < blocks_clusters; i++) {
 528        new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
 529    }
 530
 531    /* Fill the refcount blocks */
 532    uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
 533    int block = 0;
 534    for (i = 0; i < table_clusters + blocks_clusters; i++) {
 535        s->set_refcount(new_blocks, block++, 1);
 536    }
 537
 538    /* Write refcount blocks to disk */
 539    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
 540    ret = bdrv_pwrite_sync(bs->file->bs, meta_offset, new_blocks,
 541        blocks_clusters * s->cluster_size);
 542    g_free(new_blocks);
 543    new_blocks = NULL;
 544    if (ret < 0) {
 545        goto fail_table;
 546    }
 547
 548    /* Write refcount table to disk */
 549    for(i = 0; i < table_size; i++) {
 550        cpu_to_be64s(&new_table[i]);
 551    }
 552
 553    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
 554    ret = bdrv_pwrite_sync(bs->file->bs, table_offset, new_table,
 555        table_size * sizeof(uint64_t));
 556    if (ret < 0) {
 557        goto fail_table;
 558    }
 559
 560    for(i = 0; i < table_size; i++) {
 561        be64_to_cpus(&new_table[i]);
 562    }
 563
 564    /* Hook up the new refcount table in the qcow2 header */
 565    struct QEMU_PACKED {
 566        uint64_t d64;
 567        uint32_t d32;
 568    } data;
 569    cpu_to_be64w(&data.d64, table_offset);
 570    cpu_to_be32w(&data.d32, table_clusters);
 571    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
 572    ret = bdrv_pwrite_sync(bs->file->bs,
 573                           offsetof(QCowHeader, refcount_table_offset),
 574                           &data, sizeof(data));
 575    if (ret < 0) {
 576        goto fail_table;
 577    }
 578
 579    /* And switch it in memory */
 580    uint64_t old_table_offset = s->refcount_table_offset;
 581    uint64_t old_table_size = s->refcount_table_size;
 582
 583    g_free(s->refcount_table);
 584    s->refcount_table = new_table;
 585    s->refcount_table_size = table_size;
 586    s->refcount_table_offset = table_offset;
 587
 588    /* Free old table. */
 589    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
 590                        QCOW2_DISCARD_OTHER);
 591
 592    ret = load_refcount_block(bs, new_block, refcount_block);
 593    if (ret < 0) {
 594        return ret;
 595    }
 596
 597    /* If we were trying to do the initial refcount update for some cluster
 598     * allocation, we might have used the same clusters to store newly
 599     * allocated metadata. Make the caller search some new space. */
 600    return -EAGAIN;
 601
 602fail_table:
 603    g_free(new_blocks);
 604    g_free(new_table);
 605fail_block:
 606    if (*refcount_block != NULL) {
 607        qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
 608    }
 609    return ret;
 610}
 611
 612void qcow2_process_discards(BlockDriverState *bs, int ret)
 613{
 614    BDRVQcow2State *s = bs->opaque;
 615    Qcow2DiscardRegion *d, *next;
 616
 617    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
 618        QTAILQ_REMOVE(&s->discards, d, next);
 619
 620        /* Discard is optional, ignore the return value */
 621        if (ret >= 0) {
 622            bdrv_discard(bs->file->bs,
 623                         d->offset >> BDRV_SECTOR_BITS,
 624                         d->bytes >> BDRV_SECTOR_BITS);
 625        }
 626
 627        g_free(d);
 628    }
 629}
 630
 631static void update_refcount_discard(BlockDriverState *bs,
 632                                    uint64_t offset, uint64_t length)
 633{
 634    BDRVQcow2State *s = bs->opaque;
 635    Qcow2DiscardRegion *d, *p, *next;
 636
 637    QTAILQ_FOREACH(d, &s->discards, next) {
 638        uint64_t new_start = MIN(offset, d->offset);
 639        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
 640
 641        if (new_end - new_start <= length + d->bytes) {
 642            /* There can't be any overlap, areas ending up here have no
 643             * references any more and therefore shouldn't get freed another
 644             * time. */
 645            assert(d->bytes + length == new_end - new_start);
 646            d->offset = new_start;
 647            d->bytes = new_end - new_start;
 648            goto found;
 649        }
 650    }
 651
 652    d = g_malloc(sizeof(*d));
 653    *d = (Qcow2DiscardRegion) {
 654        .bs     = bs,
 655        .offset = offset,
 656        .bytes  = length,
 657    };
 658    QTAILQ_INSERT_TAIL(&s->discards, d, next);
 659
 660found:
 661    /* Merge discard requests if they are adjacent now */
 662    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
 663        if (p == d
 664            || p->offset > d->offset + d->bytes
 665            || d->offset > p->offset + p->bytes)
 666        {
 667            continue;
 668        }
 669
 670        /* Still no overlap possible */
 671        assert(p->offset == d->offset + d->bytes
 672            || d->offset == p->offset + p->bytes);
 673
 674        QTAILQ_REMOVE(&s->discards, p, next);
 675        d->offset = MIN(d->offset, p->offset);
 676        d->bytes += p->bytes;
 677        g_free(p);
 678    }
 679}
 680
 681/* XXX: cache several refcount block clusters ? */
 682/* @addend is the absolute value of the addend; if @decrease is set, @addend
 683 * will be subtracted from the current refcount, otherwise it will be added */
 684static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
 685                                                   int64_t offset,
 686                                                   int64_t length,
 687                                                   uint64_t addend,
 688                                                   bool decrease,
 689                                                   enum qcow2_discard_type type)
 690{
 691    BDRVQcow2State *s = bs->opaque;
 692    int64_t start, last, cluster_offset;
 693    void *refcount_block = NULL;
 694    int64_t old_table_index = -1;
 695    int ret;
 696
 697#ifdef DEBUG_ALLOC2
 698    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
 699            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
 700            addend);
 701#endif
 702    if (length < 0) {
 703        return -EINVAL;
 704    } else if (length == 0) {
 705        return 0;
 706    }
 707
 708    if (decrease) {
 709        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
 710            s->l2_table_cache);
 711    }
 712
 713    start = start_of_cluster(s, offset);
 714    last = start_of_cluster(s, offset + length - 1);
 715    for(cluster_offset = start; cluster_offset <= last;
 716        cluster_offset += s->cluster_size)
 717    {
 718        int block_index;
 719        uint64_t refcount;
 720        int64_t cluster_index = cluster_offset >> s->cluster_bits;
 721        int64_t table_index = cluster_index >> s->refcount_block_bits;
 722
 723        /* Load the refcount block and allocate it if needed */
 724        if (table_index != old_table_index) {
 725            if (refcount_block) {
 726                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
 727            }
 728            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
 729            if (ret < 0) {
 730                goto fail;
 731            }
 732        }
 733        old_table_index = table_index;
 734
 735        qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
 736                                     refcount_block);
 737
 738        /* we can update the count and save it */
 739        block_index = cluster_index & (s->refcount_block_size - 1);
 740
 741        refcount = s->get_refcount(refcount_block, block_index);
 742        if (decrease ? (refcount - addend > refcount)
 743                     : (refcount + addend < refcount ||
 744                        refcount + addend > s->refcount_max))
 745        {
 746            ret = -EINVAL;
 747            goto fail;
 748        }
 749        if (decrease) {
 750            refcount -= addend;
 751        } else {
 752            refcount += addend;
 753        }
 754        if (refcount == 0 && cluster_index < s->free_cluster_index) {
 755            s->free_cluster_index = cluster_index;
 756        }
 757        s->set_refcount(refcount_block, block_index, refcount);
 758
 759        if (refcount == 0 && s->discard_passthrough[type]) {
 760            update_refcount_discard(bs, cluster_offset, s->cluster_size);
 761        }
 762    }
 763
 764    ret = 0;
 765fail:
 766    if (!s->cache_discards) {
 767        qcow2_process_discards(bs, ret);
 768    }
 769
 770    /* Write last changed block to disk */
 771    if (refcount_block) {
 772        qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
 773    }
 774
 775    /*
 776     * Try do undo any updates if an error is returned (This may succeed in
 777     * some cases like ENOSPC for allocating a new refcount block)
 778     */
 779    if (ret < 0) {
 780        int dummy;
 781        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
 782                                !decrease, QCOW2_DISCARD_NEVER);
 783        (void)dummy;
 784    }
 785
 786    return ret;
 787}
 788
 789/*
 790 * Increases or decreases the refcount of a given cluster.
 791 *
 792 * @addend is the absolute value of the addend; if @decrease is set, @addend
 793 * will be subtracted from the current refcount, otherwise it will be added.
 794 *
 795 * On success 0 is returned; on failure -errno is returned.
 796 */
 797int qcow2_update_cluster_refcount(BlockDriverState *bs,
 798                                  int64_t cluster_index,
 799                                  uint64_t addend, bool decrease,
 800                                  enum qcow2_discard_type type)
 801{
 802    BDRVQcow2State *s = bs->opaque;
 803    int ret;
 804
 805    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
 806                          decrease, type);
 807    if (ret < 0) {
 808        return ret;
 809    }
 810
 811    return 0;
 812}
 813
 814
 815
 816/*********************************************************/
 817/* cluster allocation functions */
 818
 819
 820
 821/* return < 0 if error */
 822static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
 823{
 824    BDRVQcow2State *s = bs->opaque;
 825    uint64_t i, nb_clusters, refcount;
 826    int ret;
 827
 828    /* We can't allocate clusters if they may still be queued for discard. */
 829    if (s->cache_discards) {
 830        qcow2_process_discards(bs, 0);
 831    }
 832
 833    nb_clusters = size_to_clusters(s, size);
 834retry:
 835    for(i = 0; i < nb_clusters; i++) {
 836        uint64_t next_cluster_index = s->free_cluster_index++;
 837        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
 838
 839        if (ret < 0) {
 840            return ret;
 841        } else if (refcount != 0) {
 842            goto retry;
 843        }
 844    }
 845
 846    /* Make sure that all offsets in the "allocated" range are representable
 847     * in an int64_t */
 848    if (s->free_cluster_index > 0 &&
 849        s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
 850    {
 851        return -EFBIG;
 852    }
 853
 854#ifdef DEBUG_ALLOC2
 855    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
 856            size,
 857            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
 858#endif
 859    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
 860}
 861
 862int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
 863{
 864    int64_t offset;
 865    int ret;
 866
 867    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
 868    do {
 869        offset = alloc_clusters_noref(bs, size);
 870        if (offset < 0) {
 871            return offset;
 872        }
 873
 874        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
 875    } while (ret == -EAGAIN);
 876
 877    if (ret < 0) {
 878        return ret;
 879    }
 880
 881    return offset;
 882}
 883
 884int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
 885                                int64_t nb_clusters)
 886{
 887    BDRVQcow2State *s = bs->opaque;
 888    uint64_t cluster_index, refcount;
 889    uint64_t i;
 890    int ret;
 891
 892    assert(nb_clusters >= 0);
 893    if (nb_clusters == 0) {
 894        return 0;
 895    }
 896
 897    do {
 898        /* Check how many clusters there are free */
 899        cluster_index = offset >> s->cluster_bits;
 900        for(i = 0; i < nb_clusters; i++) {
 901            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
 902            if (ret < 0) {
 903                return ret;
 904            } else if (refcount != 0) {
 905                break;
 906            }
 907        }
 908
 909        /* And then allocate them */
 910        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
 911                              QCOW2_DISCARD_NEVER);
 912    } while (ret == -EAGAIN);
 913
 914    if (ret < 0) {
 915        return ret;
 916    }
 917
 918    return i;
 919}
 920
 921/* only used to allocate compressed sectors. We try to allocate
 922   contiguous sectors. size must be <= cluster_size */
 923int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
 924{
 925    BDRVQcow2State *s = bs->opaque;
 926    int64_t offset;
 927    size_t free_in_cluster;
 928    int ret;
 929
 930    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
 931    assert(size > 0 && size <= s->cluster_size);
 932    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));
 933
 934    offset = s->free_byte_offset;
 935
 936    if (offset) {
 937        uint64_t refcount;
 938        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
 939        if (ret < 0) {
 940            return ret;
 941        }
 942
 943        if (refcount == s->refcount_max) {
 944            offset = 0;
 945        }
 946    }
 947
 948    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
 949    do {
 950        if (!offset || free_in_cluster < size) {
 951            int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
 952            if (new_cluster < 0) {
 953                return new_cluster;
 954            }
 955
 956            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
 957                offset = new_cluster;
 958                free_in_cluster = s->cluster_size;
 959            } else {
 960                free_in_cluster += s->cluster_size;
 961            }
 962        }
 963
 964        assert(offset);
 965        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
 966        if (ret < 0) {
 967            offset = 0;
 968        }
 969    } while (ret == -EAGAIN);
 970    if (ret < 0) {
 971        return ret;
 972    }
 973
 974    /* The cluster refcount was incremented; refcount blocks must be flushed
 975     * before the caller's L2 table updates. */
 976    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
 977
 978    s->free_byte_offset = offset + size;
 979    if (!offset_into_cluster(s, s->free_byte_offset)) {
 980        s->free_byte_offset = 0;
 981    }
 982
 983    return offset;
 984}
 985
 986void qcow2_free_clusters(BlockDriverState *bs,
 987                          int64_t offset, int64_t size,
 988                          enum qcow2_discard_type type)
 989{
 990    int ret;
 991
 992    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
 993    ret = update_refcount(bs, offset, size, 1, true, type);
 994    if (ret < 0) {
 995        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
 996        /* TODO Remember the clusters to free them later and avoid leaking */
 997    }
 998}
 999
1000/*
1001 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1002 * normal cluster, compressed cluster, etc.)
1003 */
1004void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
1005                             int nb_clusters, enum qcow2_discard_type type)
1006{
1007    BDRVQcow2State *s = bs->opaque;
1008
1009    switch (qcow2_get_cluster_type(l2_entry)) {
1010    case QCOW2_CLUSTER_COMPRESSED:
1011        {
1012            int nb_csectors;
1013            nb_csectors = ((l2_entry >> s->csize_shift) &
1014                           s->csize_mask) + 1;
1015            qcow2_free_clusters(bs,
1016                (l2_entry & s->cluster_offset_mask) & ~511,
1017                nb_csectors * 512, type);
1018        }
1019        break;
1020    case QCOW2_CLUSTER_NORMAL:
1021    case QCOW2_CLUSTER_ZERO:
1022        if (l2_entry & L2E_OFFSET_MASK) {
1023            if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1024                qcow2_signal_corruption(bs, false, -1, -1,
1025                                        "Cannot free unaligned cluster %#llx",
1026                                        l2_entry & L2E_OFFSET_MASK);
1027            } else {
1028                qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1029                                    nb_clusters << s->cluster_bits, type);
1030            }
1031        }
1032        break;
1033    case QCOW2_CLUSTER_UNALLOCATED:
1034        break;
1035    default:
1036        abort();
1037    }
1038}
1039
1040
1041
1042/*********************************************************/
1043/* snapshots and image creation */
1044
1045
1046
1047/* update the refcounts of snapshots and the copied flag */
1048int qcow2_update_snapshot_refcount(BlockDriverState *bs,
1049    int64_t l1_table_offset, int l1_size, int addend)
1050{
1051    BDRVQcow2State *s = bs->opaque;
1052    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount;
1053    bool l1_allocated = false;
1054    int64_t old_offset, old_l2_offset;
1055    int i, j, l1_modified = 0, nb_csectors;
1056    int ret;
1057
1058    assert(addend >= -1 && addend <= 1);
1059
1060    l2_table = NULL;
1061    l1_table = NULL;
1062    l1_size2 = l1_size * sizeof(uint64_t);
1063
1064    s->cache_discards = true;
1065
1066    /* WARNING: qcow2_snapshot_goto relies on this function not using the
1067     * l1_table_offset when it is the current s->l1_table_offset! Be careful
1068     * when changing this! */
1069    if (l1_table_offset != s->l1_table_offset) {
1070        l1_table = g_try_malloc0(align_offset(l1_size2, 512));
1071        if (l1_size2 && l1_table == NULL) {
1072            ret = -ENOMEM;
1073            goto fail;
1074        }
1075        l1_allocated = true;
1076
1077        ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2);
1078        if (ret < 0) {
1079            goto fail;
1080        }
1081
1082        for(i = 0;i < l1_size; i++)
1083            be64_to_cpus(&l1_table[i]);
1084    } else {
1085        assert(l1_size == s->l1_size);
1086        l1_table = s->l1_table;
1087        l1_allocated = false;
1088    }
1089
1090    for(i = 0; i < l1_size; i++) {
1091        l2_offset = l1_table[i];
1092        if (l2_offset) {
1093            old_l2_offset = l2_offset;
1094            l2_offset &= L1E_OFFSET_MASK;
1095
1096            if (offset_into_cluster(s, l2_offset)) {
1097                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
1098                                        PRIx64 " unaligned (L1 index: %#x)",
1099                                        l2_offset, i);
1100                ret = -EIO;
1101                goto fail;
1102            }
1103
1104            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
1105                (void**) &l2_table);
1106            if (ret < 0) {
1107                goto fail;
1108            }
1109
1110            for(j = 0; j < s->l2_size; j++) {
1111                uint64_t cluster_index;
1112
1113                offset = be64_to_cpu(l2_table[j]);
1114                old_offset = offset;
1115                offset &= ~QCOW_OFLAG_COPIED;
1116
1117                switch (qcow2_get_cluster_type(offset)) {
1118                    case QCOW2_CLUSTER_COMPRESSED:
1119                        nb_csectors = ((offset >> s->csize_shift) &
1120                                       s->csize_mask) + 1;
1121                        if (addend != 0) {
1122                            ret = update_refcount(bs,
1123                                (offset & s->cluster_offset_mask) & ~511,
1124                                nb_csectors * 512, abs(addend), addend < 0,
1125                                QCOW2_DISCARD_SNAPSHOT);
1126                            if (ret < 0) {
1127                                goto fail;
1128                            }
1129                        }
1130                        /* compressed clusters are never modified */
1131                        refcount = 2;
1132                        break;
1133
1134                    case QCOW2_CLUSTER_NORMAL:
1135                    case QCOW2_CLUSTER_ZERO:
1136                        if (offset_into_cluster(s, offset & L2E_OFFSET_MASK)) {
1137                            qcow2_signal_corruption(bs, true, -1, -1, "Data "
1138                                                    "cluster offset %#llx "
1139                                                    "unaligned (L2 offset: %#"
1140                                                    PRIx64 ", L2 index: %#x)",
1141                                                    offset & L2E_OFFSET_MASK,
1142                                                    l2_offset, j);
1143                            ret = -EIO;
1144                            goto fail;
1145                        }
1146
1147                        cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
1148                        if (!cluster_index) {
1149                            /* unallocated */
1150                            refcount = 0;
1151                            break;
1152                        }
1153                        if (addend != 0) {
1154                            ret = qcow2_update_cluster_refcount(bs,
1155                                    cluster_index, abs(addend), addend < 0,
1156                                    QCOW2_DISCARD_SNAPSHOT);
1157                            if (ret < 0) {
1158                                goto fail;
1159                            }
1160                        }
1161
1162                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
1163                        if (ret < 0) {
1164                            goto fail;
1165                        }
1166                        break;
1167
1168                    case QCOW2_CLUSTER_UNALLOCATED:
1169                        refcount = 0;
1170                        break;
1171
1172                    default:
1173                        abort();
1174                }
1175
1176                if (refcount == 1) {
1177                    offset |= QCOW_OFLAG_COPIED;
1178                }
1179                if (offset != old_offset) {
1180                    if (addend > 0) {
1181                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
1182                            s->refcount_block_cache);
1183                    }
1184                    l2_table[j] = cpu_to_be64(offset);
1185                    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
1186                                                 l2_table);
1187                }
1188            }
1189
1190            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1191
1192            if (addend != 0) {
1193                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
1194                                                        s->cluster_bits,
1195                                                    abs(addend), addend < 0,
1196                                                    QCOW2_DISCARD_SNAPSHOT);
1197                if (ret < 0) {
1198                    goto fail;
1199                }
1200            }
1201            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1202                                     &refcount);
1203            if (ret < 0) {
1204                goto fail;
1205            } else if (refcount == 1) {
1206                l2_offset |= QCOW_OFLAG_COPIED;
1207            }
1208            if (l2_offset != old_l2_offset) {
1209                l1_table[i] = l2_offset;
1210                l1_modified = 1;
1211            }
1212        }
1213    }
1214
1215    ret = bdrv_flush(bs);
1216fail:
1217    if (l2_table) {
1218        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
1219    }
1220
1221    s->cache_discards = false;
1222    qcow2_process_discards(bs, ret);
1223
1224    /* Update L1 only if it isn't deleted anyway (addend = -1) */
1225    if (ret == 0 && addend >= 0 && l1_modified) {
1226        for (i = 0; i < l1_size; i++) {
1227            cpu_to_be64s(&l1_table[i]);
1228        }
1229
1230        ret = bdrv_pwrite_sync(bs->file->bs, l1_table_offset,
1231                               l1_table, l1_size2);
1232
1233        for (i = 0; i < l1_size; i++) {
1234            be64_to_cpus(&l1_table[i]);
1235        }
1236    }
1237    if (l1_allocated)
1238        g_free(l1_table);
1239    return ret;
1240}
1241
1242
1243
1244
1245/*********************************************************/
1246/* refcount checking functions */
1247
1248
1249static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1250{
1251    /* This assertion holds because there is no way we can address more than
1252     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1253     * offsets have to be representable in bytes); due to every cluster
1254     * corresponding to one refcount entry, we are well below that limit */
1255    assert(entries < (UINT64_C(1) << (64 - 9)));
1256
1257    /* Thanks to the assertion this will not overflow, because
1258     * s->refcount_order < 7.
1259     * (note: x << s->refcount_order == x * s->refcount_bits) */
1260    return DIV_ROUND_UP(entries << s->refcount_order, 8);
1261}
1262
1263/**
1264 * Reallocates *array so that it can hold new_size entries. *size must contain
1265 * the current number of entries in *array. If the reallocation fails, *array
1266 * and *size will not be modified and -errno will be returned. If the
1267 * reallocation is successful, *array will be set to the new buffer, *size
1268 * will be set to new_size and 0 will be returned. The size of the reallocated
1269 * refcount array buffer will be aligned to a cluster boundary, and the newly
1270 * allocated area will be zeroed.
1271 */
1272static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1273                                  int64_t *size, int64_t new_size)
1274{
1275    int64_t old_byte_size, new_byte_size;
1276    void *new_ptr;
1277
1278    /* Round to clusters so the array can be directly written to disk */
1279    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1280                    * s->cluster_size;
1281    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1282                    * s->cluster_size;
1283
1284    if (new_byte_size == old_byte_size) {
1285        *size = new_size;
1286        return 0;
1287    }
1288
1289    assert(new_byte_size > 0);
1290
1291    if (new_byte_size > SIZE_MAX) {
1292        return -ENOMEM;
1293    }
1294
1295    new_ptr = g_try_realloc(*array, new_byte_size);
1296    if (!new_ptr) {
1297        return -ENOMEM;
1298    }
1299
1300    if (new_byte_size > old_byte_size) {
1301        memset((char *)new_ptr + old_byte_size, 0,
1302               new_byte_size - old_byte_size);
1303    }
1304
1305    *array = new_ptr;
1306    *size  = new_size;
1307
1308    return 0;
1309}
1310
1311/*
1312 * Increases the refcount for a range of clusters in a given refcount table.
1313 * This is used to construct a temporary refcount table out of L1 and L2 tables
1314 * which can be compared to the refcount table saved in the image.
1315 *
1316 * Modifies the number of errors in res.
1317 */
1318static int inc_refcounts(BlockDriverState *bs,
1319                         BdrvCheckResult *res,
1320                         void **refcount_table,
1321                         int64_t *refcount_table_size,
1322                         int64_t offset, int64_t size)
1323{
1324    BDRVQcow2State *s = bs->opaque;
1325    uint64_t start, last, cluster_offset, k, refcount;
1326    int ret;
1327
1328    if (size <= 0) {
1329        return 0;
1330    }
1331
1332    start = start_of_cluster(s, offset);
1333    last = start_of_cluster(s, offset + size - 1);
1334    for(cluster_offset = start; cluster_offset <= last;
1335        cluster_offset += s->cluster_size) {
1336        k = cluster_offset >> s->cluster_bits;
1337        if (k >= *refcount_table_size) {
1338            ret = realloc_refcount_array(s, refcount_table,
1339                                         refcount_table_size, k + 1);
1340            if (ret < 0) {
1341                res->check_errors++;
1342                return ret;
1343            }
1344        }
1345
1346        refcount = s->get_refcount(*refcount_table, k);
1347        if (refcount == s->refcount_max) {
1348            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1349                    "\n", cluster_offset);
1350            fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1351                    "width or qemu-img convert to create a clean copy if the "
1352                    "image cannot be opened for writing\n");
1353            res->corruptions++;
1354            continue;
1355        }
1356        s->set_refcount(*refcount_table, k, refcount + 1);
1357    }
1358
1359    return 0;
1360}
1361
1362/* Flags for check_refcounts_l1() and check_refcounts_l2() */
1363enum {
1364    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1365};
1366
1367/*
1368 * Increases the refcount in the given refcount table for the all clusters
1369 * referenced in the L2 table. While doing so, performs some checks on L2
1370 * entries.
1371 *
1372 * Returns the number of errors found by the checks or -errno if an internal
1373 * error occurred.
1374 */
1375static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1376                              void **refcount_table,
1377                              int64_t *refcount_table_size, int64_t l2_offset,
1378                              int flags)
1379{
1380    BDRVQcow2State *s = bs->opaque;
1381    uint64_t *l2_table, l2_entry;
1382    uint64_t next_contiguous_offset = 0;
1383    int i, l2_size, nb_csectors, ret;
1384
1385    /* Read L2 table from disk */
1386    l2_size = s->l2_size * sizeof(uint64_t);
1387    l2_table = g_malloc(l2_size);
1388
1389    ret = bdrv_pread(bs->file->bs, l2_offset, l2_table, l2_size);
1390    if (ret < 0) {
1391        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1392        res->check_errors++;
1393        goto fail;
1394    }
1395
1396    /* Do the actual checks */
1397    for(i = 0; i < s->l2_size; i++) {
1398        l2_entry = be64_to_cpu(l2_table[i]);
1399
1400        switch (qcow2_get_cluster_type(l2_entry)) {
1401        case QCOW2_CLUSTER_COMPRESSED:
1402            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1403            if (l2_entry & QCOW_OFLAG_COPIED) {
1404                fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1405                    "copied flag must never be set for compressed "
1406                    "clusters\n", l2_entry >> s->cluster_bits);
1407                l2_entry &= ~QCOW_OFLAG_COPIED;
1408                res->corruptions++;
1409            }
1410
1411            /* Mark cluster as used */
1412            nb_csectors = ((l2_entry >> s->csize_shift) &
1413                           s->csize_mask) + 1;
1414            l2_entry &= s->cluster_offset_mask;
1415            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1416                                l2_entry & ~511, nb_csectors * 512);
1417            if (ret < 0) {
1418                goto fail;
1419            }
1420
1421            if (flags & CHECK_FRAG_INFO) {
1422                res->bfi.allocated_clusters++;
1423                res->bfi.compressed_clusters++;
1424
1425                /* Compressed clusters are fragmented by nature.  Since they
1426                 * take up sub-sector space but we only have sector granularity
1427                 * I/O we need to re-read the same sectors even for adjacent
1428                 * compressed clusters.
1429                 */
1430                res->bfi.fragmented_clusters++;
1431            }
1432            break;
1433
1434        case QCOW2_CLUSTER_ZERO:
1435            if ((l2_entry & L2E_OFFSET_MASK) == 0) {
1436                break;
1437            }
1438            /* fall through */
1439
1440        case QCOW2_CLUSTER_NORMAL:
1441        {
1442            uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1443
1444            if (flags & CHECK_FRAG_INFO) {
1445                res->bfi.allocated_clusters++;
1446                if (next_contiguous_offset &&
1447                    offset != next_contiguous_offset) {
1448                    res->bfi.fragmented_clusters++;
1449                }
1450                next_contiguous_offset = offset + s->cluster_size;
1451            }
1452
1453            /* Mark cluster as used */
1454            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1455                                offset, s->cluster_size);
1456            if (ret < 0) {
1457                goto fail;
1458            }
1459
1460            /* Correct offsets are cluster aligned */
1461            if (offset_into_cluster(s, offset)) {
1462                fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
1463                    "properly aligned; L2 entry corrupted.\n", offset);
1464                res->corruptions++;
1465            }
1466            break;
1467        }
1468
1469        case QCOW2_CLUSTER_UNALLOCATED:
1470            break;
1471
1472        default:
1473            abort();
1474        }
1475    }
1476
1477    g_free(l2_table);
1478    return 0;
1479
1480fail:
1481    g_free(l2_table);
1482    return ret;
1483}
1484
1485/*
1486 * Increases the refcount for the L1 table, its L2 tables and all referenced
1487 * clusters in the given refcount table. While doing so, performs some checks
1488 * on L1 and L2 entries.
1489 *
1490 * Returns the number of errors found by the checks or -errno if an internal
1491 * error occurred.
1492 */
1493static int check_refcounts_l1(BlockDriverState *bs,
1494                              BdrvCheckResult *res,
1495                              void **refcount_table,
1496                              int64_t *refcount_table_size,
1497                              int64_t l1_table_offset, int l1_size,
1498                              int flags)
1499{
1500    BDRVQcow2State *s = bs->opaque;
1501    uint64_t *l1_table = NULL, l2_offset, l1_size2;
1502    int i, ret;
1503
1504    l1_size2 = l1_size * sizeof(uint64_t);
1505
1506    /* Mark L1 table as used */
1507    ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1508                        l1_table_offset, l1_size2);
1509    if (ret < 0) {
1510        goto fail;
1511    }
1512
1513    /* Read L1 table entries from disk */
1514    if (l1_size2 > 0) {
1515        l1_table = g_try_malloc(l1_size2);
1516        if (l1_table == NULL) {
1517            ret = -ENOMEM;
1518            res->check_errors++;
1519            goto fail;
1520        }
1521        ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2);
1522        if (ret < 0) {
1523            fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1524            res->check_errors++;
1525            goto fail;
1526        }
1527        for(i = 0;i < l1_size; i++)
1528            be64_to_cpus(&l1_table[i]);
1529    }
1530
1531    /* Do the actual checks */
1532    for(i = 0; i < l1_size; i++) {
1533        l2_offset = l1_table[i];
1534        if (l2_offset) {
1535            /* Mark L2 table as used */
1536            l2_offset &= L1E_OFFSET_MASK;
1537            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1538                                l2_offset, s->cluster_size);
1539            if (ret < 0) {
1540                goto fail;
1541            }
1542
1543            /* L2 tables are cluster aligned */
1544            if (offset_into_cluster(s, l2_offset)) {
1545                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1546                    "cluster aligned; L1 entry corrupted\n", l2_offset);
1547                res->corruptions++;
1548            }
1549
1550            /* Process and check L2 entries */
1551            ret = check_refcounts_l2(bs, res, refcount_table,
1552                                     refcount_table_size, l2_offset, flags);
1553            if (ret < 0) {
1554                goto fail;
1555            }
1556        }
1557    }
1558    g_free(l1_table);
1559    return 0;
1560
1561fail:
1562    g_free(l1_table);
1563    return ret;
1564}
1565
1566/*
1567 * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1568 *
1569 * This function does not print an error message nor does it increment
1570 * check_errors if qcow2_get_refcount fails (this is because such an error will
1571 * have been already detected and sufficiently signaled by the calling function
1572 * (qcow2_check_refcounts) by the time this function is called).
1573 */
1574static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1575                              BdrvCheckMode fix)
1576{
1577    BDRVQcow2State *s = bs->opaque;
1578    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1579    int ret;
1580    uint64_t refcount;
1581    int i, j;
1582
1583    for (i = 0; i < s->l1_size; i++) {
1584        uint64_t l1_entry = s->l1_table[i];
1585        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1586        bool l2_dirty = false;
1587
1588        if (!l2_offset) {
1589            continue;
1590        }
1591
1592        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1593                                 &refcount);
1594        if (ret < 0) {
1595            /* don't print message nor increment check_errors */
1596            continue;
1597        }
1598        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1599            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1600                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1601                    fix & BDRV_FIX_ERRORS ? "Repairing" :
1602                                            "ERROR",
1603                    i, l1_entry, refcount);
1604            if (fix & BDRV_FIX_ERRORS) {
1605                s->l1_table[i] = refcount == 1
1606                               ? l1_entry |  QCOW_OFLAG_COPIED
1607                               : l1_entry & ~QCOW_OFLAG_COPIED;
1608                ret = qcow2_write_l1_entry(bs, i);
1609                if (ret < 0) {
1610                    res->check_errors++;
1611                    goto fail;
1612                }
1613                res->corruptions_fixed++;
1614            } else {
1615                res->corruptions++;
1616            }
1617        }
1618
1619        ret = bdrv_pread(bs->file->bs, l2_offset, l2_table,
1620                         s->l2_size * sizeof(uint64_t));
1621        if (ret < 0) {
1622            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1623                    strerror(-ret));
1624            res->check_errors++;
1625            goto fail;
1626        }
1627
1628        for (j = 0; j < s->l2_size; j++) {
1629            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1630            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1631            int cluster_type = qcow2_get_cluster_type(l2_entry);
1632
1633            if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
1634                ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
1635                ret = qcow2_get_refcount(bs,
1636                                         data_offset >> s->cluster_bits,
1637                                         &refcount);
1638                if (ret < 0) {
1639                    /* don't print message nor increment check_errors */
1640                    continue;
1641                }
1642                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1643                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1644                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1645                            fix & BDRV_FIX_ERRORS ? "Repairing" :
1646                                                    "ERROR",
1647                            l2_entry, refcount);
1648                    if (fix & BDRV_FIX_ERRORS) {
1649                        l2_table[j] = cpu_to_be64(refcount == 1
1650                                    ? l2_entry |  QCOW_OFLAG_COPIED
1651                                    : l2_entry & ~QCOW_OFLAG_COPIED);
1652                        l2_dirty = true;
1653                        res->corruptions_fixed++;
1654                    } else {
1655                        res->corruptions++;
1656                    }
1657                }
1658            }
1659        }
1660
1661        if (l2_dirty) {
1662            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1663                                                l2_offset, s->cluster_size);
1664            if (ret < 0) {
1665                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1666                        "overlap check failed: %s\n", strerror(-ret));
1667                res->check_errors++;
1668                goto fail;
1669            }
1670
1671            ret = bdrv_pwrite(bs->file->bs, l2_offset, l2_table,
1672                              s->cluster_size);
1673            if (ret < 0) {
1674                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1675                        strerror(-ret));
1676                res->check_errors++;
1677                goto fail;
1678            }
1679        }
1680    }
1681
1682    ret = 0;
1683
1684fail:
1685    qemu_vfree(l2_table);
1686    return ret;
1687}
1688
1689/*
1690 * Checks consistency of refblocks and accounts for each refblock in
1691 * *refcount_table.
1692 */
1693static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
1694                           BdrvCheckMode fix, bool *rebuild,
1695                           void **refcount_table, int64_t *nb_clusters)
1696{
1697    BDRVQcow2State *s = bs->opaque;
1698    int64_t i, size;
1699    int ret;
1700
1701    for(i = 0; i < s->refcount_table_size; i++) {
1702        uint64_t offset, cluster;
1703        offset = s->refcount_table[i];
1704        cluster = offset >> s->cluster_bits;
1705
1706        /* Refcount blocks are cluster aligned */
1707        if (offset_into_cluster(s, offset)) {
1708            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1709                "cluster aligned; refcount table entry corrupted\n", i);
1710            res->corruptions++;
1711            *rebuild = true;
1712            continue;
1713        }
1714
1715        if (cluster >= *nb_clusters) {
1716            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
1717                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
1718
1719            if (fix & BDRV_FIX_ERRORS) {
1720                int64_t new_nb_clusters;
1721
1722                if (offset > INT64_MAX - s->cluster_size) {
1723                    ret = -EINVAL;
1724                    goto resize_fail;
1725                }
1726
1727                ret = bdrv_truncate(bs->file->bs, offset + s->cluster_size);
1728                if (ret < 0) {
1729                    goto resize_fail;
1730                }
1731                size = bdrv_getlength(bs->file->bs);
1732                if (size < 0) {
1733                    ret = size;
1734                    goto resize_fail;
1735                }
1736
1737                new_nb_clusters = size_to_clusters(s, size);
1738                assert(new_nb_clusters >= *nb_clusters);
1739
1740                ret = realloc_refcount_array(s, refcount_table,
1741                                             nb_clusters, new_nb_clusters);
1742                if (ret < 0) {
1743                    res->check_errors++;
1744                    return ret;
1745                }
1746
1747                if (cluster >= *nb_clusters) {
1748                    ret = -EINVAL;
1749                    goto resize_fail;
1750                }
1751
1752                res->corruptions_fixed++;
1753                ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1754                                    offset, s->cluster_size);
1755                if (ret < 0) {
1756                    return ret;
1757                }
1758                /* No need to check whether the refcount is now greater than 1:
1759                 * This area was just allocated and zeroed, so it can only be
1760                 * exactly 1 after inc_refcounts() */
1761                continue;
1762
1763resize_fail:
1764                res->corruptions++;
1765                *rebuild = true;
1766                fprintf(stderr, "ERROR could not resize image: %s\n",
1767                        strerror(-ret));
1768            } else {
1769                res->corruptions++;
1770            }
1771            continue;
1772        }
1773
1774        if (offset != 0) {
1775            ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1776                                offset, s->cluster_size);
1777            if (ret < 0) {
1778                return ret;
1779            }
1780            if (s->get_refcount(*refcount_table, cluster) != 1) {
1781                fprintf(stderr, "ERROR refcount block %" PRId64
1782                        " refcount=%" PRIu64 "\n", i,
1783                        s->get_refcount(*refcount_table, cluster));
1784                res->corruptions++;
1785                *rebuild = true;
1786            }
1787        }
1788    }
1789
1790    return 0;
1791}
1792
1793/*
1794 * Calculates an in-memory refcount table.
1795 */
1796static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1797                               BdrvCheckMode fix, bool *rebuild,
1798                               void **refcount_table, int64_t *nb_clusters)
1799{
1800    BDRVQcow2State *s = bs->opaque;
1801    int64_t i;
1802    QCowSnapshot *sn;
1803    int ret;
1804
1805    if (!*refcount_table) {
1806        int64_t old_size = 0;
1807        ret = realloc_refcount_array(s, refcount_table,
1808                                     &old_size, *nb_clusters);
1809        if (ret < 0) {
1810            res->check_errors++;
1811            return ret;
1812        }
1813    }
1814
1815    /* header */
1816    ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1817                        0, s->cluster_size);
1818    if (ret < 0) {
1819        return ret;
1820    }
1821
1822    /* current L1 table */
1823    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1824                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1825    if (ret < 0) {
1826        return ret;
1827    }
1828
1829    /* snapshots */
1830    for (i = 0; i < s->nb_snapshots; i++) {
1831        sn = s->snapshots + i;
1832        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1833                                 sn->l1_table_offset, sn->l1_size, 0);
1834        if (ret < 0) {
1835            return ret;
1836        }
1837    }
1838    ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1839                        s->snapshots_offset, s->snapshots_size);
1840    if (ret < 0) {
1841        return ret;
1842    }
1843
1844    /* refcount data */
1845    ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1846                        s->refcount_table_offset,
1847                        s->refcount_table_size * sizeof(uint64_t));
1848    if (ret < 0) {
1849        return ret;
1850    }
1851
1852    return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
1853}
1854
1855/*
1856 * Compares the actual reference count for each cluster in the image against the
1857 * refcount as reported by the refcount structures on-disk.
1858 */
1859static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1860                              BdrvCheckMode fix, bool *rebuild,
1861                              int64_t *highest_cluster,
1862                              void *refcount_table, int64_t nb_clusters)
1863{
1864    BDRVQcow2State *s = bs->opaque;
1865    int64_t i;
1866    uint64_t refcount1, refcount2;
1867    int ret;
1868
1869    for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
1870        ret = qcow2_get_refcount(bs, i, &refcount1);
1871        if (ret < 0) {
1872            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
1873                    i, strerror(-ret));
1874            res->check_errors++;
1875            continue;
1876        }
1877
1878        refcount2 = s->get_refcount(refcount_table, i);
1879
1880        if (refcount1 > 0 || refcount2 > 0) {
1881            *highest_cluster = i;
1882        }
1883
1884        if (refcount1 != refcount2) {
1885            /* Check if we're allowed to fix the mismatch */
1886            int *num_fixed = NULL;
1887            if (refcount1 == 0) {
1888                *rebuild = true;
1889            } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
1890                num_fixed = &res->leaks_fixed;
1891            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
1892                num_fixed = &res->corruptions_fixed;
1893            }
1894
1895            fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
1896                    " reference=%" PRIu64 "\n",
1897                   num_fixed != NULL     ? "Repairing" :
1898                   refcount1 < refcount2 ? "ERROR" :
1899                                           "Leaked",
1900                   i, refcount1, refcount2);
1901
1902            if (num_fixed) {
1903                ret = update_refcount(bs, i << s->cluster_bits, 1,
1904                                      refcount_diff(refcount1, refcount2),
1905                                      refcount1 > refcount2,
1906                                      QCOW2_DISCARD_ALWAYS);
1907                if (ret >= 0) {
1908                    (*num_fixed)++;
1909                    continue;
1910                }
1911            }
1912
1913            /* And if we couldn't, print an error */
1914            if (refcount1 < refcount2) {
1915                res->corruptions++;
1916            } else {
1917                res->leaks++;
1918            }
1919        }
1920    }
1921}
1922
1923/*
1924 * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
1925 * the on-disk refcount structures.
1926 *
1927 * On input, *first_free_cluster tells where to start looking, and need not
1928 * actually be a free cluster; the returned offset will not be before that
1929 * cluster.  On output, *first_free_cluster points to the first gap found, even
1930 * if that gap was too small to be used as the returned offset.
1931 *
1932 * Note that *first_free_cluster is a cluster index whereas the return value is
1933 * an offset.
1934 */
1935static int64_t alloc_clusters_imrt(BlockDriverState *bs,
1936                                   int cluster_count,
1937                                   void **refcount_table,
1938                                   int64_t *imrt_nb_clusters,
1939                                   int64_t *first_free_cluster)
1940{
1941    BDRVQcow2State *s = bs->opaque;
1942    int64_t cluster = *first_free_cluster, i;
1943    bool first_gap = true;
1944    int contiguous_free_clusters;
1945    int ret;
1946
1947    /* Starting at *first_free_cluster, find a range of at least cluster_count
1948     * continuously free clusters */
1949    for (contiguous_free_clusters = 0;
1950         cluster < *imrt_nb_clusters &&
1951         contiguous_free_clusters < cluster_count;
1952         cluster++)
1953    {
1954        if (!s->get_refcount(*refcount_table, cluster)) {
1955            contiguous_free_clusters++;
1956            if (first_gap) {
1957                /* If this is the first free cluster found, update
1958                 * *first_free_cluster accordingly */
1959                *first_free_cluster = cluster;
1960                first_gap = false;
1961            }
1962        } else if (contiguous_free_clusters) {
1963            contiguous_free_clusters = 0;
1964        }
1965    }
1966
1967    /* If contiguous_free_clusters is greater than zero, it contains the number
1968     * of continuously free clusters until the current cluster; the first free
1969     * cluster in the current "gap" is therefore
1970     * cluster - contiguous_free_clusters */
1971
1972    /* If no such range could be found, grow the in-memory refcount table
1973     * accordingly to append free clusters at the end of the image */
1974    if (contiguous_free_clusters < cluster_count) {
1975        /* contiguous_free_clusters clusters are already empty at the image end;
1976         * we need cluster_count clusters; therefore, we have to allocate
1977         * cluster_count - contiguous_free_clusters new clusters at the end of
1978         * the image (which is the current value of cluster; note that cluster
1979         * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
1980         * the image end) */
1981        ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
1982                                     cluster + cluster_count
1983                                     - contiguous_free_clusters);
1984        if (ret < 0) {
1985            return ret;
1986        }
1987    }
1988
1989    /* Go back to the first free cluster */
1990    cluster -= contiguous_free_clusters;
1991    for (i = 0; i < cluster_count; i++) {
1992        s->set_refcount(*refcount_table, cluster + i, 1);
1993    }
1994
1995    return cluster << s->cluster_bits;
1996}
1997
1998/*
1999 * Creates a new refcount structure based solely on the in-memory information
2000 * given through *refcount_table. All necessary allocations will be reflected
2001 * in that array.
2002 *
2003 * On success, the old refcount structure is leaked (it will be covered by the
2004 * new refcount structure).
2005 */
2006static int rebuild_refcount_structure(BlockDriverState *bs,
2007                                      BdrvCheckResult *res,
2008                                      void **refcount_table,
2009                                      int64_t *nb_clusters)
2010{
2011    BDRVQcow2State *s = bs->opaque;
2012    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
2013    int64_t refblock_offset, refblock_start, refblock_index;
2014    uint32_t reftable_size = 0;
2015    uint64_t *on_disk_reftable = NULL;
2016    void *on_disk_refblock;
2017    int ret = 0;
2018    struct {
2019        uint64_t reftable_offset;
2020        uint32_t reftable_clusters;
2021    } QEMU_PACKED reftable_offset_and_clusters;
2022
2023    qcow2_cache_empty(bs, s->refcount_block_cache);
2024
2025write_refblocks:
2026    for (; cluster < *nb_clusters; cluster++) {
2027        if (!s->get_refcount(*refcount_table, cluster)) {
2028            continue;
2029        }
2030
2031        refblock_index = cluster >> s->refcount_block_bits;
2032        refblock_start = refblock_index << s->refcount_block_bits;
2033
2034        /* Don't allocate a cluster in a refblock already written to disk */
2035        if (first_free_cluster < refblock_start) {
2036            first_free_cluster = refblock_start;
2037        }
2038        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2039                                              nb_clusters, &first_free_cluster);
2040        if (refblock_offset < 0) {
2041            fprintf(stderr, "ERROR allocating refblock: %s\n",
2042                    strerror(-refblock_offset));
2043            res->check_errors++;
2044            ret = refblock_offset;
2045            goto fail;
2046        }
2047
2048        if (reftable_size <= refblock_index) {
2049            uint32_t old_reftable_size = reftable_size;
2050            uint64_t *new_on_disk_reftable;
2051
2052            reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
2053                                     s->cluster_size) / sizeof(uint64_t);
2054            new_on_disk_reftable = g_try_realloc(on_disk_reftable,
2055                                                 reftable_size *
2056                                                 sizeof(uint64_t));
2057            if (!new_on_disk_reftable) {
2058                res->check_errors++;
2059                ret = -ENOMEM;
2060                goto fail;
2061            }
2062            on_disk_reftable = new_on_disk_reftable;
2063
2064            memset(on_disk_reftable + old_reftable_size, 0,
2065                   (reftable_size - old_reftable_size) * sizeof(uint64_t));
2066
2067            /* The offset we have for the reftable is now no longer valid;
2068             * this will leak that range, but we can easily fix that by running
2069             * a leak-fixing check after this rebuild operation */
2070            reftable_offset = -1;
2071        }
2072        on_disk_reftable[refblock_index] = refblock_offset;
2073
2074        /* If this is apparently the last refblock (for now), try to squeeze the
2075         * reftable in */
2076        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
2077            reftable_offset < 0)
2078        {
2079            uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
2080                                                          sizeof(uint64_t));
2081            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2082                                                  refcount_table, nb_clusters,
2083                                                  &first_free_cluster);
2084            if (reftable_offset < 0) {
2085                fprintf(stderr, "ERROR allocating reftable: %s\n",
2086                        strerror(-reftable_offset));
2087                res->check_errors++;
2088                ret = reftable_offset;
2089                goto fail;
2090            }
2091        }
2092
2093        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2094                                            s->cluster_size);
2095        if (ret < 0) {
2096            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2097            goto fail;
2098        }
2099
2100        /* The size of *refcount_table is always cluster-aligned, therefore the
2101         * write operation will not overflow */
2102        on_disk_refblock = (void *)((char *) *refcount_table +
2103                                    refblock_index * s->cluster_size);
2104
2105        ret = bdrv_write(bs->file->bs, refblock_offset / BDRV_SECTOR_SIZE,
2106                         on_disk_refblock, s->cluster_sectors);
2107        if (ret < 0) {
2108            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2109            goto fail;
2110        }
2111
2112        /* Go to the end of this refblock */
2113        cluster = refblock_start + s->refcount_block_size - 1;
2114    }
2115
2116    if (reftable_offset < 0) {
2117        uint64_t post_refblock_start, reftable_clusters;
2118
2119        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
2120        reftable_clusters = size_to_clusters(s,
2121                                             reftable_size * sizeof(uint64_t));
2122        /* Not pretty but simple */
2123        if (first_free_cluster < post_refblock_start) {
2124            first_free_cluster = post_refblock_start;
2125        }
2126        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2127                                              refcount_table, nb_clusters,
2128                                              &first_free_cluster);
2129        if (reftable_offset < 0) {
2130            fprintf(stderr, "ERROR allocating reftable: %s\n",
2131                    strerror(-reftable_offset));
2132            res->check_errors++;
2133            ret = reftable_offset;
2134            goto fail;
2135        }
2136
2137        goto write_refblocks;
2138    }
2139
2140    assert(on_disk_reftable);
2141
2142    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2143        cpu_to_be64s(&on_disk_reftable[refblock_index]);
2144    }
2145
2146    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
2147                                        reftable_size * sizeof(uint64_t));
2148    if (ret < 0) {
2149        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2150        goto fail;
2151    }
2152
2153    assert(reftable_size < INT_MAX / sizeof(uint64_t));
2154    ret = bdrv_pwrite(bs->file->bs, reftable_offset, on_disk_reftable,
2155                      reftable_size * sizeof(uint64_t));
2156    if (ret < 0) {
2157        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2158        goto fail;
2159    }
2160
2161    /* Enter new reftable into the image header */
2162    cpu_to_be64w(&reftable_offset_and_clusters.reftable_offset,
2163                 reftable_offset);
2164    cpu_to_be32w(&reftable_offset_and_clusters.reftable_clusters,
2165                 size_to_clusters(s, reftable_size * sizeof(uint64_t)));
2166    ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader,
2167                                                  refcount_table_offset),
2168                           &reftable_offset_and_clusters,
2169                           sizeof(reftable_offset_and_clusters));
2170    if (ret < 0) {
2171        fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
2172        goto fail;
2173    }
2174
2175    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2176        be64_to_cpus(&on_disk_reftable[refblock_index]);
2177    }
2178    s->refcount_table = on_disk_reftable;
2179    s->refcount_table_offset = reftable_offset;
2180    s->refcount_table_size = reftable_size;
2181
2182    return 0;
2183
2184fail:
2185    g_free(on_disk_reftable);
2186    return ret;
2187}
2188
2189/*
2190 * Checks an image for refcount consistency.
2191 *
2192 * Returns 0 if no errors are found, the number of errors in case the image is
2193 * detected as corrupted, and -errno when an internal error occurred.
2194 */
2195int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2196                          BdrvCheckMode fix)
2197{
2198    BDRVQcow2State *s = bs->opaque;
2199    BdrvCheckResult pre_compare_res;
2200    int64_t size, highest_cluster, nb_clusters;
2201    void *refcount_table = NULL;
2202    bool rebuild = false;
2203    int ret;
2204
2205    size = bdrv_getlength(bs->file->bs);
2206    if (size < 0) {
2207        res->check_errors++;
2208        return size;
2209    }
2210
2211    nb_clusters = size_to_clusters(s, size);
2212    if (nb_clusters > INT_MAX) {
2213        res->check_errors++;
2214        return -EFBIG;
2215    }
2216
2217    res->bfi.total_clusters =
2218        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2219
2220    ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2221                              &nb_clusters);
2222    if (ret < 0) {
2223        goto fail;
2224    }
2225
2226    /* In case we don't need to rebuild the refcount structure (but want to fix
2227     * something), this function is immediately called again, in which case the
2228     * result should be ignored */
2229    pre_compare_res = *res;
2230    compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2231                      nb_clusters);
2232
2233    if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2234        BdrvCheckResult old_res = *res;
2235        int fresh_leaks = 0;
2236
2237        fprintf(stderr, "Rebuilding refcount structure\n");
2238        ret = rebuild_refcount_structure(bs, res, &refcount_table,
2239                                         &nb_clusters);
2240        if (ret < 0) {
2241            goto fail;
2242        }
2243
2244        res->corruptions = 0;
2245        res->leaks = 0;
2246
2247        /* Because the old reftable has been exchanged for a new one the
2248         * references have to be recalculated */
2249        rebuild = false;
2250        memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2251        ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2252                                  &nb_clusters);
2253        if (ret < 0) {
2254            goto fail;
2255        }
2256
2257        if (fix & BDRV_FIX_LEAKS) {
2258            /* The old refcount structures are now leaked, fix it; the result
2259             * can be ignored, aside from leaks which were introduced by
2260             * rebuild_refcount_structure() that could not be fixed */
2261            BdrvCheckResult saved_res = *res;
2262            *res = (BdrvCheckResult){ 0 };
2263
2264            compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2265                              &highest_cluster, refcount_table, nb_clusters);
2266            if (rebuild) {
2267                fprintf(stderr, "ERROR rebuilt refcount structure is still "
2268                        "broken\n");
2269            }
2270
2271            /* Any leaks accounted for here were introduced by
2272             * rebuild_refcount_structure() because that function has created a
2273             * new refcount structure from scratch */
2274            fresh_leaks = res->leaks;
2275            *res = saved_res;
2276        }
2277
2278        if (res->corruptions < old_res.corruptions) {
2279            res->corruptions_fixed += old_res.corruptions - res->corruptions;
2280        }
2281        if (res->leaks < old_res.leaks) {
2282            res->leaks_fixed += old_res.leaks - res->leaks;
2283        }
2284        res->leaks += fresh_leaks;
2285    } else if (fix) {
2286        if (rebuild) {
2287            fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2288            res->check_errors++;
2289            ret = -EIO;
2290            goto fail;
2291        }
2292
2293        if (res->leaks || res->corruptions) {
2294            *res = pre_compare_res;
2295            compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2296                              refcount_table, nb_clusters);
2297        }
2298    }
2299
2300    /* check OFLAG_COPIED */
2301    ret = check_oflag_copied(bs, res, fix);
2302    if (ret < 0) {
2303        goto fail;
2304    }
2305
2306    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2307    ret = 0;
2308
2309fail:
2310    g_free(refcount_table);
2311
2312    return ret;
2313}
2314
2315#define overlaps_with(ofs, sz) \
2316    ranges_overlap(offset, size, ofs, sz)
2317
2318/*
2319 * Checks if the given offset into the image file is actually free to use by
2320 * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2321 * i.e. a sanity check without relying on the refcount tables.
2322 *
2323 * The ign parameter specifies what checks not to perform (being a bitmask of
2324 * QCow2MetadataOverlap values), i.e., what sections to ignore.
2325 *
2326 * Returns:
2327 * - 0 if writing to this offset will not affect the mentioned metadata
2328 * - a positive QCow2MetadataOverlap value indicating one overlapping section
2329 * - a negative value (-errno) indicating an error while performing a check,
2330 *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
2331 */
2332int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2333                                 int64_t size)
2334{
2335    BDRVQcow2State *s = bs->opaque;
2336    int chk = s->overlap_check & ~ign;
2337    int i, j;
2338
2339    if (!size) {
2340        return 0;
2341    }
2342
2343    if (chk & QCOW2_OL_MAIN_HEADER) {
2344        if (offset < s->cluster_size) {
2345            return QCOW2_OL_MAIN_HEADER;
2346        }
2347    }
2348
2349    /* align range to test to cluster boundaries */
2350    size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
2351    offset = start_of_cluster(s, offset);
2352
2353    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2354        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
2355            return QCOW2_OL_ACTIVE_L1;
2356        }
2357    }
2358
2359    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2360        if (overlaps_with(s->refcount_table_offset,
2361            s->refcount_table_size * sizeof(uint64_t))) {
2362            return QCOW2_OL_REFCOUNT_TABLE;
2363        }
2364    }
2365
2366    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2367        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2368            return QCOW2_OL_SNAPSHOT_TABLE;
2369        }
2370    }
2371
2372    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2373        for (i = 0; i < s->nb_snapshots; i++) {
2374            if (s->snapshots[i].l1_size &&
2375                overlaps_with(s->snapshots[i].l1_table_offset,
2376                s->snapshots[i].l1_size * sizeof(uint64_t))) {
2377                return QCOW2_OL_INACTIVE_L1;
2378            }
2379        }
2380    }
2381
2382    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2383        for (i = 0; i < s->l1_size; i++) {
2384            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2385                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2386                s->cluster_size)) {
2387                return QCOW2_OL_ACTIVE_L2;
2388            }
2389        }
2390    }
2391
2392    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2393        for (i = 0; i < s->refcount_table_size; i++) {
2394            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2395                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2396                s->cluster_size)) {
2397                return QCOW2_OL_REFCOUNT_BLOCK;
2398            }
2399        }
2400    }
2401
2402    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2403        for (i = 0; i < s->nb_snapshots; i++) {
2404            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2405            uint32_t l1_sz  = s->snapshots[i].l1_size;
2406            uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
2407            uint64_t *l1 = g_try_malloc(l1_sz2);
2408            int ret;
2409
2410            if (l1_sz2 && l1 == NULL) {
2411                return -ENOMEM;
2412            }
2413
2414            ret = bdrv_pread(bs->file->bs, l1_ofs, l1, l1_sz2);
2415            if (ret < 0) {
2416                g_free(l1);
2417                return ret;
2418            }
2419
2420            for (j = 0; j < l1_sz; j++) {
2421                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2422                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2423                    g_free(l1);
2424                    return QCOW2_OL_INACTIVE_L2;
2425                }
2426            }
2427
2428            g_free(l1);
2429        }
2430    }
2431
2432    return 0;
2433}
2434
2435static const char *metadata_ol_names[] = {
2436    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
2437    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
2438    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
2439    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
2440    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
2441    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
2442    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
2443    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
2444};
2445
2446/*
2447 * First performs a check for metadata overlaps (through
2448 * qcow2_check_metadata_overlap); if that fails with a negative value (error
2449 * while performing a check), that value is returned. If an impending overlap
2450 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2451 * and -EIO returned.
2452 *
2453 * Returns 0 if there were neither overlaps nor errors while checking for
2454 * overlaps; or a negative value (-errno) on error.
2455 */
2456int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2457                                  int64_t size)
2458{
2459    int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2460
2461    if (ret < 0) {
2462        return ret;
2463    } else if (ret > 0) {
2464        int metadata_ol_bitnr = ctz32(ret);
2465        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2466
2467        qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2468                                "write on metadata (overlaps with %s)",
2469                                metadata_ol_names[metadata_ol_bitnr]);
2470        return -EIO;
2471    }
2472
2473    return 0;
2474}
2475
2476/* A pointer to a function of this type is given to walk_over_reftable(). That
2477 * function will create refblocks and pass them to a RefblockFinishOp once they
2478 * are completed (@refblock). @refblock_empty is set if the refblock is
2479 * completely empty.
2480 *
2481 * Along with the refblock, a corresponding reftable entry is passed, in the
2482 * reftable @reftable (which may be reallocated) at @reftable_index.
2483 *
2484 * @allocated should be set to true if a new cluster has been allocated.
2485 */
2486typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
2487                               uint64_t reftable_index, uint64_t *reftable_size,
2488                               void *refblock, bool refblock_empty,
2489                               bool *allocated, Error **errp);
2490
2491/**
2492 * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2493 * it is not empty) and inserts its offset into the new reftable. The size of
2494 * this new reftable is increased as required.
2495 */
2496static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2497                          uint64_t reftable_index, uint64_t *reftable_size,
2498                          void *refblock, bool refblock_empty, bool *allocated,
2499                          Error **errp)
2500{
2501    BDRVQcow2State *s = bs->opaque;
2502    int64_t offset;
2503
2504    if (!refblock_empty && reftable_index >= *reftable_size) {
2505        uint64_t *new_reftable;
2506        uint64_t new_reftable_size;
2507
2508        new_reftable_size = ROUND_UP(reftable_index + 1,
2509                                     s->cluster_size / sizeof(uint64_t));
2510        if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
2511            error_setg(errp,
2512                       "This operation would make the refcount table grow "
2513                       "beyond the maximum size supported by QEMU, aborting");
2514            return -ENOTSUP;
2515        }
2516
2517        new_reftable = g_try_realloc(*reftable, new_reftable_size *
2518                                                sizeof(uint64_t));
2519        if (!new_reftable) {
2520            error_setg(errp, "Failed to increase reftable buffer size");
2521            return -ENOMEM;
2522        }
2523
2524        memset(new_reftable + *reftable_size, 0,
2525               (new_reftable_size - *reftable_size) * sizeof(uint64_t));
2526
2527        *reftable      = new_reftable;
2528        *reftable_size = new_reftable_size;
2529    }
2530
2531    if (!refblock_empty && !(*reftable)[reftable_index]) {
2532        offset = qcow2_alloc_clusters(bs, s->cluster_size);
2533        if (offset < 0) {
2534            error_setg_errno(errp, -offset, "Failed to allocate refblock");
2535            return offset;
2536        }
2537        (*reftable)[reftable_index] = offset;
2538        *allocated = true;
2539    }
2540
2541    return 0;
2542}
2543
2544/**
2545 * This "operation" for walk_over_reftable() writes the refblock to disk at the
2546 * offset specified by the new reftable's entry. It does not modify the new
2547 * reftable or change any refcounts.
2548 */
2549static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2550                          uint64_t reftable_index, uint64_t *reftable_size,
2551                          void *refblock, bool refblock_empty, bool *allocated,
2552                          Error **errp)
2553{
2554    BDRVQcow2State *s = bs->opaque;
2555    int64_t offset;
2556    int ret;
2557
2558    if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2559        offset = (*reftable)[reftable_index];
2560
2561        ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
2562        if (ret < 0) {
2563            error_setg_errno(errp, -ret, "Overlap check failed");
2564            return ret;
2565        }
2566
2567        ret = bdrv_pwrite(bs->file->bs, offset, refblock, s->cluster_size);
2568        if (ret < 0) {
2569            error_setg_errno(errp, -ret, "Failed to write refblock");
2570            return ret;
2571        }
2572    } else {
2573        assert(refblock_empty);
2574    }
2575
2576    return 0;
2577}
2578
2579/**
2580 * This function walks over the existing reftable and every referenced refblock;
2581 * if @new_set_refcount is non-NULL, it is called for every refcount entry to
2582 * create an equal new entry in the passed @new_refblock. Once that
2583 * @new_refblock is completely filled, @operation will be called.
2584 *
2585 * @status_cb and @cb_opaque are used for the amend operation's status callback.
2586 * @index is the index of the walk_over_reftable() calls and @total is the total
2587 * number of walk_over_reftable() calls per amend operation. Both are used for
2588 * calculating the parameters for the status callback.
2589 *
2590 * @allocated is set to true if a new cluster has been allocated.
2591 */
2592static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
2593                              uint64_t *new_reftable_index,
2594                              uint64_t *new_reftable_size,
2595                              void *new_refblock, int new_refblock_size,
2596                              int new_refcount_bits,
2597                              RefblockFinishOp *operation, bool *allocated,
2598                              Qcow2SetRefcountFunc *new_set_refcount,
2599                              BlockDriverAmendStatusCB *status_cb,
2600                              void *cb_opaque, int index, int total,
2601                              Error **errp)
2602{
2603    BDRVQcow2State *s = bs->opaque;
2604    uint64_t reftable_index;
2605    bool new_refblock_empty = true;
2606    int refblock_index;
2607    int new_refblock_index = 0;
2608    int ret;
2609
2610    for (reftable_index = 0; reftable_index < s->refcount_table_size;
2611         reftable_index++)
2612    {
2613        uint64_t refblock_offset = s->refcount_table[reftable_index]
2614                                 & REFT_OFFSET_MASK;
2615
2616        status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
2617                  (uint64_t)total * s->refcount_table_size, cb_opaque);
2618
2619        if (refblock_offset) {
2620            void *refblock;
2621
2622            if (offset_into_cluster(s, refblock_offset)) {
2623                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
2624                                        PRIx64 " unaligned (reftable index: %#"
2625                                        PRIx64 ")", refblock_offset,
2626                                        reftable_index);
2627                error_setg(errp,
2628                           "Image is corrupt (unaligned refblock offset)");
2629                return -EIO;
2630            }
2631
2632            ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
2633                                  &refblock);
2634            if (ret < 0) {
2635                error_setg_errno(errp, -ret, "Failed to retrieve refblock");
2636                return ret;
2637            }
2638
2639            for (refblock_index = 0; refblock_index < s->refcount_block_size;
2640                 refblock_index++)
2641            {
2642                uint64_t refcount;
2643
2644                if (new_refblock_index >= new_refblock_size) {
2645                    /* new_refblock is now complete */
2646                    ret = operation(bs, new_reftable, *new_reftable_index,
2647                                    new_reftable_size, new_refblock,
2648                                    new_refblock_empty, allocated, errp);
2649                    if (ret < 0) {
2650                        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2651                        return ret;
2652                    }
2653
2654                    (*new_reftable_index)++;
2655                    new_refblock_index = 0;
2656                    new_refblock_empty = true;
2657                }
2658
2659                refcount = s->get_refcount(refblock, refblock_index);
2660                if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
2661                    uint64_t offset;
2662
2663                    qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2664
2665                    offset = ((reftable_index << s->refcount_block_bits)
2666                              + refblock_index) << s->cluster_bits;
2667
2668                    error_setg(errp, "Cannot decrease refcount entry width to "
2669                               "%i bits: Cluster at offset %#" PRIx64 " has a "
2670                               "refcount of %" PRIu64, new_refcount_bits,
2671                               offset, refcount);
2672                    return -EINVAL;
2673                }
2674
2675                if (new_set_refcount) {
2676                    new_set_refcount(new_refblock, new_refblock_index++,
2677                                     refcount);
2678                } else {
2679                    new_refblock_index++;
2680                }
2681                new_refblock_empty = new_refblock_empty && refcount == 0;
2682            }
2683
2684            qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2685        } else {
2686            /* No refblock means every refcount is 0 */
2687            for (refblock_index = 0; refblock_index < s->refcount_block_size;
2688                 refblock_index++)
2689            {
2690                if (new_refblock_index >= new_refblock_size) {
2691                    /* new_refblock is now complete */
2692                    ret = operation(bs, new_reftable, *new_reftable_index,
2693                                    new_reftable_size, new_refblock,
2694                                    new_refblock_empty, allocated, errp);
2695                    if (ret < 0) {
2696                        return ret;
2697                    }
2698
2699                    (*new_reftable_index)++;
2700                    new_refblock_index = 0;
2701                    new_refblock_empty = true;
2702                }
2703
2704                if (new_set_refcount) {
2705                    new_set_refcount(new_refblock, new_refblock_index++, 0);
2706                } else {
2707                    new_refblock_index++;
2708                }
2709            }
2710        }
2711    }
2712
2713    if (new_refblock_index > 0) {
2714        /* Complete the potentially existing partially filled final refblock */
2715        if (new_set_refcount) {
2716            for (; new_refblock_index < new_refblock_size;
2717                 new_refblock_index++)
2718            {
2719                new_set_refcount(new_refblock, new_refblock_index, 0);
2720            }
2721        }
2722
2723        ret = operation(bs, new_reftable, *new_reftable_index,
2724                        new_reftable_size, new_refblock, new_refblock_empty,
2725                        allocated, errp);
2726        if (ret < 0) {
2727            return ret;
2728        }
2729
2730        (*new_reftable_index)++;
2731    }
2732
2733    status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
2734              (uint64_t)total * s->refcount_table_size, cb_opaque);
2735
2736    return 0;
2737}
2738
2739int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
2740                                BlockDriverAmendStatusCB *status_cb,
2741                                void *cb_opaque, Error **errp)
2742{
2743    BDRVQcow2State *s = bs->opaque;
2744    Qcow2GetRefcountFunc *new_get_refcount;
2745    Qcow2SetRefcountFunc *new_set_refcount;
2746    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
2747    uint64_t *new_reftable = NULL, new_reftable_size = 0;
2748    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
2749    uint64_t new_reftable_index = 0;
2750    uint64_t i;
2751    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
2752    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
2753    int old_refcount_order;
2754    int walk_index = 0;
2755    int ret;
2756    bool new_allocation;
2757
2758    assert(s->qcow_version >= 3);
2759    assert(refcount_order >= 0 && refcount_order <= 6);
2760
2761    /* see qcow2_open() */
2762    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
2763
2764    new_get_refcount = get_refcount_funcs[refcount_order];
2765    new_set_refcount = set_refcount_funcs[refcount_order];
2766
2767
2768    do {
2769        int total_walks;
2770
2771        new_allocation = false;
2772
2773        /* At least we have to do this walk and the one which writes the
2774         * refblocks; also, at least we have to do this loop here at least
2775         * twice (normally), first to do the allocations, and second to
2776         * determine that everything is correctly allocated, this then makes
2777         * three walks in total */
2778        total_walks = MAX(walk_index + 2, 3);
2779
2780        /* First, allocate the structures so they are present in the refcount
2781         * structures */
2782        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
2783                                 &new_reftable_size, NULL, new_refblock_size,
2784                                 new_refcount_bits, &alloc_refblock,
2785                                 &new_allocation, NULL, status_cb, cb_opaque,
2786                                 walk_index++, total_walks, errp);
2787        if (ret < 0) {
2788            goto done;
2789        }
2790
2791        new_reftable_index = 0;
2792
2793        if (new_allocation) {
2794            if (new_reftable_offset) {
2795                qcow2_free_clusters(bs, new_reftable_offset,
2796                                    allocated_reftable_size * sizeof(uint64_t),
2797                                    QCOW2_DISCARD_NEVER);
2798            }
2799
2800            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
2801                                                           sizeof(uint64_t));
2802            if (new_reftable_offset < 0) {
2803                error_setg_errno(errp, -new_reftable_offset,
2804                                 "Failed to allocate the new reftable");
2805                ret = new_reftable_offset;
2806                goto done;
2807            }
2808            allocated_reftable_size = new_reftable_size;
2809        }
2810    } while (new_allocation);
2811
2812    /* Second, write the new refblocks */
2813    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
2814                             &new_reftable_size, new_refblock,
2815                             new_refblock_size, new_refcount_bits,
2816                             &flush_refblock, &new_allocation, new_set_refcount,
2817                             status_cb, cb_opaque, walk_index, walk_index + 1,
2818                             errp);
2819    if (ret < 0) {
2820        goto done;
2821    }
2822    assert(!new_allocation);
2823
2824
2825    /* Write the new reftable */
2826    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
2827                                        new_reftable_size * sizeof(uint64_t));
2828    if (ret < 0) {
2829        error_setg_errno(errp, -ret, "Overlap check failed");
2830        goto done;
2831    }
2832
2833    for (i = 0; i < new_reftable_size; i++) {
2834        cpu_to_be64s(&new_reftable[i]);
2835    }
2836
2837    ret = bdrv_pwrite(bs->file->bs, new_reftable_offset, new_reftable,
2838                      new_reftable_size * sizeof(uint64_t));
2839
2840    for (i = 0; i < new_reftable_size; i++) {
2841        be64_to_cpus(&new_reftable[i]);
2842    }
2843
2844    if (ret < 0) {
2845        error_setg_errno(errp, -ret, "Failed to write the new reftable");
2846        goto done;
2847    }
2848
2849
2850    /* Empty the refcount cache */
2851    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
2852    if (ret < 0) {
2853        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
2854        goto done;
2855    }
2856
2857    /* Update the image header to point to the new reftable; this only updates
2858     * the fields which are relevant to qcow2_update_header(); other fields
2859     * such as s->refcount_table or s->refcount_bits stay stale for now
2860     * (because we have to restore everything if qcow2_update_header() fails) */
2861    old_refcount_order  = s->refcount_order;
2862    old_reftable_size   = s->refcount_table_size;
2863    old_reftable_offset = s->refcount_table_offset;
2864
2865    s->refcount_order        = refcount_order;
2866    s->refcount_table_size   = new_reftable_size;
2867    s->refcount_table_offset = new_reftable_offset;
2868
2869    ret = qcow2_update_header(bs);
2870    if (ret < 0) {
2871        s->refcount_order        = old_refcount_order;
2872        s->refcount_table_size   = old_reftable_size;
2873        s->refcount_table_offset = old_reftable_offset;
2874        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
2875        goto done;
2876    }
2877
2878    /* Now update the rest of the in-memory information */
2879    old_reftable = s->refcount_table;
2880    s->refcount_table = new_reftable;
2881
2882    s->refcount_bits = 1 << refcount_order;
2883    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
2884    s->refcount_max += s->refcount_max - 1;
2885
2886    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
2887    s->refcount_block_size = 1 << s->refcount_block_bits;
2888
2889    s->get_refcount = new_get_refcount;
2890    s->set_refcount = new_set_refcount;
2891
2892    /* For cleaning up all old refblocks and the old reftable below the "done"
2893     * label */
2894    new_reftable        = old_reftable;
2895    new_reftable_size   = old_reftable_size;
2896    new_reftable_offset = old_reftable_offset;
2897
2898done:
2899    if (new_reftable) {
2900        /* On success, new_reftable actually points to the old reftable (and
2901         * new_reftable_size is the old reftable's size); but that is just
2902         * fine */
2903        for (i = 0; i < new_reftable_size; i++) {
2904            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
2905            if (offset) {
2906                qcow2_free_clusters(bs, offset, s->cluster_size,
2907                                    QCOW2_DISCARD_OTHER);
2908            }
2909        }
2910        g_free(new_reftable);
2911
2912        if (new_reftable_offset > 0) {
2913            qcow2_free_clusters(bs, new_reftable_offset,
2914                                new_reftable_size * sizeof(uint64_t),
2915                                QCOW2_DISCARD_OTHER);
2916        }
2917    }
2918
2919    qemu_vfree(new_refblock);
2920    return ret;
2921}
2922