qemu/block/qcow2-refcount.c
<<
>>
Prefs
   1/*
   2 * Block driver for the QCOW version 2 format
   3 *
   4 * Copyright (c) 2004-2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qapi/error.h"
  27#include "qemu-common.h"
  28#include "block/block_int.h"
  29#include "block/qcow2.h"
  30#include "qemu/range.h"
  31#include "qemu/bswap.h"
  32
  33static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
  34static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
  35                            int64_t offset, int64_t length, uint64_t addend,
  36                            bool decrease, enum qcow2_discard_type type);
  37
  38static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
  39static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
  40static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
  41static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
  42static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
  43static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
  44static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
  45
  46static void set_refcount_ro0(void *refcount_array, uint64_t index,
  47                             uint64_t value);
  48static void set_refcount_ro1(void *refcount_array, uint64_t index,
  49                             uint64_t value);
  50static void set_refcount_ro2(void *refcount_array, uint64_t index,
  51                             uint64_t value);
  52static void set_refcount_ro3(void *refcount_array, uint64_t index,
  53                             uint64_t value);
  54static void set_refcount_ro4(void *refcount_array, uint64_t index,
  55                             uint64_t value);
  56static void set_refcount_ro5(void *refcount_array, uint64_t index,
  57                             uint64_t value);
  58static void set_refcount_ro6(void *refcount_array, uint64_t index,
  59                             uint64_t value);
  60
  61
/* Refcount readers indexed by refcount_order (0..6): entry n reads refcounts
 * that are 2^n bits wide (see get_refcount_ro0..ro6 below; qcow2_refcount_init
 * asserts the order is within this range before indexing). */
static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};
  71
/* Refcount writers indexed by refcount_order (0..6): entry n stores refcounts
 * that are 2^n bits wide (see set_refcount_ro0..ro6 below). Must stay in sync
 * with get_refcount_funcs above. */
static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};
  81
  82
  83/*********************************************************/
  84/* refcount handling */
  85
  86static void update_max_refcount_table_index(BDRVQcow2State *s)
  87{
  88    unsigned i = s->refcount_table_size - 1;
  89    while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
  90        i--;
  91    }
  92    /* Set s->max_refcount_table_index to the index of the last used entry */
  93    s->max_refcount_table_index = i;
  94}
  95
/*
 * Loads the refcount table from the image into memory, converting each entry
 * to host byte order, and installs the refcount accessor functions matching
 * the image's refcount_order.
 *
 * Returns 0 on success, -errno on failure (allocation or read error).
 */
int qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    /* refcount_order indexes the accessor tables below, which have 7 slots */
    assert(s->refcount_order >= 0 && s->refcount_order <= 6);

    s->get_refcount = get_refcount_funcs[s->refcount_order];
    s->set_refcount = set_refcount_funcs[s->refcount_order];

    /* Guard the byte-size multiplication below against overflow */
    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        /* g_try_malloc(0) may return NULL, so only treat NULL as OOM when a
         * non-empty table was actually requested */
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_pread(bs->file, s->refcount_table_offset,
                         s->refcount_table, refcount_table_size2);
        if (ret < 0) {
            goto fail;
        }
        /* On-disk entries are big-endian; convert in place */
        for(i = 0; i < s->refcount_table_size; i++)
            be64_to_cpus(&s->refcount_table[i]);
        update_max_refcount_table_index(s);
    }
    return 0;
 fail:
    /* s->refcount_table is freed later by qcow2_refcount_close() */
    return ret;
}
 130
 131void qcow2_refcount_close(BlockDriverState *bs)
 132{
 133    BDRVQcow2State *s = bs->opaque;
 134    g_free(s->refcount_table);
 135}
 136
 137
 138static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
 139{
 140    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
 141}
 142
 143static void set_refcount_ro0(void *refcount_array, uint64_t index,
 144                             uint64_t value)
 145{
 146    assert(!(value >> 1));
 147    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
 148    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
 149}
 150
 151static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
 152{
 153    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
 154           & 0x3;
 155}
 156
 157static void set_refcount_ro1(void *refcount_array, uint64_t index,
 158                             uint64_t value)
 159{
 160    assert(!(value >> 2));
 161    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
 162    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
 163}
 164
 165static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
 166{
 167    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
 168           & 0xf;
 169}
 170
 171static void set_refcount_ro2(void *refcount_array, uint64_t index,
 172                             uint64_t value)
 173{
 174    assert(!(value >> 4));
 175    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
 176    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
 177}
 178
 179static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
 180{
 181    return ((const uint8_t *)refcount_array)[index];
 182}
 183
 184static void set_refcount_ro3(void *refcount_array, uint64_t index,
 185                             uint64_t value)
 186{
 187    assert(!(value >> 8));
 188    ((uint8_t *)refcount_array)[index] = value;
 189}
 190
 191static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
 192{
 193    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
 194}
 195
 196static void set_refcount_ro4(void *refcount_array, uint64_t index,
 197                             uint64_t value)
 198{
 199    assert(!(value >> 16));
 200    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
 201}
 202
 203static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
 204{
 205    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
 206}
 207
 208static void set_refcount_ro5(void *refcount_array, uint64_t index,
 209                             uint64_t value)
 210{
 211    assert(!(value >> 32));
 212    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
 213}
 214
 215static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
 216{
 217    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
 218}
 219
 220static void set_refcount_ro6(void *refcount_array, uint64_t index,
 221                             uint64_t value)
 222{
 223    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
 224}
 225
 226
/*
 * Fetches the refcount block at @refcount_block_offset through the refcount
 * block cache. On success, *refcount_block points at the cached block data;
 * callers in this file release it again with qcow2_cache_put().
 *
 * Returns the result of qcow2_cache_get(): 0 on success, -errno on failure.
 */
static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset,
                               void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                           refcount_block);
}
 237
 238/*
 239 * Retrieves the refcount of the cluster given by its index and stores it in
 240 * *refcount. Returns 0 on success and -errno on failure.
 241 */
 242int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
 243                       uint64_t *refcount)
 244{
 245    BDRVQcow2State *s = bs->opaque;
 246    uint64_t refcount_table_index, block_index;
 247    int64_t refcount_block_offset;
 248    int ret;
 249    void *refcount_block;
 250
 251    refcount_table_index = cluster_index >> s->refcount_block_bits;
 252    if (refcount_table_index >= s->refcount_table_size) {
 253        *refcount = 0;
 254        return 0;
 255    }
 256    refcount_block_offset =
 257        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
 258    if (!refcount_block_offset) {
 259        *refcount = 0;
 260        return 0;
 261    }
 262
 263    if (offset_into_cluster(s, refcount_block_offset)) {
 264        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
 265                                " unaligned (reftable index: %#" PRIx64 ")",
 266                                refcount_block_offset, refcount_table_index);
 267        return -EIO;
 268    }
 269
 270    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
 271                          &refcount_block);
 272    if (ret < 0) {
 273        return ret;
 274    }
 275
 276    block_index = cluster_index & (s->refcount_block_size - 1);
 277    *refcount = s->get_refcount(refcount_block, block_index);
 278
 279    qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
 280
 281    return 0;
 282}
 283
 284/* Checks if two offsets are described by the same refcount block */
 285static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
 286    uint64_t offset_b)
 287{
 288    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
 289    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
 290
 291    return (block_a == block_b);
 292}
 293
/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case.
 *
 * Note: may return -EAGAIN even after successfully allocating new refcount
 * structures; the caller must then restart its search for free clusters,
 * because the clusters it had tentatively chosen may now hold the new
 * metadata (see the comments in the function body).
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int64_t ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            /* A refblock offset that is not cluster-aligned means the image
             * is corrupted */
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

             return load_refcount_block(bs, refcount_block_offset,
                                        refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe them themselves.
     *
     * - We need to consider that at this point we are inside update_refcounts
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
    if (new_block < 0) {
        return new_block;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail;
        }

        s->refcount_table[refcount_table_index] = new_block;
        /* If there's a hole in s->refcount_table then it can happen
         * that refcount_table_index < s->max_refcount_table_index */
        s->max_refcount_table_index =
            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
        s->cluster_size;

    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
                              refcount_table_index, new_block);
    if (ret < 0) {
        return ret;
    }

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail:
    if (*refcount_block != NULL) {
        qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
    }
    return ret;
}
 500
/*
 * Starting at @start_offset, this function creates new self-covering refcount
 * structures: A new refcount table and refcount blocks which cover all of
 * themselves, and a number of @additional_clusters beyond their end.
 * @start_offset must be at the end of the image file, that is, there must be
 * only empty space beyond it.
 * If @exact_size is false, the refcount table will have 50 % more entries than
 * necessary so it will not need to grow again soon.
 * If @new_refblock_offset is not zero, it contains the offset of a refcount
 * block that should be entered into the new refcount table at index
 * @new_refblock_index.
 *
 * Returns: The offset after the new refcount structures (i.e. where the
 *          @additional_clusters may be placed) on success, -errno on error.
 */
int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                            uint64_t additional_clusters, bool exact_size,
                            int new_refblock_index,
                            uint64_t new_refblock_offset)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t total_refblock_count_u64, additional_refblock_count;
    int total_refblock_count, table_size, area_reftable_index, table_clusters;
    int i;
    uint64_t table_offset, block_offset, end_offset;
    int ret;
    uint64_t *new_table;

    assert(!(start_offset % s->cluster_size));

    qcow2_refcount_metadata_size(start_offset / s->cluster_size +
                                 additional_clusters,
                                 s->cluster_size, s->refcount_order,
                                 !exact_size, &total_refblock_count_u64);
    if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }
    total_refblock_count = total_refblock_count_u64;

    /* Index in the refcount table of the first refcount block to cover the area
     * of refcount structures we are about to create; we know that
     * @total_refblock_count can cover @start_offset, so this will definitely
     * fit into an int. */
    area_reftable_index = (start_offset / s->cluster_size) /
                          s->refcount_block_size;

    if (exact_size) {
        table_size = total_refblock_count;
    } else {
        /* Over-allocate by 50 % so the reftable does not need to grow again
         * soon (see the function comment above) */
        table_size = total_refblock_count +
                     DIV_ROUND_UP(total_refblock_count, 2);
    }
    /* The qcow2 file can only store the reftable size in number of clusters */
    table_size = ROUND_UP(table_size, s->cluster_size / sizeof(uint64_t));
    table_clusters = (table_size * sizeof(uint64_t)) / s->cluster_size;

    if (table_size > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }

    new_table = g_try_new0(uint64_t, table_size);

    assert(table_size > 0);
    if (new_table == NULL) {
        ret = -ENOMEM;
        goto fail;
    }

    /* Fill the new refcount table */
    if (table_size > s->max_refcount_table_index) {
        /* We're actually growing the reftable */
        memcpy(new_table, s->refcount_table,
               (s->max_refcount_table_index + 1) * sizeof(uint64_t));
    } else {
        /* Improbable case: We're shrinking the reftable. However, the caller
         * has assured us that there is only empty space beyond @start_offset,
         * so we can simply drop all of the refblocks that won't fit into the
         * new reftable. */
        memcpy(new_table, s->refcount_table, table_size * sizeof(uint64_t));
    }

    if (new_refblock_offset) {
        assert(new_refblock_index < total_refblock_count);
        new_table[new_refblock_index] = new_refblock_offset;
    }

    /* Count how many new refblocks we have to create */
    additional_refblock_count = 0;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        if (!new_table[i]) {
            additional_refblock_count++;
        }
    }

    /* Layout: the new refblocks come first, the reftable after them */
    table_offset = start_offset + additional_refblock_count * s->cluster_size;
    end_offset = table_offset + table_clusters * s->cluster_size;

    /* Fill the refcount blocks, and create new ones, if necessary */
    block_offset = start_offset;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        void *refblock_data;
        uint64_t first_offset_covered;

        /* Reuse an existing refblock if possible, create a new one otherwise */
        if (new_table[i]) {
            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
                                  &refblock_data);
            if (ret < 0) {
                goto fail;
            }
        } else {
            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
                                        block_offset, &refblock_data);
            if (ret < 0) {
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
            qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
            block_offset += s->cluster_size;
        }

        /* First host offset covered by this refblock */
        first_offset_covered = (uint64_t)i * s->refcount_block_size *
                               s->cluster_size;
        if (first_offset_covered < end_offset) {
            int j, end_index;

            /* Set the refcount of all of the new refcount structures to 1 */

            if (first_offset_covered < start_offset) {
                assert(i == area_reftable_index);
                j = (start_offset - first_offset_covered) / s->cluster_size;
                assert(j < s->refcount_block_size);
            } else {
                j = 0;
            }

            end_index = MIN((end_offset - first_offset_covered) /
                            s->cluster_size,
                            s->refcount_block_size);

            for (; j < end_index; j++) {
                /* The caller guaranteed us this space would be empty */
                assert(s->get_refcount(refblock_data, j) == 0);
                s->set_refcount(refblock_data, j, 1);
            }

            qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
                                         refblock_data);
        }

        qcow2_cache_put(bs, s->refcount_block_cache, &refblock_data);
    }

    assert(block_offset == table_offset);

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Write refcount table to disk */
    for (i = 0; i < total_refblock_count; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
        table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail;
    }

    /* Convert the table back to host byte order for in-memory use */
    for (i = 0; i < total_refblock_count; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);

    return end_offset;

fail:
    g_free(new_table);
    return ret;
}
 718
 719void qcow2_process_discards(BlockDriverState *bs, int ret)
 720{
 721    BDRVQcow2State *s = bs->opaque;
 722    Qcow2DiscardRegion *d, *next;
 723
 724    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
 725        QTAILQ_REMOVE(&s->discards, d, next);
 726
 727        /* Discard is optional, ignore the return value */
 728        if (ret >= 0) {
 729            bdrv_pdiscard(bs->file->bs, d->offset, d->bytes);
 730        }
 731
 732        g_free(d);
 733    }
 734}
 735
/*
 * Queues the range [offset, offset + length) for discarding once it is safe
 * to do so (see qcow2_process_discards()). The range is merged into an
 * existing queue entry if the two are overlapping-or-adjacent; the asserts
 * below check that queued ranges never actually overlap, since they describe
 * clusters that have no references any more.
 */
static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        /* Merged span no larger than the two parts: ranges touch or overlap */
        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    /* No mergeable entry found: queue the range as a new region */
    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs     = bs,
        .offset = offset,
        .bytes  = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}
 785
 786/* XXX: cache several refcount block clusters ? */
/*
 * Adjusts the refcounts of all clusters in the range [@offset, @offset +
 * @length) by @addend.
 *
 * @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added.
 *
 * Clusters whose refcount drops to zero may be passed through to the discard
 * queue, depending on @type and the discard_passthrough settings.
 *
 * Returns 0 on success, -errno on failure; on failure, already-applied
 * updates are rolled back on a best-effort basis.
 */
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                                                   int64_t offset,
                                                   int64_t length,
                                                   uint64_t addend,
                                                   bool decrease,
                                                   enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t start, last, cluster_offset;
    void *refcount_block = NULL;
    /* Index of the refcount block currently held in refcount_block; -1 means
     * none is held yet */
    int64_t old_table_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
            addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    if (decrease) {
        /* Before freeing clusters, the L2 tables that stopped referencing
         * them must hit the disk; otherwise a crash could leave L2 entries
         * pointing at reallocated clusters */
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
            s->l2_table_cache);
    }

    /* Walk the affected range cluster by cluster */
    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size)
    {
        int block_index;
        uint64_t refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t table_index = cluster_index >> s->refcount_block_bits;

        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            if (refcount_block) {
                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
            }
            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            if (ret < 0) {
                goto fail;
            }
        }
        old_table_index = table_index;

        qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
                                     refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index & (s->refcount_block_size - 1);

        refcount = s->get_refcount(refcount_block, block_index);
        /* Reject underflow when decreasing and overflow (wraparound or
         * exceeding the maximum representable refcount) when increasing */
        if (decrease ? (refcount - addend > refcount)
                     : (refcount + addend < refcount ||
                        refcount + addend > s->refcount_max))
        {
            ret = -EINVAL;
            goto fail;
        }
        if (decrease) {
            refcount -= addend;
        } else {
            refcount += addend;
        }
        /* Remember the lowest freed cluster so the allocator can reuse it */
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->set_refcount(refcount_block, block_index, refcount);

        /* Queue a discard for clusters that became completely unreferenced */
        if (refcount == 0 && s->discard_passthrough[type]) {
            update_refcount_discard(bs, cluster_offset, s->cluster_size);
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
    }

    /*
     * Try to undo any updates if an error is returned (This may succeed in
     * some cases like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        /* cluster_offset marks how far the loop above got; only that prefix
         * of the range needs reverting */
        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
                                !decrease, QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}
 893
 894/*
 895 * Increases or decreases the refcount of a given cluster.
 896 *
 897 * @addend is the absolute value of the addend; if @decrease is set, @addend
 898 * will be subtracted from the current refcount, otherwise it will be added.
 899 *
 900 * On success 0 is returned; on failure -errno is returned.
 901 */
 902int qcow2_update_cluster_refcount(BlockDriverState *bs,
 903                                  int64_t cluster_index,
 904                                  uint64_t addend, bool decrease,
 905                                  enum qcow2_discard_type type)
 906{
 907    BDRVQcow2State *s = bs->opaque;
 908    int ret;
 909
 910    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
 911                          decrease, type);
 912    if (ret < 0) {
 913        return ret;
 914    }
 915
 916    return 0;
 917}
 918
 919
 920
 921/*********************************************************/
 922/* cluster allocation functions */
 923
 924
 925
 926/* return < 0 if error */
 927static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
 928{
 929    BDRVQcow2State *s = bs->opaque;
 930    uint64_t i, nb_clusters, refcount;
 931    int ret;
 932
 933    /* We can't allocate clusters if they may still be queued for discard. */
 934    if (s->cache_discards) {
 935        qcow2_process_discards(bs, 0);
 936    }
 937
 938    nb_clusters = size_to_clusters(s, size);
 939retry:
 940    for(i = 0; i < nb_clusters; i++) {
 941        uint64_t next_cluster_index = s->free_cluster_index++;
 942        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
 943
 944        if (ret < 0) {
 945            return ret;
 946        } else if (refcount != 0) {
 947            goto retry;
 948        }
 949    }
 950
 951    /* Make sure that all offsets in the "allocated" range are representable
 952     * in an int64_t */
 953    if (s->free_cluster_index > 0 &&
 954        s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
 955    {
 956        return -EFBIG;
 957    }
 958
 959#ifdef DEBUG_ALLOC2
 960    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
 961            size,
 962            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
 963#endif
 964    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
 965}
 966
 967int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
 968{
 969    int64_t offset;
 970    int ret;
 971
 972    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
 973    do {
 974        offset = alloc_clusters_noref(bs, size);
 975        if (offset < 0) {
 976            return offset;
 977        }
 978
 979        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
 980    } while (ret == -EAGAIN);
 981
 982    if (ret < 0) {
 983        return ret;
 984    }
 985
 986    return offset;
 987}
 988
 989int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
 990                                int64_t nb_clusters)
 991{
 992    BDRVQcow2State *s = bs->opaque;
 993    uint64_t cluster_index, refcount;
 994    uint64_t i;
 995    int ret;
 996
 997    assert(nb_clusters >= 0);
 998    if (nb_clusters == 0) {
 999        return 0;
1000    }
1001
1002    do {
1003        /* Check how many clusters there are free */
1004        cluster_index = offset >> s->cluster_bits;
1005        for(i = 0; i < nb_clusters; i++) {
1006            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
1007            if (ret < 0) {
1008                return ret;
1009            } else if (refcount != 0) {
1010                break;
1011            }
1012        }
1013
1014        /* And then allocate them */
1015        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
1016                              QCOW2_DISCARD_NEVER);
1017    } while (ret == -EAGAIN);
1018
1019    if (ret < 0) {
1020        return ret;
1021    }
1022
1023    return i;
1024}
1025
/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    /* free_byte_offset == 0 means "no partially used cluster cached";
     * otherwise it must point into the middle of a cluster */
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    /* Resume sub-cluster allocation where the previous call left off */
    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        /* Each allocation from this cluster bumps its refcount below; if it
         * is already at the maximum, start a fresh cluster instead */
        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        /* Need a (new or additional) cluster if there is no cached position
         * or the remaining space doesn't fit the request */
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                /* The new cluster is not adjacent to the cached position;
                 * abandon the old tail and allocate from the new cluster */
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                /* The new cluster directly follows the cached position, so
                 * the allocation may span the cluster boundary */
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            /* On failure (including -EAGAIN) restart without a cached
             * position so the next iteration allocates cleanly */
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    /* Remember the first unused byte for the next call; a fully used cluster
     * clears the cache */
    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}
1090
1091void qcow2_free_clusters(BlockDriverState *bs,
1092                          int64_t offset, int64_t size,
1093                          enum qcow2_discard_type type)
1094{
1095    int ret;
1096
1097    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
1098    ret = update_refcount(bs, offset, size, 1, true, type);
1099    if (ret < 0) {
1100        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
1101        /* TODO Remember the clusters to free them later and avoid leaking */
1102    }
1103}
1104
1105/*
1106 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1107 * normal cluster, compressed cluster, etc.)
1108 */
1109void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
1110                             int nb_clusters, enum qcow2_discard_type type)
1111{
1112    BDRVQcow2State *s = bs->opaque;
1113
1114    switch (qcow2_get_cluster_type(l2_entry)) {
1115    case QCOW2_CLUSTER_COMPRESSED:
1116        {
1117            int nb_csectors;
1118            nb_csectors = ((l2_entry >> s->csize_shift) &
1119                           s->csize_mask) + 1;
1120            qcow2_free_clusters(bs,
1121                (l2_entry & s->cluster_offset_mask) & ~511,
1122                nb_csectors * 512, type);
1123        }
1124        break;
1125    case QCOW2_CLUSTER_NORMAL:
1126    case QCOW2_CLUSTER_ZERO_ALLOC:
1127        if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1128            qcow2_signal_corruption(bs, false, -1, -1,
1129                                    "Cannot free unaligned cluster %#llx",
1130                                    l2_entry & L2E_OFFSET_MASK);
1131        } else {
1132            qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1133                                nb_clusters << s->cluster_bits, type);
1134        }
1135        break;
1136    case QCOW2_CLUSTER_ZERO_PLAIN:
1137    case QCOW2_CLUSTER_UNALLOCATED:
1138        break;
1139    default:
1140        abort();
1141    }
1142}
1143
1144
1145
1146/*********************************************************/
1147/* snapshots and image creation */
1148
1149
1150
/*
 * Updates the refcounts of all clusters reachable from the L1 table at
 * @l1_table_offset (length @l1_size entries) by @addend (-1, 0 or +1), and
 * recomputes the QCOW_OFLAG_COPIED flag of every L1 and L2 entry from the
 * resulting refcounts. With addend == 0 only the COPIED flags are refreshed.
 *
 * Used when creating/deleting snapshots and when switching between them.
 * Returns 0 on success, -errno on failure.
 */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, entry, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_entry, old_l2_offset;
    int i, j, l1_modified = 0, nb_csectors;
    int ret;

    assert(addend >= -1 && addend <= 1);

    l2_table = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);

    /* Batch up discards until the whole update is done */
    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        /* Inactive L1 table: read it from disk and byteswap to host order */
        l1_table = g_try_malloc0(align_offset(l1_size2, 512));
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        /* Active L1 table: operate on the in-memory copy directly */
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                (void**) &l2_table);
            if (ret < 0) {
                goto fail;
            }

            /* Update the refcount and COPIED flag of every L2 entry */
            for (j = 0; j < s->l2_size; j++) {
                uint64_t cluster_index;
                uint64_t offset;

                entry = be64_to_cpu(l2_table[j]);
                old_entry = entry;
                /* COPIED is recomputed from the refcount below */
                entry &= ~QCOW_OFLAG_COPIED;
                offset = entry & L2E_OFFSET_MASK;

                switch (qcow2_get_cluster_type(entry)) {
                case QCOW2_CLUSTER_COMPRESSED:
                    nb_csectors = ((entry >> s->csize_shift) &
                                   s->csize_mask) + 1;
                    if (addend != 0) {
                        ret = update_refcount(bs,
                                (entry & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                        if (ret < 0) {
                            goto fail;
                        }
                    }
                    /* compressed clusters are never modified */
                    refcount = 2;
                    break;

                case QCOW2_CLUSTER_NORMAL:
                case QCOW2_CLUSTER_ZERO_ALLOC:
                    if (offset_into_cluster(s, offset)) {
                        qcow2_signal_corruption(bs, true, -1, -1, "Cluster "
                                                "allocation offset %#" PRIx64
                                                " unaligned (L2 offset: %#"
                                                PRIx64 ", L2 index: %#x)",
                                                offset, l2_offset, j);
                        ret = -EIO;
                        goto fail;
                    }

                    cluster_index = offset >> s->cluster_bits;
                    assert(cluster_index);
                    if (addend != 0) {
                        ret = qcow2_update_cluster_refcount(bs,
                                    cluster_index, abs(addend), addend < 0,
                                    QCOW2_DISCARD_SNAPSHOT);
                        if (ret < 0) {
                            goto fail;
                        }
                    }

                    /* Re-read the refcount to decide the COPIED flag */
                    ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                    if (ret < 0) {
                        goto fail;
                    }
                    break;

                case QCOW2_CLUSTER_ZERO_PLAIN:
                case QCOW2_CLUSTER_UNALLOCATED:
                    refcount = 0;
                    break;

                default:
                    abort();
                }

                /* refcount == 1 means the cluster is used only here, so it
                 * may be written to in place */
                if (refcount == 1) {
                    entry |= QCOW_OFLAG_COPIED;
                }
                if (entry != old_entry) {
                    if (addend > 0) {
                        /* New references: refcounts must reach the disk
                         * before the L2 entries that rely on them */
                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                            s->refcount_block_cache);
                    }
                    l2_table[j] = cpu_to_be64(entry);
                    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
                                                 l2_table);
                }
            }

            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

            /* The L2 table itself is also (de)referenced by the snapshot */
            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_table) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    }

    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        /* Byteswap to big endian for the on-disk write, then back so the
         * in-memory table stays in host order */
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
                               l1_table, l1_size2);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated)
        g_free(l1_table);
    return ret;
}
1344
1345
1346
1347
1348/*********************************************************/
1349/* refcount checking functions */
1350
1351
1352static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1353{
1354    /* This assertion holds because there is no way we can address more than
1355     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1356     * offsets have to be representable in bytes); due to every cluster
1357     * corresponding to one refcount entry, we are well below that limit */
1358    assert(entries < (UINT64_C(1) << (64 - 9)));
1359
1360    /* Thanks to the assertion this will not overflow, because
1361     * s->refcount_order < 7.
1362     * (note: x << s->refcount_order == x * s->refcount_bits) */
1363    return DIV_ROUND_UP(entries << s->refcount_order, 8);
1364}
1365
1366/**
1367 * Reallocates *array so that it can hold new_size entries. *size must contain
1368 * the current number of entries in *array. If the reallocation fails, *array
1369 * and *size will not be modified and -errno will be returned. If the
1370 * reallocation is successful, *array will be set to the new buffer, *size
1371 * will be set to new_size and 0 will be returned. The size of the reallocated
1372 * refcount array buffer will be aligned to a cluster boundary, and the newly
1373 * allocated area will be zeroed.
1374 */
1375static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1376                                  int64_t *size, int64_t new_size)
1377{
1378    int64_t old_byte_size, new_byte_size;
1379    void *new_ptr;
1380
1381    /* Round to clusters so the array can be directly written to disk */
1382    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1383                    * s->cluster_size;
1384    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1385                    * s->cluster_size;
1386
1387    if (new_byte_size == old_byte_size) {
1388        *size = new_size;
1389        return 0;
1390    }
1391
1392    assert(new_byte_size > 0);
1393
1394    if (new_byte_size > SIZE_MAX) {
1395        return -ENOMEM;
1396    }
1397
1398    new_ptr = g_try_realloc(*array, new_byte_size);
1399    if (!new_ptr) {
1400        return -ENOMEM;
1401    }
1402
1403    if (new_byte_size > old_byte_size) {
1404        memset((char *)new_ptr + old_byte_size, 0,
1405               new_byte_size - old_byte_size);
1406    }
1407
1408    *array = new_ptr;
1409    *size  = new_size;
1410
1411    return 0;
1412}
1413
1414/*
1415 * Increases the refcount for a range of clusters in a given refcount table.
1416 * This is used to construct a temporary refcount table out of L1 and L2 tables
1417 * which can be compared to the refcount table saved in the image.
1418 *
1419 * Modifies the number of errors in res.
1420 */
1421int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
1422                             void **refcount_table,
1423                             int64_t *refcount_table_size,
1424                             int64_t offset, int64_t size)
1425{
1426    BDRVQcow2State *s = bs->opaque;
1427    uint64_t start, last, cluster_offset, k, refcount;
1428    int ret;
1429
1430    if (size <= 0) {
1431        return 0;
1432    }
1433
1434    start = start_of_cluster(s, offset);
1435    last = start_of_cluster(s, offset + size - 1);
1436    for(cluster_offset = start; cluster_offset <= last;
1437        cluster_offset += s->cluster_size) {
1438        k = cluster_offset >> s->cluster_bits;
1439        if (k >= *refcount_table_size) {
1440            ret = realloc_refcount_array(s, refcount_table,
1441                                         refcount_table_size, k + 1);
1442            if (ret < 0) {
1443                res->check_errors++;
1444                return ret;
1445            }
1446        }
1447
1448        refcount = s->get_refcount(*refcount_table, k);
1449        if (refcount == s->refcount_max) {
1450            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1451                    "\n", cluster_offset);
1452            fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1453                    "width or qemu-img convert to create a clean copy if the "
1454                    "image cannot be opened for writing\n");
1455            res->corruptions++;
1456            continue;
1457        }
1458        s->set_refcount(*refcount_table, k, refcount + 1);
1459    }
1460
1461    return 0;
1462}
1463
/* Flags for check_refcounts_l1() and check_refcounts_l2(); combinable as a
 * bitmask (NOTE(review): 0x1 is not defined in this chunk — presumably used
 * or reserved elsewhere; confirm before reusing it) */
enum {
    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
};
1468
1469/*
1470 * Increases the refcount in the given refcount table for the all clusters
1471 * referenced in the L2 table. While doing so, performs some checks on L2
1472 * entries.
1473 *
1474 * Returns the number of errors found by the checks or -errno if an internal
1475 * error occurred.
1476 */
1477static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1478                              void **refcount_table,
1479                              int64_t *refcount_table_size, int64_t l2_offset,
1480                              int flags)
1481{
1482    BDRVQcow2State *s = bs->opaque;
1483    uint64_t *l2_table, l2_entry;
1484    uint64_t next_contiguous_offset = 0;
1485    int i, l2_size, nb_csectors, ret;
1486
1487    /* Read L2 table from disk */
1488    l2_size = s->l2_size * sizeof(uint64_t);
1489    l2_table = g_malloc(l2_size);
1490
1491    ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
1492    if (ret < 0) {
1493        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1494        res->check_errors++;
1495        goto fail;
1496    }
1497
1498    /* Do the actual checks */
1499    for(i = 0; i < s->l2_size; i++) {
1500        l2_entry = be64_to_cpu(l2_table[i]);
1501
1502        switch (qcow2_get_cluster_type(l2_entry)) {
1503        case QCOW2_CLUSTER_COMPRESSED:
1504            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1505            if (l2_entry & QCOW_OFLAG_COPIED) {
1506                fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1507                    "copied flag must never be set for compressed "
1508                    "clusters\n", l2_entry >> s->cluster_bits);
1509                l2_entry &= ~QCOW_OFLAG_COPIED;
1510                res->corruptions++;
1511            }
1512
1513            /* Mark cluster as used */
1514            nb_csectors = ((l2_entry >> s->csize_shift) &
1515                           s->csize_mask) + 1;
1516            l2_entry &= s->cluster_offset_mask;
1517            ret = qcow2_inc_refcounts_imrt(bs, res,
1518                                           refcount_table, refcount_table_size,
1519                                           l2_entry & ~511, nb_csectors * 512);
1520            if (ret < 0) {
1521                goto fail;
1522            }
1523
1524            if (flags & CHECK_FRAG_INFO) {
1525                res->bfi.allocated_clusters++;
1526                res->bfi.compressed_clusters++;
1527
1528                /* Compressed clusters are fragmented by nature.  Since they
1529                 * take up sub-sector space but we only have sector granularity
1530                 * I/O we need to re-read the same sectors even for adjacent
1531                 * compressed clusters.
1532                 */
1533                res->bfi.fragmented_clusters++;
1534            }
1535            break;
1536
1537        case QCOW2_CLUSTER_ZERO_ALLOC:
1538        case QCOW2_CLUSTER_NORMAL:
1539        {
1540            uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1541
1542            if (flags & CHECK_FRAG_INFO) {
1543                res->bfi.allocated_clusters++;
1544                if (next_contiguous_offset &&
1545                    offset != next_contiguous_offset) {
1546                    res->bfi.fragmented_clusters++;
1547                }
1548                next_contiguous_offset = offset + s->cluster_size;
1549            }
1550
1551            /* Mark cluster as used */
1552            ret = qcow2_inc_refcounts_imrt(bs, res,
1553                                           refcount_table, refcount_table_size,
1554                                           offset, s->cluster_size);
1555            if (ret < 0) {
1556                goto fail;
1557            }
1558
1559            /* Correct offsets are cluster aligned */
1560            if (offset_into_cluster(s, offset)) {
1561                fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
1562                    "properly aligned; L2 entry corrupted.\n", offset);
1563                res->corruptions++;
1564            }
1565            break;
1566        }
1567
1568        case QCOW2_CLUSTER_ZERO_PLAIN:
1569        case QCOW2_CLUSTER_UNALLOCATED:
1570            break;
1571
1572        default:
1573            abort();
1574        }
1575    }
1576
1577    g_free(l2_table);
1578    return 0;
1579
1580fail:
1581    g_free(l2_table);
1582    return ret;
1583}
1584
1585/*
1586 * Increases the refcount for the L1 table, its L2 tables and all referenced
1587 * clusters in the given refcount table. While doing so, performs some checks
1588 * on L1 and L2 entries.
1589 *
1590 * Returns the number of errors found by the checks or -errno if an internal
1591 * error occurred.
1592 */
1593static int check_refcounts_l1(BlockDriverState *bs,
1594                              BdrvCheckResult *res,
1595                              void **refcount_table,
1596                              int64_t *refcount_table_size,
1597                              int64_t l1_table_offset, int l1_size,
1598                              int flags)
1599{
1600    BDRVQcow2State *s = bs->opaque;
1601    uint64_t *l1_table = NULL, l2_offset, l1_size2;
1602    int i, ret;
1603
1604    l1_size2 = l1_size * sizeof(uint64_t);
1605
1606    /* Mark L1 table as used */
1607    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1608                                   l1_table_offset, l1_size2);
1609    if (ret < 0) {
1610        goto fail;
1611    }
1612
1613    /* Read L1 table entries from disk */
1614    if (l1_size2 > 0) {
1615        l1_table = g_try_malloc(l1_size2);
1616        if (l1_table == NULL) {
1617            ret = -ENOMEM;
1618            res->check_errors++;
1619            goto fail;
1620        }
1621        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1622        if (ret < 0) {
1623            fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1624            res->check_errors++;
1625            goto fail;
1626        }
1627        for(i = 0;i < l1_size; i++)
1628            be64_to_cpus(&l1_table[i]);
1629    }
1630
1631    /* Do the actual checks */
1632    for(i = 0; i < l1_size; i++) {
1633        l2_offset = l1_table[i];
1634        if (l2_offset) {
1635            /* Mark L2 table as used */
1636            l2_offset &= L1E_OFFSET_MASK;
1637            ret = qcow2_inc_refcounts_imrt(bs, res,
1638                                           refcount_table, refcount_table_size,
1639                                           l2_offset, s->cluster_size);
1640            if (ret < 0) {
1641                goto fail;
1642            }
1643
1644            /* L2 tables are cluster aligned */
1645            if (offset_into_cluster(s, l2_offset)) {
1646                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1647                    "cluster aligned; L1 entry corrupted\n", l2_offset);
1648                res->corruptions++;
1649            }
1650
1651            /* Process and check L2 entries */
1652            ret = check_refcounts_l2(bs, res, refcount_table,
1653                                     refcount_table_size, l2_offset, flags);
1654            if (ret < 0) {
1655                goto fail;
1656            }
1657        }
1658    }
1659    g_free(l1_table);
1660    return 0;
1661
1662fail:
1663    g_free(l1_table);
1664    return ret;
1665}
1666
1667/*
1668 * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1669 *
1670 * This function does not print an error message nor does it increment
1671 * check_errors if qcow2_get_refcount fails (this is because such an error will
1672 * have been already detected and sufficiently signaled by the calling function
1673 * (qcow2_check_refcounts) by the time this function is called).
1674 */
static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
                              BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    /* Scratch buffer for one full L2 table (one cluster) */
    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
    int ret;
    uint64_t refcount;
    int i, j;

    /* Walk the active L1 table; each entry points to one L2 table */
    for (i = 0; i < s->l1_size; i++) {
        uint64_t l1_entry = s->l1_table[i];
        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
        bool l2_dirty = false;

        if (!l2_offset) {
            /* Unallocated L1 entry, nothing to check */
            continue;
        }

        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &refcount);
        if (ret < 0) {
            /* don't print message nor increment check_errors */
            continue;
        }
        /* Invariant: QCOW_OFLAG_COPIED must be set iff the L2 table's
         * refcount is exactly 1 (i.e. it is not shared with a snapshot) */
        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                    fix & BDRV_FIX_ERRORS ? "Repairing" :
                                            "ERROR",
                    i, l1_entry, refcount);
            if (fix & BDRV_FIX_ERRORS) {
                /* Make the flag match the refcount and persist the
                 * corrected L1 entry */
                s->l1_table[i] = refcount == 1
                               ? l1_entry |  QCOW_OFLAG_COPIED
                               : l1_entry & ~QCOW_OFLAG_COPIED;
                ret = qcow2_write_l1_entry(bs, i);
                if (ret < 0) {
                    res->check_errors++;
                    goto fail;
                }
                res->corruptions_fixed++;
            } else {
                res->corruptions++;
            }
        }

        /* Read the whole L2 table so its entries can be checked as well */
        ret = bdrv_pread(bs->file, l2_offset, l2_table,
                         s->l2_size * sizeof(uint64_t));
        if (ret < 0) {
            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
                    strerror(-ret));
            res->check_errors++;
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
            QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);

            /* Only allocated clusters carry a meaningful OFLAG_COPIED;
             * compressed/unallocated/zero-plain entries are skipped */
            if (cluster_type == QCOW2_CLUSTER_NORMAL ||
                cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
                ret = qcow2_get_refcount(bs,
                                         data_offset >> s->cluster_bits,
                                         &refcount);
                if (ret < 0) {
                    /* don't print message nor increment check_errors */
                    continue;
                }
                /* Same invariant as for L1 entries above */
                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                            fix & BDRV_FIX_ERRORS ? "Repairing" :
                                                    "ERROR",
                            l2_entry, refcount);
                    if (fix & BDRV_FIX_ERRORS) {
                        /* Fix the in-memory copy; the table is written back
                         * once after the loop if anything changed */
                        l2_table[j] = cpu_to_be64(refcount == 1
                                    ? l2_entry |  QCOW_OFLAG_COPIED
                                    : l2_entry & ~QCOW_OFLAG_COPIED);
                        l2_dirty = true;
                        res->corruptions_fixed++;
                    } else {
                        res->corruptions++;
                    }
                }
            }
        }

        if (l2_dirty) {
            /* Writing metadata in place: make sure we do not clobber any
             * other metadata structure first */
            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
                                                l2_offset, s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
                        "overlap check failed: %s\n", strerror(-ret));
                res->check_errors++;
                goto fail;
            }

            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
                              s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
                        strerror(-ret));
                res->check_errors++;
                goto fail;
            }
        }
    }

    ret = 0;

fail:
    qemu_vfree(l2_table);
    return ret;
}
1789
1790/*
1791 * Checks consistency of refblocks and accounts for each refblock in
1792 * *refcount_table.
1793 */
static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                           BdrvCheckMode fix, bool *rebuild,
                           void **refcount_table, int64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t i, size;
    int ret;

    for(i = 0; i < s->refcount_table_size; i++) {
        uint64_t offset, cluster;
        offset = s->refcount_table[i];
        cluster = offset >> s->cluster_bits;

        /* Refcount blocks are cluster aligned */
        if (offset_into_cluster(s, offset)) {
            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
                "cluster aligned; refcount table entry corrupted\n", i);
            res->corruptions++;
            *rebuild = true;
            continue;
        }

        /* Refblock lies beyond the current image end */
        if (cluster >= *nb_clusters) {
            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);

            if (fix & BDRV_FIX_ERRORS) {
                int64_t new_nb_clusters;
                Error *local_err = NULL;

                /* Guard against overflow before growing the file */
                if (offset > INT64_MAX - s->cluster_size) {
                    ret = -EINVAL;
                    goto resize_fail;
                }

                /* Grow the image file so the refblock is covered again */
                ret = bdrv_truncate(bs->file, offset + s->cluster_size,
                                    PREALLOC_MODE_OFF, &local_err);
                if (ret < 0) {
                    error_report_err(local_err);
                    goto resize_fail;
                }
                size = bdrv_getlength(bs->file->bs);
                if (size < 0) {
                    ret = size;
                    goto resize_fail;
                }

                new_nb_clusters = size_to_clusters(s, size);
                assert(new_nb_clusters >= *nb_clusters);

                /* The in-memory refcount table must track the new size */
                ret = realloc_refcount_array(s, refcount_table,
                                             nb_clusters, new_nb_clusters);
                if (ret < 0) {
                    res->check_errors++;
                    return ret;
                }

                /* If the refblock still does not fit, the truncate did not
                 * help; give up on repairing this entry */
                if (cluster >= *nb_clusters) {
                    ret = -EINVAL;
                    goto resize_fail;
                }

                res->corruptions_fixed++;
                ret = qcow2_inc_refcounts_imrt(bs, res,
                                               refcount_table, nb_clusters,
                                               offset, s->cluster_size);
                if (ret < 0) {
                    return ret;
                }
                /* No need to check whether the refcount is now greater than 1:
                 * This area was just allocated and zeroed, so it can only be
                 * exactly 1 after qcow2_inc_refcounts_imrt() */
                continue;

resize_fail:
                /* Repair attempt failed; record the corruption and request a
                 * full rebuild of the refcount structures */
                res->corruptions++;
                *rebuild = true;
                fprintf(stderr, "ERROR could not resize image: %s\n",
                        strerror(-ret));
            } else {
                res->corruptions++;
            }
            continue;
        }

        if (offset != 0) {
            /* Account for the refblock itself in the IMRT */
            ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
                                           offset, s->cluster_size);
            if (ret < 0) {
                return ret;
            }
            /* A refblock must be referenced exactly once (by its reftable
             * entry); anything else means the reftable is inconsistent */
            if (s->get_refcount(*refcount_table, cluster) != 1) {
                fprintf(stderr, "ERROR refcount block %" PRId64
                        " refcount=%" PRIu64 "\n", i,
                        s->get_refcount(*refcount_table, cluster));
                res->corruptions++;
                *rebuild = true;
            }
        }
    }

    return 0;
}
1897
1898/*
1899 * Calculates an in-memory refcount table.
1900 */
1901static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1902                               BdrvCheckMode fix, bool *rebuild,
1903                               void **refcount_table, int64_t *nb_clusters)
1904{
1905    BDRVQcow2State *s = bs->opaque;
1906    int64_t i;
1907    QCowSnapshot *sn;
1908    int ret;
1909
1910    if (!*refcount_table) {
1911        int64_t old_size = 0;
1912        ret = realloc_refcount_array(s, refcount_table,
1913                                     &old_size, *nb_clusters);
1914        if (ret < 0) {
1915            res->check_errors++;
1916            return ret;
1917        }
1918    }
1919
1920    /* header */
1921    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
1922                                   0, s->cluster_size);
1923    if (ret < 0) {
1924        return ret;
1925    }
1926
1927    /* current L1 table */
1928    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1929                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1930    if (ret < 0) {
1931        return ret;
1932    }
1933
1934    /* snapshots */
1935    for (i = 0; i < s->nb_snapshots; i++) {
1936        sn = s->snapshots + i;
1937        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1938                                 sn->l1_table_offset, sn->l1_size, 0);
1939        if (ret < 0) {
1940            return ret;
1941        }
1942    }
1943    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
1944                                   s->snapshots_offset, s->snapshots_size);
1945    if (ret < 0) {
1946        return ret;
1947    }
1948
1949    /* refcount data */
1950    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
1951                                   s->refcount_table_offset,
1952                                   s->refcount_table_size * sizeof(uint64_t));
1953    if (ret < 0) {
1954        return ret;
1955    }
1956
1957    /* encryption */
1958    if (s->crypto_header.length) {
1959        ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
1960                                       s->crypto_header.offset,
1961                                       s->crypto_header.length);
1962        if (ret < 0) {
1963            return ret;
1964        }
1965    }
1966
1967    /* bitmaps */
1968    ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
1969    if (ret < 0) {
1970        return ret;
1971    }
1972
1973    return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
1974}
1975
1976/*
1977 * Compares the actual reference count for each cluster in the image against the
1978 * refcount as reported by the refcount structures on-disk.
1979 */
1980static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1981                              BdrvCheckMode fix, bool *rebuild,
1982                              int64_t *highest_cluster,
1983                              void *refcount_table, int64_t nb_clusters)
1984{
1985    BDRVQcow2State *s = bs->opaque;
1986    int64_t i;
1987    uint64_t refcount1, refcount2;
1988    int ret;
1989
1990    for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
1991        ret = qcow2_get_refcount(bs, i, &refcount1);
1992        if (ret < 0) {
1993            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
1994                    i, strerror(-ret));
1995            res->check_errors++;
1996            continue;
1997        }
1998
1999        refcount2 = s->get_refcount(refcount_table, i);
2000
2001        if (refcount1 > 0 || refcount2 > 0) {
2002            *highest_cluster = i;
2003        }
2004
2005        if (refcount1 != refcount2) {
2006            /* Check if we're allowed to fix the mismatch */
2007            int *num_fixed = NULL;
2008            if (refcount1 == 0) {
2009                *rebuild = true;
2010            } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2011                num_fixed = &res->leaks_fixed;
2012            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2013                num_fixed = &res->corruptions_fixed;
2014            }
2015
2016            fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2017                    " reference=%" PRIu64 "\n",
2018                   num_fixed != NULL     ? "Repairing" :
2019                   refcount1 < refcount2 ? "ERROR" :
2020                                           "Leaked",
2021                   i, refcount1, refcount2);
2022
2023            if (num_fixed) {
2024                ret = update_refcount(bs, i << s->cluster_bits, 1,
2025                                      refcount_diff(refcount1, refcount2),
2026                                      refcount1 > refcount2,
2027                                      QCOW2_DISCARD_ALWAYS);
2028                if (ret >= 0) {
2029                    (*num_fixed)++;
2030                    continue;
2031                }
2032            }
2033
2034            /* And if we couldn't, print an error */
2035            if (refcount1 < refcount2) {
2036                res->corruptions++;
2037            } else {
2038                res->leaks++;
2039            }
2040        }
2041    }
2042}
2043
2044/*
2045 * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2046 * the on-disk refcount structures.
2047 *
2048 * On input, *first_free_cluster tells where to start looking, and need not
2049 * actually be a free cluster; the returned offset will not be before that
2050 * cluster.  On output, *first_free_cluster points to the first gap found, even
2051 * if that gap was too small to be used as the returned offset.
2052 *
2053 * Note that *first_free_cluster is a cluster index whereas the return value is
2054 * an offset.
2055 */
2056static int64_t alloc_clusters_imrt(BlockDriverState *bs,
2057                                   int cluster_count,
2058                                   void **refcount_table,
2059                                   int64_t *imrt_nb_clusters,
2060                                   int64_t *first_free_cluster)
2061{
2062    BDRVQcow2State *s = bs->opaque;
2063    int64_t cluster = *first_free_cluster, i;
2064    bool first_gap = true;
2065    int contiguous_free_clusters;
2066    int ret;
2067
2068    /* Starting at *first_free_cluster, find a range of at least cluster_count
2069     * continuously free clusters */
2070    for (contiguous_free_clusters = 0;
2071         cluster < *imrt_nb_clusters &&
2072         contiguous_free_clusters < cluster_count;
2073         cluster++)
2074    {
2075        if (!s->get_refcount(*refcount_table, cluster)) {
2076            contiguous_free_clusters++;
2077            if (first_gap) {
2078                /* If this is the first free cluster found, update
2079                 * *first_free_cluster accordingly */
2080                *first_free_cluster = cluster;
2081                first_gap = false;
2082            }
2083        } else if (contiguous_free_clusters) {
2084            contiguous_free_clusters = 0;
2085        }
2086    }
2087
2088    /* If contiguous_free_clusters is greater than zero, it contains the number
2089     * of continuously free clusters until the current cluster; the first free
2090     * cluster in the current "gap" is therefore
2091     * cluster - contiguous_free_clusters */
2092
2093    /* If no such range could be found, grow the in-memory refcount table
2094     * accordingly to append free clusters at the end of the image */
2095    if (contiguous_free_clusters < cluster_count) {
2096        /* contiguous_free_clusters clusters are already empty at the image end;
2097         * we need cluster_count clusters; therefore, we have to allocate
2098         * cluster_count - contiguous_free_clusters new clusters at the end of
2099         * the image (which is the current value of cluster; note that cluster
2100         * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
2101         * the image end) */
2102        ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
2103                                     cluster + cluster_count
2104                                     - contiguous_free_clusters);
2105        if (ret < 0) {
2106            return ret;
2107        }
2108    }
2109
2110    /* Go back to the first free cluster */
2111    cluster -= contiguous_free_clusters;
2112    for (i = 0; i < cluster_count; i++) {
2113        s->set_refcount(*refcount_table, cluster + i, 1);
2114    }
2115
2116    return cluster << s->cluster_bits;
2117}
2118
2119/*
2120 * Creates a new refcount structure based solely on the in-memory information
2121 * given through *refcount_table. All necessary allocations will be reflected
2122 * in that array.
2123 *
2124 * On success, the old refcount structure is leaked (it will be covered by the
2125 * new refcount structure).
2126 */
static int rebuild_refcount_structure(BlockDriverState *bs,
                                      BdrvCheckResult *res,
                                      void **refcount_table,
                                      int64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
    int64_t refblock_offset, refblock_start, refblock_index;
    uint32_t reftable_size = 0;
    uint64_t *on_disk_reftable = NULL;
    void *on_disk_refblock;
    int ret = 0;
    /* On-disk layout of the header fields updated at the end; packed so it
     * can be written with a single bdrv_pwrite_sync() */
    struct {
        uint64_t reftable_offset;
        uint32_t reftable_clusters;
    } QEMU_PACKED reftable_offset_and_clusters;

    /* The cached refblocks belong to the old structure; drop them */
    qcow2_cache_empty(bs, s->refcount_block_cache);

write_refblocks:
    /* Write one refblock per refblock-sized window of the IMRT that contains
     * at least one used cluster.  Allocating a refblock (or the reftable)
     * modifies the IMRT itself, which is why this may loop back here. */
    for (; cluster < *nb_clusters; cluster++) {
        if (!s->get_refcount(*refcount_table, cluster)) {
            continue;
        }

        refblock_index = cluster >> s->refcount_block_bits;
        refblock_start = refblock_index << s->refcount_block_bits;

        /* Don't allocate a cluster in a refblock already written to disk */
        if (first_free_cluster < refblock_start) {
            first_free_cluster = refblock_start;
        }
        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
                                              nb_clusters, &first_free_cluster);
        if (refblock_offset < 0) {
            fprintf(stderr, "ERROR allocating refblock: %s\n",
                    strerror(-refblock_offset));
            res->check_errors++;
            ret = refblock_offset;
            goto fail;
        }

        /* Grow the in-memory reftable if this refblock does not fit yet */
        if (reftable_size <= refblock_index) {
            uint32_t old_reftable_size = reftable_size;
            uint64_t *new_on_disk_reftable;

            /* Round the reftable size up to a whole number of clusters */
            reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
                                     s->cluster_size) / sizeof(uint64_t);
            new_on_disk_reftable = g_try_realloc(on_disk_reftable,
                                                 reftable_size *
                                                 sizeof(uint64_t));
            if (!new_on_disk_reftable) {
                res->check_errors++;
                ret = -ENOMEM;
                goto fail;
            }
            on_disk_reftable = new_on_disk_reftable;

            /* Zero the newly added entries */
            memset(on_disk_reftable + old_reftable_size, 0,
                   (reftable_size - old_reftable_size) * sizeof(uint64_t));

            /* The offset we have for the reftable is now no longer valid;
             * this will leak that range, but we can easily fix that by running
             * a leak-fixing check after this rebuild operation */
            reftable_offset = -1;
        } else {
            assert(on_disk_reftable);
        }
        on_disk_reftable[refblock_index] = refblock_offset;

        /* If this is apparently the last refblock (for now), try to squeeze the
         * reftable in */
        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
            reftable_offset < 0)
        {
            uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
                                                          sizeof(uint64_t));
            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                                  refcount_table, nb_clusters,
                                                  &first_free_cluster);
            if (reftable_offset < 0) {
                fprintf(stderr, "ERROR allocating reftable: %s\n",
                        strerror(-reftable_offset));
                res->check_errors++;
                ret = reftable_offset;
                goto fail;
            }
        }

        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
                                            s->cluster_size);
        if (ret < 0) {
            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
            goto fail;
        }

        /* The size of *refcount_table is always cluster-aligned, therefore the
         * write operation will not overflow */
        on_disk_refblock = (void *)((char *) *refcount_table +
                                    refblock_index * s->cluster_size);

        ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE,
                         on_disk_refblock, s->cluster_sectors);
        if (ret < 0) {
            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
            goto fail;
        }

        /* Go to the end of this refblock */
        cluster = refblock_start + s->refcount_block_size - 1;
    }

    /* If the reftable could not be squeezed in above (its previously chosen
     * offset was invalidated by a reftable resize), allocate it past all
     * refblocks and write the refblocks for that region too */
    if (reftable_offset < 0) {
        uint64_t post_refblock_start, reftable_clusters;

        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
        reftable_clusters = size_to_clusters(s,
                                             reftable_size * sizeof(uint64_t));
        /* Not pretty but simple */
        if (first_free_cluster < post_refblock_start) {
            first_free_cluster = post_refblock_start;
        }
        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                              refcount_table, nb_clusters,
                                              &first_free_cluster);
        if (reftable_offset < 0) {
            fprintf(stderr, "ERROR allocating reftable: %s\n",
                    strerror(-reftable_offset));
            res->check_errors++;
            ret = reftable_offset;
            goto fail;
        }

        goto write_refblocks;
    }

    /* Convert the reftable to big-endian for the on-disk write */
    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
        cpu_to_be64s(&on_disk_reftable[refblock_index]);
    }

    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
                                        reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
        goto fail;
    }

    assert(reftable_size < INT_MAX / sizeof(uint64_t));
    ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
                      reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
        goto fail;
    }

    /* Enter new reftable into the image header */
    reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
    reftable_offset_and_clusters.reftable_clusters =
        cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t)));
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &reftable_offset_and_clusters,
                           sizeof(reftable_offset_and_clusters));
    if (ret < 0) {
        fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
        goto fail;
    }

    /* Convert back to host endianness and install the new reftable as the
     * in-memory copy (ownership passes to s; do not free it here) */
    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
        be64_to_cpus(&on_disk_reftable[refblock_index]);
    }
    s->refcount_table = on_disk_reftable;
    s->refcount_table_offset = reftable_offset;
    s->refcount_table_size = reftable_size;
    update_max_refcount_table_index(s);

    return 0;

fail:
    g_free(on_disk_reftable);
    return ret;
}
2309
2310/*
2311 * Checks an image for refcount consistency.
2312 *
2313 * Returns 0 if no errors are found, the number of errors in case the image is
2314 * detected as corrupted, and -errno when an internal error occurred.
2315 */
int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                          BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    BdrvCheckResult pre_compare_res;
    int64_t size, highest_cluster, nb_clusters;
    void *refcount_table = NULL;
    bool rebuild = false;
    int ret;

    size = bdrv_getlength(bs->file->bs);
    if (size < 0) {
        res->check_errors++;
        return size;
    }

    nb_clusters = size_to_clusters(s, size);
    if (nb_clusters > INT_MAX) {
        res->check_errors++;
        return -EFBIG;
    }

    res->bfi.total_clusters =
        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);

    /* First pass: recalculate all refcounts into an in-memory table */
    ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
                              &nb_clusters);
    if (ret < 0) {
        goto fail;
    }

    /* In case we don't need to rebuild the refcount structure (but want to fix
     * something), this function is immediately called again, in which case the
     * result should be ignored */
    pre_compare_res = *res;
    compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
                      nb_clusters);

    if (rebuild && (fix & BDRV_FIX_ERRORS)) {
        /* The refcount structures are beyond in-place repair: rebuild them
         * from scratch from the in-memory table */
        BdrvCheckResult old_res = *res;
        int fresh_leaks = 0;

        fprintf(stderr, "Rebuilding refcount structure\n");
        ret = rebuild_refcount_structure(bs, res, &refcount_table,
                                         &nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        res->corruptions = 0;
        res->leaks = 0;

        /* Because the old reftable has been exchanged for a new one the
         * references have to be recalculated */
        rebuild = false;
        memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
        ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
                                  &nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        if (fix & BDRV_FIX_LEAKS) {
            /* The old refcount structures are now leaked, fix it; the result
             * can be ignored, aside from leaks which were introduced by
             * rebuild_refcount_structure() that could not be fixed */
            BdrvCheckResult saved_res = *res;
            *res = (BdrvCheckResult){ 0 };

            compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
                              &highest_cluster, refcount_table, nb_clusters);
            if (rebuild) {
                fprintf(stderr, "ERROR rebuilt refcount structure is still "
                        "broken\n");
            }

            /* Any leaks accounted for here were introduced by
             * rebuild_refcount_structure() because that function has created a
             * new refcount structure from scratch */
            fresh_leaks = res->leaks;
            *res = saved_res;
        }

        /* Problems that disappeared across the rebuild count as fixed */
        if (res->corruptions < old_res.corruptions) {
            res->corruptions_fixed += old_res.corruptions - res->corruptions;
        }
        if (res->leaks < old_res.leaks) {
            res->leaks_fixed += old_res.leaks - res->leaks;
        }
        res->leaks += fresh_leaks;
    } else if (fix) {
        /* A rebuild is needed but was not permitted (BDRV_FIX_ERRORS not
         * set): report as an unfixable error */
        if (rebuild) {
            fprintf(stderr, "ERROR need to rebuild refcount structures\n");
            res->check_errors++;
            ret = -EIO;
            goto fail;
        }

        /* Re-run the comparison, this time actually applying the requested
         * fixes; restore the pre-comparison counters first so problems are
         * not counted twice */
        if (res->leaks || res->corruptions) {
            *res = pre_compare_res;
            compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
                              refcount_table, nb_clusters);
        }
    }

    /* check OFLAG_COPIED */
    ret = check_oflag_copied(bs, res, fix);
    if (ret < 0) {
        goto fail;
    }

    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
    ret = 0;

fail:
    g_free(refcount_table);

    return ret;
}
2435
/* Helper for qcow2_check_metadata_overlap() below: tests whether the metadata
 * range [ofs, ofs + sz) overlaps the candidate range given by the enclosing
 * function's `offset` and `size` parameters (captured implicitly) */
#define overlaps_with(ofs, sz) \
    ranges_overlap(offset, size, ofs, sz)
2438
2439/*
2440 * Checks if the given offset into the image file is actually free to use by
2441 * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2442 * i.e. a sanity check without relying on the refcount tables.
2443 *
2444 * The ign parameter specifies what checks not to perform (being a bitmask of
2445 * QCow2MetadataOverlap values), i.e., what sections to ignore.
2446 *
2447 * Returns:
2448 * - 0 if writing to this offset will not affect the mentioned metadata
2449 * - a positive QCow2MetadataOverlap value indicating one overlapping section
2450 * - a negative value (-errno) indicating an error while performing a check,
2451 *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
2452 */
2453int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2454                                 int64_t size)
2455{
2456    BDRVQcow2State *s = bs->opaque;
2457    int chk = s->overlap_check & ~ign;
2458    int i, j;
2459
2460    if (!size) {
2461        return 0;
2462    }
2463
2464    if (chk & QCOW2_OL_MAIN_HEADER) {
2465        if (offset < s->cluster_size) {
2466            return QCOW2_OL_MAIN_HEADER;
2467        }
2468    }
2469
2470    /* align range to test to cluster boundaries */
2471    size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
2472    offset = start_of_cluster(s, offset);
2473
2474    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2475        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
2476            return QCOW2_OL_ACTIVE_L1;
2477        }
2478    }
2479
2480    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2481        if (overlaps_with(s->refcount_table_offset,
2482            s->refcount_table_size * sizeof(uint64_t))) {
2483            return QCOW2_OL_REFCOUNT_TABLE;
2484        }
2485    }
2486
2487    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2488        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2489            return QCOW2_OL_SNAPSHOT_TABLE;
2490        }
2491    }
2492
2493    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2494        for (i = 0; i < s->nb_snapshots; i++) {
2495            if (s->snapshots[i].l1_size &&
2496                overlaps_with(s->snapshots[i].l1_table_offset,
2497                s->snapshots[i].l1_size * sizeof(uint64_t))) {
2498                return QCOW2_OL_INACTIVE_L1;
2499            }
2500        }
2501    }
2502
2503    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2504        for (i = 0; i < s->l1_size; i++) {
2505            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2506                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2507                s->cluster_size)) {
2508                return QCOW2_OL_ACTIVE_L2;
2509            }
2510        }
2511    }
2512
2513    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2514        unsigned last_entry = s->max_refcount_table_index;
2515        assert(last_entry < s->refcount_table_size);
2516        assert(last_entry + 1 == s->refcount_table_size ||
2517               (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
2518        for (i = 0; i <= last_entry; i++) {
2519            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2520                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2521                s->cluster_size)) {
2522                return QCOW2_OL_REFCOUNT_BLOCK;
2523            }
2524        }
2525    }
2526
2527    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2528        for (i = 0; i < s->nb_snapshots; i++) {
2529            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2530            uint32_t l1_sz  = s->snapshots[i].l1_size;
2531            uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
2532            uint64_t *l1 = g_try_malloc(l1_sz2);
2533            int ret;
2534
2535            if (l1_sz2 && l1 == NULL) {
2536                return -ENOMEM;
2537            }
2538
2539            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
2540            if (ret < 0) {
2541                g_free(l1);
2542                return ret;
2543            }
2544
2545            for (j = 0; j < l1_sz; j++) {
2546                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2547                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2548                    g_free(l1);
2549                    return QCOW2_OL_INACTIVE_L2;
2550                }
2551            }
2552
2553            g_free(l1);
2554        }
2555    }
2556
2557    return 0;
2558}
2559
/* Human-readable names for the metadata sections, indexed by the
 * QCOW2_OL_*_BITNR bit numbers; used for the corruption message emitted by
 * qcow2_pre_write_overlap_check() */
static const char *metadata_ol_names[] = {
    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
};
2570
2571/*
2572 * First performs a check for metadata overlaps (through
2573 * qcow2_check_metadata_overlap); if that fails with a negative value (error
2574 * while performing a check), that value is returned. If an impending overlap
2575 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2576 * and -EIO returned.
2577 *
2578 * Returns 0 if there were neither overlaps nor errors while checking for
2579 * overlaps; or a negative value (-errno) on error.
2580 */
2581int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2582                                  int64_t size)
2583{
2584    int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2585
2586    if (ret < 0) {
2587        return ret;
2588    } else if (ret > 0) {
2589        int metadata_ol_bitnr = ctz32(ret);
2590        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2591
2592        qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2593                                "write on metadata (overlaps with %s)",
2594                                metadata_ol_names[metadata_ol_bitnr]);
2595        return -EIO;
2596    }
2597
2598    return 0;
2599}
2600
/* A pointer to a function of this type is given to walk_over_reftable(). That
 * function will create refblocks and pass them to a RefblockFinishOp once they
 * are completed (@refblock). @refblock_empty is set if the refblock is
 * completely empty (i.e. all its refcounts are zero).
 *
 * Along with the refblock, a corresponding reftable entry is passed, in the
 * reftable @reftable (which may be reallocated by the op) at @reftable_index;
 * @reftable_size is updated accordingly.
 *
 * @allocated should be set to true if a new cluster has been allocated.
 * Returns 0 on success or a negative errno (with @errp set) on failure. */
typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
                               uint64_t reftable_index, uint64_t *reftable_size,
                               void *refblock, bool refblock_empty,
                               bool *allocated, Error **errp);
2615
2616/**
2617 * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2618 * it is not empty) and inserts its offset into the new reftable. The size of
2619 * this new reftable is increased as required.
2620 */
2621static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2622                          uint64_t reftable_index, uint64_t *reftable_size,
2623                          void *refblock, bool refblock_empty, bool *allocated,
2624                          Error **errp)
2625{
2626    BDRVQcow2State *s = bs->opaque;
2627    int64_t offset;
2628
2629    if (!refblock_empty && reftable_index >= *reftable_size) {
2630        uint64_t *new_reftable;
2631        uint64_t new_reftable_size;
2632
2633        new_reftable_size = ROUND_UP(reftable_index + 1,
2634                                     s->cluster_size / sizeof(uint64_t));
2635        if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
2636            error_setg(errp,
2637                       "This operation would make the refcount table grow "
2638                       "beyond the maximum size supported by QEMU, aborting");
2639            return -ENOTSUP;
2640        }
2641
2642        new_reftable = g_try_realloc(*reftable, new_reftable_size *
2643                                                sizeof(uint64_t));
2644        if (!new_reftable) {
2645            error_setg(errp, "Failed to increase reftable buffer size");
2646            return -ENOMEM;
2647        }
2648
2649        memset(new_reftable + *reftable_size, 0,
2650               (new_reftable_size - *reftable_size) * sizeof(uint64_t));
2651
2652        *reftable      = new_reftable;
2653        *reftable_size = new_reftable_size;
2654    }
2655
2656    if (!refblock_empty && !(*reftable)[reftable_index]) {
2657        offset = qcow2_alloc_clusters(bs, s->cluster_size);
2658        if (offset < 0) {
2659            error_setg_errno(errp, -offset, "Failed to allocate refblock");
2660            return offset;
2661        }
2662        (*reftable)[reftable_index] = offset;
2663        *allocated = true;
2664    }
2665
2666    return 0;
2667}
2668
2669/**
2670 * This "operation" for walk_over_reftable() writes the refblock to disk at the
2671 * offset specified by the new reftable's entry. It does not modify the new
2672 * reftable or change any refcounts.
2673 */
2674static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2675                          uint64_t reftable_index, uint64_t *reftable_size,
2676                          void *refblock, bool refblock_empty, bool *allocated,
2677                          Error **errp)
2678{
2679    BDRVQcow2State *s = bs->opaque;
2680    int64_t offset;
2681    int ret;
2682
2683    if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2684        offset = (*reftable)[reftable_index];
2685
2686        ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
2687        if (ret < 0) {
2688            error_setg_errno(errp, -ret, "Overlap check failed");
2689            return ret;
2690        }
2691
2692        ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
2693        if (ret < 0) {
2694            error_setg_errno(errp, -ret, "Failed to write refblock");
2695            return ret;
2696        }
2697    } else {
2698        assert(refblock_empty);
2699    }
2700
2701    return 0;
2702}
2703
2704/**
2705 * This function walks over the existing reftable and every referenced refblock;
2706 * if @new_set_refcount is non-NULL, it is called for every refcount entry to
2707 * create an equal new entry in the passed @new_refblock. Once that
2708 * @new_refblock is completely filled, @operation will be called.
2709 *
2710 * @status_cb and @cb_opaque are used for the amend operation's status callback.
2711 * @index is the index of the walk_over_reftable() calls and @total is the total
2712 * number of walk_over_reftable() calls per amend operation. Both are used for
2713 * calculating the parameters for the status callback.
2714 *
2715 * @allocated is set to true if a new cluster has been allocated.
2716 */
2717static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
2718                              uint64_t *new_reftable_index,
2719                              uint64_t *new_reftable_size,
2720                              void *new_refblock, int new_refblock_size,
2721                              int new_refcount_bits,
2722                              RefblockFinishOp *operation, bool *allocated,
2723                              Qcow2SetRefcountFunc *new_set_refcount,
2724                              BlockDriverAmendStatusCB *status_cb,
2725                              void *cb_opaque, int index, int total,
2726                              Error **errp)
2727{
2728    BDRVQcow2State *s = bs->opaque;
2729    uint64_t reftable_index;
2730    bool new_refblock_empty = true;
2731    int refblock_index;
2732    int new_refblock_index = 0;
2733    int ret;
2734
2735    for (reftable_index = 0; reftable_index < s->refcount_table_size;
2736         reftable_index++)
2737    {
2738        uint64_t refblock_offset = s->refcount_table[reftable_index]
2739                                 & REFT_OFFSET_MASK;
2740
2741        status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
2742                  (uint64_t)total * s->refcount_table_size, cb_opaque);
2743
2744        if (refblock_offset) {
2745            void *refblock;
2746
2747            if (offset_into_cluster(s, refblock_offset)) {
2748                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
2749                                        PRIx64 " unaligned (reftable index: %#"
2750                                        PRIx64 ")", refblock_offset,
2751                                        reftable_index);
2752                error_setg(errp,
2753                           "Image is corrupt (unaligned refblock offset)");
2754                return -EIO;
2755            }
2756
2757            ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
2758                                  &refblock);
2759            if (ret < 0) {
2760                error_setg_errno(errp, -ret, "Failed to retrieve refblock");
2761                return ret;
2762            }
2763
2764            for (refblock_index = 0; refblock_index < s->refcount_block_size;
2765                 refblock_index++)
2766            {
2767                uint64_t refcount;
2768
2769                if (new_refblock_index >= new_refblock_size) {
2770                    /* new_refblock is now complete */
2771                    ret = operation(bs, new_reftable, *new_reftable_index,
2772                                    new_reftable_size, new_refblock,
2773                                    new_refblock_empty, allocated, errp);
2774                    if (ret < 0) {
2775                        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2776                        return ret;
2777                    }
2778
2779                    (*new_reftable_index)++;
2780                    new_refblock_index = 0;
2781                    new_refblock_empty = true;
2782                }
2783
2784                refcount = s->get_refcount(refblock, refblock_index);
2785                if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
2786                    uint64_t offset;
2787
2788                    qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2789
2790                    offset = ((reftable_index << s->refcount_block_bits)
2791                              + refblock_index) << s->cluster_bits;
2792
2793                    error_setg(errp, "Cannot decrease refcount entry width to "
2794                               "%i bits: Cluster at offset %#" PRIx64 " has a "
2795                               "refcount of %" PRIu64, new_refcount_bits,
2796                               offset, refcount);
2797                    return -EINVAL;
2798                }
2799
2800                if (new_set_refcount) {
2801                    new_set_refcount(new_refblock, new_refblock_index++,
2802                                     refcount);
2803                } else {
2804                    new_refblock_index++;
2805                }
2806                new_refblock_empty = new_refblock_empty && refcount == 0;
2807            }
2808
2809            qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2810        } else {
2811            /* No refblock means every refcount is 0 */
2812            for (refblock_index = 0; refblock_index < s->refcount_block_size;
2813                 refblock_index++)
2814            {
2815                if (new_refblock_index >= new_refblock_size) {
2816                    /* new_refblock is now complete */
2817                    ret = operation(bs, new_reftable, *new_reftable_index,
2818                                    new_reftable_size, new_refblock,
2819                                    new_refblock_empty, allocated, errp);
2820                    if (ret < 0) {
2821                        return ret;
2822                    }
2823
2824                    (*new_reftable_index)++;
2825                    new_refblock_index = 0;
2826                    new_refblock_empty = true;
2827                }
2828
2829                if (new_set_refcount) {
2830                    new_set_refcount(new_refblock, new_refblock_index++, 0);
2831                } else {
2832                    new_refblock_index++;
2833                }
2834            }
2835        }
2836    }
2837
2838    if (new_refblock_index > 0) {
2839        /* Complete the potentially existing partially filled final refblock */
2840        if (new_set_refcount) {
2841            for (; new_refblock_index < new_refblock_size;
2842                 new_refblock_index++)
2843            {
2844                new_set_refcount(new_refblock, new_refblock_index, 0);
2845            }
2846        }
2847
2848        ret = operation(bs, new_reftable, *new_reftable_index,
2849                        new_reftable_size, new_refblock, new_refblock_empty,
2850                        allocated, errp);
2851        if (ret < 0) {
2852            return ret;
2853        }
2854
2855        (*new_reftable_index)++;
2856    }
2857
2858    status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
2859              (uint64_t)total * s->refcount_table_size, cb_opaque);
2860
2861    return 0;
2862}
2863
/**
 * Changes the width of the image's refcount entries to 2^@refcount_order
 * bits by building a complete new set of refblocks plus a new reftable, then
 * atomically (from the on-disk format's perspective) switching the image
 * header over to the new reftable.
 *
 * Phases:
 *   1. Repeatedly walk the old reftable with alloc_refblock() until no new
 *      allocations occur (allocations change refcounts, which may in turn
 *      require more refblocks, hence the loop).
 *   2. Walk again with flush_refblock() to write the new refblocks.
 *   3. Write the new reftable, flush caches, update the header (with
 *      rollback of the in-memory fields if that fails), then swap the
 *      in-memory state over and free all old refcount structures.
 *
 * @status_cb/@cb_opaque report amend progress. Returns 0 on success or a
 * negative errno (with @errp set) on failure; on failure all newly
 * allocated clusters are freed again under the "done" label.
 */
int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                BlockDriverAmendStatusCB *status_cb,
                                void *cb_opaque, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2GetRefcountFunc *new_get_refcount;
    Qcow2SetRefcountFunc *new_set_refcount;
    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
    uint64_t *new_reftable = NULL, new_reftable_size = 0;
    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
    uint64_t new_reftable_index = 0;
    uint64_t i;
    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
    int old_refcount_order;
    int walk_index = 0;
    int ret;
    bool new_allocation;

    /* Changing the refcount width requires a v3 image; width is 1..64 bits */
    assert(s->qcow_version >= 3);
    assert(refcount_order >= 0 && refcount_order <= 6);

    /* see qcow2_open() */
    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));

    new_get_refcount = get_refcount_funcs[refcount_order];
    new_set_refcount = set_refcount_funcs[refcount_order];


    /* Phase 1: allocate refblocks until the refcount structures converge */
    do {
        int total_walks;

        new_allocation = false;

        /* At least we have to do this walk and the one which writes the
         * refblocks; also, at least we have to do this loop here at least
         * twice (normally), first to do the allocations, and second to
         * determine that everything is correctly allocated, this then makes
         * three walks in total */
        total_walks = MAX(walk_index + 2, 3);

        /* First, allocate the structures so they are present in the refcount
         * structures */
        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                                 &new_reftable_size, NULL, new_refblock_size,
                                 new_refcount_bits, &alloc_refblock,
                                 &new_allocation, NULL, status_cb, cb_opaque,
                                 walk_index++, total_walks, errp);
        if (ret < 0) {
            goto done;
        }

        new_reftable_index = 0;

        if (new_allocation) {
            /* The reftable may have grown; discard the previous on-disk
             * allocation (if any) and allocate a fresh one of the right size.
             * QCOW2_DISCARD_NEVER: these clusters may be reused right away */
            if (new_reftable_offset) {
                qcow2_free_clusters(bs, new_reftable_offset,
                                    allocated_reftable_size * sizeof(uint64_t),
                                    QCOW2_DISCARD_NEVER);
            }

            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
                                                           sizeof(uint64_t));
            if (new_reftable_offset < 0) {
                error_setg_errno(errp, -new_reftable_offset,
                                 "Failed to allocate the new reftable");
                ret = new_reftable_offset;
                goto done;
            }
            allocated_reftable_size = new_reftable_size;
        }
    } while (new_allocation);

    /* Second, write the new refblocks */
    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                             &new_reftable_size, new_refblock,
                             new_refblock_size, new_refcount_bits,
                             &flush_refblock, &new_allocation, new_set_refcount,
                             status_cb, cb_opaque, walk_index, walk_index + 1,
                             errp);
    if (ret < 0) {
        goto done;
    }
    /* Phase 1 converged, so flushing must not have allocated anything */
    assert(!new_allocation);


    /* Write the new reftable */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
                                        new_reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Overlap check failed");
        goto done;
    }

    /* Convert to big-endian in place for the write, then back again so the
     * buffer stays usable as the in-memory reftable below */
    for (i = 0; i < new_reftable_size; i++) {
        cpu_to_be64s(&new_reftable[i]);
    }

    ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
                      new_reftable_size * sizeof(uint64_t));

    for (i = 0; i < new_reftable_size; i++) {
        be64_to_cpus(&new_reftable[i]);
    }

    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to write the new reftable");
        goto done;
    }


    /* Empty the refcount cache */
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
        goto done;
    }

    /* Update the image header to point to the new reftable; this only updates
     * the fields which are relevant to qcow2_update_header(); other fields
     * such as s->refcount_table or s->refcount_bits stay stale for now
     * (because we have to restore everything if qcow2_update_header() fails) */
    old_refcount_order  = s->refcount_order;
    old_reftable_size   = s->refcount_table_size;
    old_reftable_offset = s->refcount_table_offset;

    s->refcount_order        = refcount_order;
    s->refcount_table_size   = new_reftable_size;
    s->refcount_table_offset = new_reftable_offset;

    ret = qcow2_update_header(bs);
    if (ret < 0) {
        /* Roll back so the in-memory state matches the unchanged header */
        s->refcount_order        = old_refcount_order;
        s->refcount_table_size   = old_reftable_size;
        s->refcount_table_offset = old_reftable_offset;
        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
        goto done;
    }

    /* Now update the rest of the in-memory information */
    old_reftable = s->refcount_table;
    s->refcount_table = new_reftable;
    update_max_refcount_table_index(s);

    s->refcount_bits = 1 << refcount_order;
    /* refcount_max = 2^refcount_bits - 1, computed in two steps to avoid
     * overflowing the shift for 64-bit refcounts */
    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
    s->refcount_max += s->refcount_max - 1;

    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
    s->refcount_block_size = 1 << s->refcount_block_bits;

    s->get_refcount = new_get_refcount;
    s->set_refcount = new_set_refcount;

    /* For cleaning up all old refblocks and the old reftable below the "done"
     * label */
    new_reftable        = old_reftable;
    new_reftable_size   = old_reftable_size;
    new_reftable_offset = old_reftable_offset;

done:
    if (new_reftable) {
        /* On success, new_reftable actually points to the old reftable (and
         * new_reftable_size is the old reftable's size); but that is just
         * fine */
        for (i = 0; i < new_reftable_size; i++) {
            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
            if (offset) {
                qcow2_free_clusters(bs, offset, s->cluster_size,
                                    QCOW2_DISCARD_OTHER);
            }
        }
        g_free(new_reftable);

        if (new_reftable_offset > 0) {
            qcow2_free_clusters(bs, new_reftable_offset,
                                new_reftable_size * sizeof(uint64_t),
                                QCOW2_DISCARD_OTHER);
        }
    }

    qemu_vfree(new_refblock);
    return ret;
}
3048