qemu/block/qcow2-refcount.c
/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qcow2.h"
#include "qemu/range.h"
#include "qemu/bswap.h"
#include "qemu/cutils.h"
#include "qemu/memalign.h"
#include "trace.h"

static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                    uint64_t max);

G_GNUC_WARN_UNUSED_RESULT
static int update_refcount(BlockDriverState *bs,
                           int64_t offset, int64_t length, uint64_t addend,
                           bool decrease, enum qcow2_discard_type type);

static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value);


static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};

static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};
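
/*
 * Both tables are indexed by s->refcount_order, i.e. entry i implements
 * refcounts that are (1 << i) bits wide. For example, the qcow2 default of
 * refcount_order = 4 selects the _ro4 variants, which operate on 16-bit
 * big-endian entries.
 */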


/*********************************************************/
/* refcount handling */

static void update_max_refcount_table_index(BDRVQcow2State *s)
{
    unsigned i = s->refcount_table_size - 1;
    while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
        i--;
    }
    /* Set s->max_refcount_table_index to the index of the last used entry */
    s->max_refcount_table_index = i;
}

int coroutine_fn qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    assert(s->refcount_order >= 0 && s->refcount_order <= 6);

    s->get_refcount = get_refcount_funcs[s->refcount_order];
    s->set_refcount = set_refcount_funcs[s->refcount_order];

    assert(s->refcount_table_size <= INT_MAX / REFTABLE_ENTRY_SIZE);
    refcount_table_size2 = s->refcount_table_size * REFTABLE_ENTRY_SIZE;
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_co_pread(bs->file, s->refcount_table_offset,
                            refcount_table_size2, s->refcount_table, 0);
        if (ret < 0) {
            goto fail;
        }
        for (i = 0; i < s->refcount_table_size; i++) {
            be64_to_cpus(&s->refcount_table[i]);
        }
        update_max_refcount_table_index(s);
    }
    return 0;
 fail:
    return ret;
}

void qcow2_refcount_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    g_free(s->refcount_table);
}


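/*
 * Accessors for every supported refcount entry width follow. For the
 * sub-byte widths (_ro0 to _ro2), entries are packed into each byte
 * starting at the least significant bit. Worked example for _ro1 (2-bit
 * entries): index 5 lives in byte 5 / 4 = 1 at bit offset 2 * (5 % 4) = 2,
 * i.e. in bits 3:2 of that byte.
 */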
static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
}

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 1));
    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
}

static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
           & 0x3;
}

static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 2));
    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
}

static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
           & 0xf;
}

static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 4));
    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
}

static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    return ((const uint8_t *)refcount_array)[index];
}

static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 8));
    ((uint8_t *)refcount_array)[index] = value;
}

static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
}

static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 16));
    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
}

static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
}

static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 32));
    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
}

static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
}

static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
}


static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset,
                               void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                           refcount_block);
}

/*
 * Retrieves the refcount of the cluster given by its index and stores it in
 * *refcount. Returns 0 on success and -errno on failure.
 */
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
                       uint64_t *refcount)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    void *refcount_block;

    refcount_table_index = cluster_index >> s->refcount_block_bits;
    if (refcount_table_index >= s->refcount_table_size) {
        *refcount = 0;
        return 0;
    }
    refcount_block_offset =
        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    if (!refcount_block_offset) {
        *refcount = 0;
        return 0;
    }

    if (offset_into_cluster(s, refcount_block_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
                                " unaligned (reftable index: %#" PRIx64 ")",
                                refcount_block_offset, refcount_table_index);
        return -EIO;
    }

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                          &refcount_block);
    if (ret < 0) {
        return ret;
    }

    block_index = cluster_index & (s->refcount_block_size - 1);
    *refcount = s->get_refcount(refcount_block, block_index);

    qcow2_cache_put(s->refcount_block_cache, &refcount_block);

    return 0;
}

/* Checks if two offsets are described by the same refcount block */
static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
    uint64_t offset_b)
{
    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);

    return (block_a == block_b);
}
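
/*
 * Worked example (assuming 64 kB clusters, i.e. cluster_bits = 16, and
 * 16-bit refcounts, i.e. refcount_block_bits = 15): one refcount block
 * covers 2^(16 + 15) bytes = 2 GiB of host file range, so two offsets lie
 * in the same refcount block iff they agree in all bits from bit 31 up.
 */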

/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int64_t ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

            return load_refcount_block(bs, refcount_block_offset,
                                       refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something: at least a cluster for
     * the new refcount block. It may also include a new refcount table if the
     * old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(); it would try to
     *   increase the refcount and we would very likely end up in endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe themselves.
     *
     * - We need to consider that at this point we are inside update_refcount()
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref() and qcow2_free_clusters() may load a different
     *   refcount block into the cache.
     */

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX);
    if (new_block < 0) {
        return new_block;
    }

    /* The offset must fit in the offset field of the refcount table entry */
    assert((new_block & REFT_OFFSET_MASK) == new_block);

    /* If we're allocating the block at offset 0 then something is wrong */
    if (new_block == 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                "allocation of refcount block at offset 0");
        return -EIO;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
                               refcount_table_index * REFTABLE_ENTRY_SIZE,
                               sizeof(data64), &data64, 0);
        if (ret < 0) {
            goto fail;
        }

        s->refcount_table[refcount_table_index] = new_block;
        /* If there's a hole in s->refcount_table then it can happen
         * that refcount_table_index < s->max_refcount_table_index */
        s->max_refcount_table_index =
            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
        s->cluster_size;

    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
                              refcount_table_index, new_block);
    if (ret < 0) {
        return ret;
    }

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail:
    if (*refcount_block != NULL) {
        qcow2_cache_put(s->refcount_block_cache, refcount_block);
    }
    return ret;
}
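
/*
 * Note the -EAGAIN contract above: a caller that is in the middle of a
 * cluster allocation must treat -EAGAIN as "metadata may have taken the
 * clusters you were about to use, restart the free cluster search". This
 * is why qcow2_alloc_clusters() below wraps its update_refcount() call in
 * a do { ... } while (ret == -EAGAIN) loop instead of failing.
 */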

/*
 * Starting at @start_offset, this function creates new self-covering refcount
 * structures: A new refcount table and refcount blocks which cover all of
 * themselves, and a number of @additional_clusters beyond their end.
 * @start_offset must be at the end of the image file, that is, there must be
 * only empty space beyond it.
 * If @exact_size is false, the refcount table will have 50% more entries than
 * necessary so it will not need to grow again soon.
 * If @new_refblock_offset is not zero, it contains the offset of a refcount
 * block that should be entered into the new refcount table at index
 * @new_refblock_index.
 *
 * Returns: The offset after the new refcount structures (i.e. where the
 *          @additional_clusters may be placed) on success, -errno on error.
 */
int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                            uint64_t additional_clusters, bool exact_size,
                            int new_refblock_index,
                            uint64_t new_refblock_offset)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t total_refblock_count_u64, additional_refblock_count;
    int total_refblock_count, table_size, area_reftable_index, table_clusters;
    int i;
    uint64_t table_offset, block_offset, end_offset;
    int ret;
    uint64_t *new_table;

    assert(!(start_offset % s->cluster_size));

    qcow2_refcount_metadata_size(start_offset / s->cluster_size +
                                 additional_clusters,
                                 s->cluster_size, s->refcount_order,
                                 !exact_size, &total_refblock_count_u64);
    if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }
    total_refblock_count = total_refblock_count_u64;

    /* Index in the refcount table of the first refcount block to cover the area
     * of refcount structures we are about to create; we know that
     * @total_refblock_count can cover @start_offset, so this will definitely
     * fit into an int. */
    area_reftable_index = (start_offset / s->cluster_size) /
                          s->refcount_block_size;

    if (exact_size) {
        table_size = total_refblock_count;
    } else {
        table_size = total_refblock_count +
                     DIV_ROUND_UP(total_refblock_count, 2);
    }
    /* The qcow2 file can only store the reftable size in number of clusters */
    table_size = ROUND_UP(table_size, s->cluster_size / REFTABLE_ENTRY_SIZE);
    table_clusters = (table_size * REFTABLE_ENTRY_SIZE) / s->cluster_size;

    if (table_size > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }

    new_table = g_try_new0(uint64_t, table_size);

    assert(table_size > 0);
    if (new_table == NULL) {
        ret = -ENOMEM;
        goto fail;
    }

    /* Fill the new refcount table */
    if (table_size > s->max_refcount_table_index) {
        /* We're actually growing the reftable */
        memcpy(new_table, s->refcount_table,
               (s->max_refcount_table_index + 1) * REFTABLE_ENTRY_SIZE);
    } else {
        /* Improbable case: We're shrinking the reftable. However, the caller
         * has assured us that there is only empty space beyond @start_offset,
         * so we can simply drop all of the refblocks that won't fit into the
         * new reftable. */
        memcpy(new_table, s->refcount_table, table_size * REFTABLE_ENTRY_SIZE);
    }

    if (new_refblock_offset) {
        assert(new_refblock_index < total_refblock_count);
        new_table[new_refblock_index] = new_refblock_offset;
    }

    /* Count how many new refblocks we have to create */
    additional_refblock_count = 0;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        if (!new_table[i]) {
            additional_refblock_count++;
        }
    }

    table_offset = start_offset + additional_refblock_count * s->cluster_size;
    end_offset = table_offset + table_clusters * s->cluster_size;

    /* Fill the refcount blocks, and create new ones, if necessary */
    block_offset = start_offset;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        void *refblock_data;
        uint64_t first_offset_covered;

        /* Reuse an existing refblock if possible, create a new one otherwise */
        if (new_table[i]) {
            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
                                  &refblock_data);
            if (ret < 0) {
                goto fail;
            }
        } else {
            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
                                        block_offset, &refblock_data);
            if (ret < 0) {
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
            block_offset += s->cluster_size;
        }

        /* First host offset covered by this refblock */
        first_offset_covered = (uint64_t)i * s->refcount_block_size *
                               s->cluster_size;
        if (first_offset_covered < end_offset) {
            int j, end_index;

            /* Set the refcount of all of the new refcount structures to 1 */

            if (first_offset_covered < start_offset) {
                assert(i == area_reftable_index);
                j = (start_offset - first_offset_covered) / s->cluster_size;
                assert(j < s->refcount_block_size);
            } else {
                j = 0;
            }

            end_index = MIN((end_offset - first_offset_covered) /
                            s->cluster_size,
                            s->refcount_block_size);

            for (; j < end_index; j++) {
                /* The caller guaranteed us this space would be empty */
                assert(s->get_refcount(refblock_data, j) == 0);
                s->set_refcount(refblock_data, j, 1);
            }

            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);
        }

        qcow2_cache_put(s->refcount_block_cache, &refblock_data);
    }

    assert(block_offset == table_offset);

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Write refcount table to disk */
    for (i = 0; i < total_refblock_count; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset,
                           table_size * REFTABLE_ENTRY_SIZE, new_table, 0);
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < total_refblock_count; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           sizeof(data), &data, 0);
    if (ret < 0) {
        goto fail;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset,
                        old_table_size * REFTABLE_ENTRY_SIZE,
                        QCOW2_DISCARD_OTHER);

    return end_offset;

fail:
    g_free(new_table);
    return ret;
}

void qcow2_process_discards(BlockDriverState *bs, int ret)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *next;

    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
        QTAILQ_REMOVE(&s->discards, d, next);

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
            int r2 = bdrv_pdiscard(bs->file, d->offset, d->bytes);
            if (r2 < 0) {
                trace_qcow2_process_discards_failed_region(d->offset, d->bytes,
                                                           r2);
            }
        }

        g_free(d);
    }
}

static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs     = bs,
        .offset = offset,
        .bytes  = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}
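
/*
 * Worked example: with pending discards [0x10000, +0x10000) and
 * [0x30000, +0x10000), queueing [0x20000, +0x10000) first extends the
 * former to [0x10000, +0x20000); the merge pass at the end of
 * update_refcount_discard() then folds in the latter, leaving a single
 * region [0x10000, +0x30000).
 */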

/* XXX: cache several refcount block clusters ? */
/* @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added */
static int update_refcount(BlockDriverState *bs,
                           int64_t offset,
                           int64_t length,
                           uint64_t addend,
                           bool decrease,
                           enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t start, last, cluster_offset;
    void *refcount_block = NULL;
    int64_t old_table_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
            addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    if (decrease) {
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
            s->l2_table_cache);
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
    for (cluster_offset = start; cluster_offset <= last;
         cluster_offset += s->cluster_size)
    {
        int block_index;
        uint64_t refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t table_index = cluster_index >> s->refcount_block_bits;

        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            if (refcount_block) {
                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
            }
            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            /* If the caller needs to restart the search for free clusters,
             * try the same ones first to see if they're still free. */
            if (ret == -EAGAIN) {
                if (s->free_cluster_index > (start >> s->cluster_bits)) {
                    s->free_cluster_index = (start >> s->cluster_bits);
                }
            }
            if (ret < 0) {
                goto fail;
            }
        }
        old_table_index = table_index;

        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index & (s->refcount_block_size - 1);

        refcount = s->get_refcount(refcount_block, block_index);
        if (decrease ? (refcount - addend > refcount)
                     : (refcount + addend < refcount ||
                        refcount + addend > s->refcount_max))
        {
            ret = -EINVAL;
            goto fail;
        }
        if (decrease) {
            refcount -= addend;
        } else {
            refcount += addend;
        }
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->set_refcount(refcount_block, block_index, refcount);

        if (refcount == 0) {
            void *table;

            table = qcow2_cache_is_table_offset(s->refcount_block_cache,
                                                offset);
            if (table != NULL) {
                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
                old_table_index = -1;
                qcow2_cache_discard(s->refcount_block_cache, table);
            }

            table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
            if (table != NULL) {
                qcow2_cache_discard(s->l2_table_cache, table);
            }

            if (s->discard_passthrough[type]) {
                update_refcount_discard(bs, cluster_offset, s->cluster_size);
            }
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        qcow2_cache_put(s->refcount_block_cache, &refcount_block);
    }

    /*
     * Try to undo any updates if an error is returned (this may succeed in
     * some cases, like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
                                !decrease, QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}
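
/*
 * The overflow check in update_refcount() relies on unsigned wraparound:
 * e.g. decreasing a refcount of 1 by an addend of 2 yields
 * 1 - 2 == UINT64_MAX, which is greater than the old value 1, so the
 * update is rejected with -EINVAL instead of wrapping.
 */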

/*
 * Increases or decreases the refcount of a given cluster.
 *
 * @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added.
 *
 * On success 0 is returned; on failure -errno is returned.
 */
int qcow2_update_cluster_refcount(BlockDriverState *bs,
                                  int64_t cluster_index,
                                  uint64_t addend, bool decrease,
                                  enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
                          decrease, type);
    if (ret < 0) {
        return ret;
    }

    return 0;
}



/*********************************************************/
/* cluster allocation functions */



/* return < 0 if error */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                    uint64_t max)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t i, nb_clusters, refcount;
    int ret;

    /* We can't allocate clusters if they may still be queued for discard. */
    if (s->cache_discards) {
        qcow2_process_discards(bs, 0);
    }

    nb_clusters = size_to_clusters(s, size);
retry:
    for (i = 0; i < nb_clusters; i++) {
        uint64_t next_cluster_index = s->free_cluster_index++;
        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);

        if (ret < 0) {
            return ret;
        } else if (refcount != 0) {
            goto retry;
        }
    }

    /* Make sure that all offsets in the "allocated" range are representable
     * in the requested max */
    if (s->free_cluster_index > 0 &&
        s->free_cluster_index - 1 > (max >> s->cluster_bits))
    {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
            size,
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
#endif
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
}
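
/*
 * Example of the retry logic above: if clusters 5 and 6 are free but
 * cluster 7 is in use, a three-cluster request scans 5..7, hits the
 * non-zero refcount at 7 and retries from 8; the returned offset is
 * always a contiguous free run ending at s->free_cluster_index - 1.
 */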

int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
{
    int64_t offset;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
    do {
        offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET);
        if (offset < 0) {
            return offset;
        }

        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return offset;
}

int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
                                int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_index, refcount;
    uint64_t i;
    int ret;

    assert(nb_clusters >= 0);
    if (nb_clusters == 0) {
        return 0;
    }

    do {
        /* Check how many clusters there are free */
        cluster_index = offset >> s->cluster_bits;
        for (i = 0; i < nb_clusters; i++) {
            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
            if (ret < 0) {
                return ret;
            } else if (refcount != 0) {
                break;
            }
        }

        /* And then allocate them */
        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
                              QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return i;
}

/* Only used to allocate compressed sectors. We try to allocate contiguous
 * sectors; size must be <= cluster_size. */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster;

            new_cluster = alloc_clusters_noref(bs, s->cluster_size,
                                               MIN(s->cluster_offset_mask,
                                                   QCOW_MAX_CLUSTER_OFFSET));
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (new_cluster == 0) {
                qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                        "allocation of compressed cluster "
                                        "at offset 0");
                return -EIO;
            }

            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}
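
/*
 * Example: with 64 kB clusters, two consecutive qcow2_alloc_bytes()
 * requests of 0x1800 bytes each can share one host cluster; the second
 * request starts at the s->free_byte_offset left by the first and merely
 * increments that cluster's refcount again instead of allocating a new
 * cluster.
 */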

void qcow2_free_clusters(BlockDriverState *bs,
                         int64_t offset, int64_t size,
                         enum qcow2_discard_type type)
{
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
    ret = update_refcount(bs, offset, size, 1, true, type);
    if (ret < 0) {
        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
        /* TODO Remember the clusters to free them later and avoid leaking */
    }
}

/*
 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
 * normal cluster, compressed cluster, etc.)
 */
void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
                            enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    QCow2ClusterType ctype = qcow2_get_cluster_type(bs, l2_entry);

    if (has_data_file(bs)) {
        if (s->discard_passthrough[type] &&
            (ctype == QCOW2_CLUSTER_NORMAL ||
             ctype == QCOW2_CLUSTER_ZERO_ALLOC))
        {
            bdrv_pdiscard(s->data_file, l2_entry & L2E_OFFSET_MASK,
                          s->cluster_size);
        }
        return;
    }

    switch (ctype) {
    case QCOW2_CLUSTER_COMPRESSED:
        {
            uint64_t coffset;
            int csize;

            qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
            qcow2_free_clusters(bs, coffset, csize, type);
        }
        break;
    case QCOW2_CLUSTER_NORMAL:
    case QCOW2_CLUSTER_ZERO_ALLOC:
        if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
            qcow2_signal_corruption(bs, false, -1, -1,
                                    "Cannot free unaligned cluster %#llx",
                                    l2_entry & L2E_OFFSET_MASK);
        } else {
            qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
                                s->cluster_size, type);
        }
        break;
    case QCOW2_CLUSTER_ZERO_PLAIN:
    case QCOW2_CLUSTER_UNALLOCATED:
        break;
    default:
        abort();
    }
}

int qcow2_write_caches(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    ret = qcow2_cache_write(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    if (qcow2_need_accurate_refcounts(s)) {
        ret = qcow2_cache_write(bs, s->refcount_block_cache);
        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}

int qcow2_flush_caches(BlockDriverState *bs)
{
    int ret = qcow2_write_caches(bs);
    if (ret < 0) {
        return ret;
    }

    return bdrv_flush(bs->file->bs);
}

/*********************************************************/
/* snapshots and image creation */



/* update the refcounts of snapshots and the copied flag */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_entry, old_l2_offset;
    unsigned slice, slice_size2, n_slices;
    int i, j, l1_modified = 0;
    int ret;

    assert(addend >= -1 && addend <= 1);

    l2_slice = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * L1E_SIZE;
    slice_size2 = s->l2_slice_size * l2_entry_size(s);
    n_slices = s->cluster_size / slice_size2;

    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        l1_table = g_try_malloc0(l1_size2);
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_size2, l1_table, 0);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            for (slice = 0; slice < n_slices; slice++) {
                ret = qcow2_cache_get(bs, s->l2_table_cache,
                                      l2_offset + slice * slice_size2,
                                      (void **) &l2_slice);
                if (ret < 0) {
                    goto fail;
                }

                for (j = 0; j < s->l2_slice_size; j++) {
                    uint64_t cluster_index;
                    uint64_t offset;

                    entry = get_l2_entry(s, l2_slice, j);
                    old_entry = entry;
                    entry &= ~QCOW_OFLAG_COPIED;
                    offset = entry & L2E_OFFSET_MASK;

                    switch (qcow2_get_cluster_type(bs, entry)) {
                    case QCOW2_CLUSTER_COMPRESSED:
                        if (addend != 0) {
                            uint64_t coffset;
                            int csize;

                            qcow2_parse_compressed_l2_entry(bs, entry,
                                                            &coffset, &csize);
                            ret = update_refcount(
                                bs, coffset, csize,
                                abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                        break;

                    case QCOW2_CLUSTER_NORMAL:
                    case QCOW2_CLUSTER_ZERO_ALLOC:
                        if (offset_into_cluster(s, offset)) {
                            /* Here l2_index means table (not slice) index */
                            int l2_index = slice * s->l2_slice_size + j;
                            qcow2_signal_corruption(
                                bs, true, -1, -1, "Cluster "
                                "allocation offset %#" PRIx64
                                " unaligned (L2 offset: %#"
                                PRIx64 ", L2 index: %#x)",
                                offset, l2_offset, l2_index);
                            ret = -EIO;
                            goto fail;
                        }

                        cluster_index = offset >> s->cluster_bits;
                        assert(cluster_index);
                        if (addend != 0) {
                            ret = qcow2_update_cluster_refcount(
                                bs, cluster_index, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }

                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                        if (ret < 0) {
                            goto fail;
                        }
                        break;

                    case QCOW2_CLUSTER_ZERO_PLAIN:
                    case QCOW2_CLUSTER_UNALLOCATED:
                        refcount = 0;
                        break;

                    default:
                        abort();
                    }

                    if (refcount == 1) {
                        entry |= QCOW_OFLAG_COPIED;
                    }
                    if (entry != old_entry) {
                        if (addend > 0) {
                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                                       s->refcount_block_cache);
                        }
                        set_l2_entry(s, l2_slice, j, entry);
                        qcow2_cache_entry_mark_dirty(s->l2_table_cache,
                                                     l2_slice);
                    }
                }

                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
            }

            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_slice) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }

    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_size2, l1_table,
                               0);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated) {
        g_free(l1_table);
    }
    return ret;
}




/*********************************************************/
/* refcount checking functions */


static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
{
    /* This assertion holds because there is no way we can address more than
     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
     * offsets have to be representable in bytes); due to every cluster
     * corresponding to one refcount entry, we are well below that limit */
    assert(entries < (UINT64_C(1) << (64 - 9)));

    /* Thanks to the assertion this will not overflow, because
     * s->refcount_order < 7.
     * (note: x << s->refcount_order == x * s->refcount_bits) */
    return DIV_ROUND_UP(entries << s->refcount_order, 8);
}
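
/*
 * Worked example: for 1000 entries at refcount_order = 4 (16-bit entries)
 * this returns DIV_ROUND_UP(1000 << 4, 8) = 2000 bytes, while
 * refcount_order = 0 (1-bit entries) gives DIV_ROUND_UP(1000, 8) = 125.
 */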
1470
1471/**
1472 * Reallocates *array so that it can hold new_size entries. *size must contain
1473 * the current number of entries in *array. If the reallocation fails, *array
1474 * and *size will not be modified and -errno will be returned. If the
1475 * reallocation is successful, *array will be set to the new buffer, *size
1476 * will be set to new_size and 0 will be returned. The size of the reallocated
1477 * refcount array buffer will be aligned to a cluster boundary, and the newly
1478 * allocated area will be zeroed.
1479 */
1480static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1481                                  int64_t *size, int64_t new_size)
1482{
1483    int64_t old_byte_size, new_byte_size;
1484    void *new_ptr;
1485
1486    /* Round to clusters so the array can be directly written to disk */
1487    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1488                    * s->cluster_size;
1489    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1490                    * s->cluster_size;
1491
1492    if (new_byte_size == old_byte_size) {
1493        *size = new_size;
1494        return 0;
1495    }
1496
1497    assert(new_byte_size > 0);
1498
1499    if (new_byte_size > SIZE_MAX) {
1500        return -ENOMEM;
1501    }
1502
1503    new_ptr = g_try_realloc(*array, new_byte_size);
1504    if (!new_ptr) {
1505        return -ENOMEM;
1506    }
1507
1508    if (new_byte_size > old_byte_size) {
1509        memset((char *)new_ptr + old_byte_size, 0,
1510               new_byte_size - old_byte_size);
1511    }
1512
1513    *array = new_ptr;
1514    *size  = new_size;
1515
1516    return 0;
1517}
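/*
 * Illustrative sketch (not upstream code): the cluster rounding performed
 * above, without the qcow2 state.  size_to_clusters() additionally relies
 * on the cluster size being a power of two; plain division is enough for
 * illustration here.
 */
static inline int64_t example_round_up_to_clusters(int64_t bytes,
                                                   int64_t cluster_size)
{
    /* e.g. 3000 bytes with 4096-byte clusters round up to 4096 */
    return DIV_ROUND_UP(bytes, cluster_size) * cluster_size;
}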
1518
1519/*
1520 * Increases the refcount for a range of clusters in a given refcount table.
1521 * This is used to construct a temporary refcount table out of L1 and L2 tables
1522 * which can be compared to the refcount table saved in the image.
1523 *
1524 * Modifies the number of errors in res.
1525 */
1526int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
1527                             void **refcount_table,
1528                             int64_t *refcount_table_size,
1529                             int64_t offset, int64_t size)
1530{
1531    BDRVQcow2State *s = bs->opaque;
1532    uint64_t start, last, cluster_offset, k, refcount;
1533    int64_t file_len;
1534    int ret;
1535
1536    if (size <= 0) {
1537        return 0;
1538    }
1539
1540    file_len = bdrv_getlength(bs->file->bs);
1541    if (file_len < 0) {
1542        return file_len;
1543    }
1544
1545    /*
1546     * The last cluster of a qcow2 image may be semi-allocated, so it may be
1547     * OK to reference some space past the end of the file, but by less than
1548     * one cluster.
1549     */
1550    if (offset + size - file_len >= s->cluster_size) {
1551        fprintf(stderr, "ERROR: counting reference for region exceeding the "
1552                "end of the file by one cluster or more: offset 0x%" PRIx64
1553                " size 0x%" PRIx64 "\n", offset, size);
1554        res->corruptions++;
1555        return 0;
1556    }
1557
1558    start = start_of_cluster(s, offset);
1559    last = start_of_cluster(s, offset + size - 1);
1560    for (cluster_offset = start; cluster_offset <= last;
1561        cluster_offset += s->cluster_size) {
1562        k = cluster_offset >> s->cluster_bits;
1563        if (k >= *refcount_table_size) {
1564            ret = realloc_refcount_array(s, refcount_table,
1565                                         refcount_table_size, k + 1);
1566            if (ret < 0) {
1567                res->check_errors++;
1568                return ret;
1569            }
1570        }
1571
1572        refcount = s->get_refcount(*refcount_table, k);
1573        if (refcount == s->refcount_max) {
1574            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1575                    "\n", cluster_offset);
1576            fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1577                    "width or qemu-img convert to create a clean copy if the "
1578                    "image cannot be opened for writing\n");
1579            res->corruptions++;
1580            continue;
1581        }
1582        s->set_refcount(*refcount_table, k, refcount + 1);
1583    }
1584
1585    return 0;
1586}
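/*
 * Illustrative sketch (not upstream code): how a byte range [offset,
 * offset + size) maps to the cluster indices visited by the loop above.
 * Assumes size > 0 and a cluster size of 1 << cluster_bits.
 */
static inline void example_range_to_cluster_indices(uint64_t offset,
                                                    uint64_t size,
                                                    unsigned cluster_bits,
                                                    uint64_t *first_k,
                                                    uint64_t *last_k)
{
    /* rounding down to a cluster boundary and shifting is one shift */
    *first_k = offset >> cluster_bits;
    /* the last byte of the range selects the last touched cluster */
    *last_k = (offset + size - 1) >> cluster_bits;
}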
1587
1588/* Flags for check_refcounts_l1() and check_refcounts_l2() */
1589enum {
1590    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1591};
1592
1593/*
1594 * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN (or making all its present
1595 * subclusters QCOW2_SUBCLUSTER_ZERO_PLAIN).
1596 *
1597 * This function decrements res->corruptions on success, so the caller is
1598 * responsible for incrementing res->corruptions prior to the call.
1599 *
1600 * On failure, the in-memory @l2_table may be modified.
1601 */
1602static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
1603                                uint64_t l2_offset,
1604                                uint64_t *l2_table, int l2_index, bool active,
1605                                bool *metadata_overlap)
1606{
1607    BDRVQcow2State *s = bs->opaque;
1608    int ret;
1609    int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t));
1610    uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s);
1611    int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2;
1612
1613    if (has_subclusters(s)) {
1614        uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, l2_index);
1615
1616        /* Allocated subclusters become zero */
1617        l2_bitmap |= l2_bitmap << 32;
1618        l2_bitmap &= QCOW_L2_BITMAP_ALL_ZEROES;
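        /*
         * This works because in extended L2 entries the low 32 bitmap
         * bits are the per-subcluster "allocated" flags and the high 32
         * bits the "reads as zero" flags: shifting the allocation bits
         * up by 32 and masking with QCOW_L2_BITMAP_ALL_ZEROES marks
         * every allocated subcluster as zero while clearing all
         * allocation bits.
         */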
1619
1620        set_l2_bitmap(s, l2_table, l2_index, l2_bitmap);
1621        set_l2_entry(s, l2_table, l2_index, 0);
1622    } else {
1623        set_l2_entry(s, l2_table, l2_index, QCOW_OFLAG_ZERO);
1624    }
1625
1626    ret = qcow2_pre_write_overlap_check(bs, ign, l2e_offset, l2_entry_size(s),
1627                                        false);
1628    if (metadata_overlap) {
1629        *metadata_overlap = ret < 0;
1630    }
1631    if (ret < 0) {
1632        fprintf(stderr, "ERROR: Overlap check failed\n");
1633        goto fail;
1634    }
1635
1636    ret = bdrv_pwrite_sync(bs->file, l2e_offset, l2_entry_size(s),
1637                           &l2_table[idx], 0);
1638    if (ret < 0) {
1639        fprintf(stderr, "ERROR: Failed to overwrite L2 "
1640                "table entry: %s\n", strerror(-ret));
1641        goto fail;
1642    }
1643
1644    res->corruptions--;
1645    res->corruptions_fixed++;
1646    return 0;
1647
1648fail:
1649    res->check_errors++;
1650    return ret;
1651}
1652
1653/*
1654 * Increases the refcount in the given refcount table for all clusters
1655 * referenced in the L2 table. While doing so, performs some checks on L2
1656 * entries.
1657 *
1658 * Returns 0 once the checks have completed (the number of errors found is
1659 * recorded in res) or -errno if an internal error occurred.
1660 */
1661static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1662                              void **refcount_table,
1663                              int64_t *refcount_table_size, int64_t l2_offset,
1664                              int flags, BdrvCheckMode fix, bool active)
1665{
1666    BDRVQcow2State *s = bs->opaque;
1667    uint64_t l2_entry, l2_bitmap;
1668    uint64_t next_contiguous_offset = 0;
1669    int i, ret;
1670    size_t l2_size_bytes = s->l2_size * l2_entry_size(s);
1671    g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes);
1672    bool metadata_overlap;
1673
1674    /* Read L2 table from disk */
1675    ret = bdrv_pread(bs->file, l2_offset, l2_size_bytes, l2_table, 0);
1676    if (ret < 0) {
1677        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1678        res->check_errors++;
1679        return ret;
1680    }
1681
1682    /* Do the actual checks */
1683    for (i = 0; i < s->l2_size; i++) {
1684        uint64_t coffset;
1685        int csize;
1686        QCow2ClusterType type;
1687
1688        l2_entry = get_l2_entry(s, l2_table, i);
1689        l2_bitmap = get_l2_bitmap(s, l2_table, i);
1690        type = qcow2_get_cluster_type(bs, l2_entry);
1691
1692        if (type != QCOW2_CLUSTER_COMPRESSED) {
1693            /* Check reserved bits of Standard Cluster Descriptor */
1694            if (l2_entry & L2E_STD_RESERVED_MASK) {
1695                fprintf(stderr, "ERROR found l2 entry with reserved bits set: "
1696                        "%" PRIx64 "\n", l2_entry);
1697                res->corruptions++;
1698            }
1699        }
1700
1701        switch (type) {
1702        case QCOW2_CLUSTER_COMPRESSED:
1703            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1704            if (l2_entry & QCOW_OFLAG_COPIED) {
1705                fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
1706                    "copied flag must never be set for compressed "
1707                    "clusters\n", l2_entry & s->cluster_offset_mask);
1708                l2_entry &= ~QCOW_OFLAG_COPIED;
1709                res->corruptions++;
1710            }
1711
1712            if (has_data_file(bs)) {
1713                fprintf(stderr, "ERROR compressed cluster %d with data file, "
1714                        "entry=0x%" PRIx64 "\n", i, l2_entry);
1715                res->corruptions++;
1716                break;
1717            }
1718
1719            if (l2_bitmap) {
1720                fprintf(stderr, "ERROR compressed cluster %d with non-zero "
1721                        "subcluster allocation bitmap, entry=0x%" PRIx64 "\n",
1722                        i, l2_entry);
1723                res->corruptions++;
1724                break;
1725            }
1726
1727            /* Mark cluster as used */
1728            qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
1729            ret = qcow2_inc_refcounts_imrt(
1730                bs, res, refcount_table, refcount_table_size, coffset, csize);
1731            if (ret < 0) {
1732                return ret;
1733            }
1734
1735            if (flags & CHECK_FRAG_INFO) {
1736                res->bfi.allocated_clusters++;
1737                res->bfi.compressed_clusters++;
1738
1739                /*
1740                 * Compressed clusters are fragmented by nature.  Since they
1741                 * take up sub-sector space but we only have sector granularity
1742                 * I/O we need to re-read the same sectors even for adjacent
1743                 * compressed clusters.
1744                 */
1745                res->bfi.fragmented_clusters++;
1746            }
1747            break;
1748
1749        case QCOW2_CLUSTER_ZERO_ALLOC:
1750        case QCOW2_CLUSTER_NORMAL:
1751        {
1752            uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1753
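            /*
             * The high bitmap half holds the "reads as zero" flags and
             * the low half the "allocated" flags; a bit set in the AND
             * of both halves means some subcluster is marked both
             * allocated and zero at once, which is invalid.
             */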
1754            if ((l2_bitmap >> 32) & l2_bitmap) {
1755                res->corruptions++;
1756                fprintf(stderr, "ERROR offset=%" PRIx64 ": Allocated "
1757                        "cluster has corrupted subcluster allocation bitmap\n",
1758                        offset);
1759            }
1760
1761            /* Correct offsets are cluster aligned */
1762            if (offset_into_cluster(s, offset)) {
1763                bool contains_data;
1764                res->corruptions++;
1765
1766                if (has_subclusters(s)) {
1767                    contains_data = (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC);
1768                } else {
1769                    contains_data = !(l2_entry & QCOW_OFLAG_ZERO);
1770                }
1771
1772                if (!contains_data) {
1773                    fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated "
1774                            "cluster is not properly aligned; L2 entry "
1775                            "corrupted.\n",
1776                            fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
1777                            offset);
1778                    if (fix & BDRV_FIX_ERRORS) {
1779                        ret = fix_l2_entry_by_zero(bs, res, l2_offset,
1780                                                   l2_table, i, active,
1781                                                   &metadata_overlap);
1782                        if (metadata_overlap) {
1783                            /*
1784                             * Something is seriously wrong, so abort checking
1785                             * this L2 table.
1786                             */
1787                            return ret;
1788                        }
1789
1790                        if (ret == 0) {
1791                            /*
1792                             * Skip marking the cluster as used
1793                             * (it is unused now).
1794                             */
1795                            continue;
1796                        }
1797
1798                        /*
1799                         * Failed to fix.
1800                         * Do not abort, continue checking the rest of this
1801                         * L2 table's entries.
1802                         */
1803                    }
1804                } else {
1805                    fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
1806                        "not properly aligned; L2 entry corrupted.\n", offset);
1807                }
1808            }
1809
1810            if (flags & CHECK_FRAG_INFO) {
1811                res->bfi.allocated_clusters++;
1812                if (next_contiguous_offset &&
1813                    offset != next_contiguous_offset) {
1814                    res->bfi.fragmented_clusters++;
1815                }
1816                next_contiguous_offset = offset + s->cluster_size;
1817            }
1818
1819            /* Mark cluster as used */
1820            if (!has_data_file(bs)) {
1821                ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table,
1822                                               refcount_table_size,
1823                                               offset, s->cluster_size);
1824                if (ret < 0) {
1825                    return ret;
1826                }
1827            }
1828            break;
1829        }
1830
1831        case QCOW2_CLUSTER_ZERO_PLAIN:
1832            /* Impossible when image has subclusters */
1833            assert(!l2_bitmap);
1834            break;
1835
1836        case QCOW2_CLUSTER_UNALLOCATED:
1837            if (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC) {
1838                res->corruptions++;
1839                fprintf(stderr, "ERROR: Unallocated "
1840                        "cluster has non-zero subcluster allocation map\n");
1841            }
1842            break;
1843
1844        default:
1845            abort();
1846        }
1847    }
1848
1849    return 0;
1850}
1851
1852/*
1853 * Increases the refcount for the L1 table, its L2 tables and all referenced
1854 * clusters in the given refcount table. While doing so, performs some checks
1855 * on L1 and L2 entries.
1856 *
1857 * Returns 0 once the checks have completed (the number of errors found is
1858 * recorded in res) or -errno if an internal error occurred.
1859 */
1860static int check_refcounts_l1(BlockDriverState *bs,
1861                              BdrvCheckResult *res,
1862                              void **refcount_table,
1863                              int64_t *refcount_table_size,
1864                              int64_t l1_table_offset, int l1_size,
1865                              int flags, BdrvCheckMode fix, bool active)
1866{
1867    BDRVQcow2State *s = bs->opaque;
1868    size_t l1_size_bytes = l1_size * L1E_SIZE;
1869    g_autofree uint64_t *l1_table = NULL;
1870    uint64_t l2_offset;
1871    int i, ret;
1872
1873    if (!l1_size) {
1874        return 0;
1875    }
1876
1877    /* Mark L1 table as used */
1878    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1879                                   l1_table_offset, l1_size_bytes);
1880    if (ret < 0) {
1881        return ret;
1882    }
1883
1884    l1_table = g_try_malloc(l1_size_bytes);
1885    if (l1_table == NULL) {
1886        res->check_errors++;
1887        return -ENOMEM;
1888    }
1889
1890    /* Read L1 table entries from disk */
1891    ret = bdrv_pread(bs->file, l1_table_offset, l1_size_bytes, l1_table, 0);
1892    if (ret < 0) {
1893        fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1894        res->check_errors++;
1895        return ret;
1896    }
1897
1898    for (i = 0; i < l1_size; i++) {
1899        be64_to_cpus(&l1_table[i]);
1900    }
1901
1902    /* Do the actual checks */
1903    for (i = 0; i < l1_size; i++) {
1904        if (!l1_table[i]) {
1905            continue;
1906        }
1907
1908        if (l1_table[i] & L1E_RESERVED_MASK) {
1909            fprintf(stderr, "ERROR found L1 entry with reserved bits set: "
1910                    "%" PRIx64 "\n", l1_table[i]);
1911            res->corruptions++;
1912        }
1913
1914        l2_offset = l1_table[i] & L1E_OFFSET_MASK;
1915
1916        /* Mark L2 table as used */
1917        ret = qcow2_inc_refcounts_imrt(bs, res,
1918                                       refcount_table, refcount_table_size,
1919                                       l2_offset, s->cluster_size);
1920        if (ret < 0) {
1921            return ret;
1922        }
1923
1924        /* L2 tables are cluster aligned */
1925        if (offset_into_cluster(s, l2_offset)) {
1926            fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1927                "cluster aligned; L1 entry corrupted\n", l2_offset);
1928            res->corruptions++;
1929        }
1930
1931        /* Process and check L2 entries */
1932        ret = check_refcounts_l2(bs, res, refcount_table,
1933                                 refcount_table_size, l2_offset, flags,
1934                                 fix, active);
1935        if (ret < 0) {
1936            return ret;
1937        }
1938    }
1939
1940    return 0;
1941}
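/*
 * Illustrative sketch (not upstream code): decoding an L1 entry as the
 * loop above does.  L1E_OFFSET_MASK extracts the L2 table offset, and
 * QCOW_OFLAG_COPIED is the "refcount is exactly 1" hint in the entry's
 * top bit.
 */
static inline uint64_t example_decode_l1_entry(uint64_t l1_entry,
                                               bool *copied)
{
    *copied = (l1_entry & QCOW_OFLAG_COPIED) != 0;
    return l1_entry & L1E_OFFSET_MASK;
}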
1942
1943/*
1944 * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1945 *
1946 * This function does not print an error message nor does it increment
1947 * check_errors if qcow2_get_refcount fails (this is because such an error will
1948 * have been already detected and sufficiently signaled by the calling function
1949 * (qcow2_check_refcounts) by the time this function is called).
1950 */
1951static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1952                              BdrvCheckMode fix)
1953{
1954    BDRVQcow2State *s = bs->opaque;
1955    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1956    int ret;
1957    uint64_t refcount;
1958    int i, j;
1959    bool repair;
1960
1961    if (fix & BDRV_FIX_ERRORS) {
1962        /* Always repair */
1963        repair = true;
1964    } else if (fix & BDRV_FIX_LEAKS) {
1965        /* Repair only if that seems safe: This function is always
1966         * called after the refcounts have been fixed, so the refcount
1967         * is accurate if that repair was successful */
1968        repair = !res->check_errors && !res->corruptions && !res->leaks;
1969    } else {
1970        repair = false;
1971    }
1972
1973    for (i = 0; i < s->l1_size; i++) {
1974        uint64_t l1_entry = s->l1_table[i];
1975        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1976        int l2_dirty = 0;
1977
1978        if (!l2_offset) {
1979            continue;
1980        }
1981
1982        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1983                                 &refcount);
1984        if (ret < 0) {
1985            /* don't print message nor increment check_errors */
1986            continue;
1987        }
1988        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1989            res->corruptions++;
1990            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1991                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1992                    repair ? "Repairing" : "ERROR", i, l1_entry, refcount);
1993            if (repair) {
1994                s->l1_table[i] = refcount == 1
1995                               ? l1_entry |  QCOW_OFLAG_COPIED
1996                               : l1_entry & ~QCOW_OFLAG_COPIED;
1997                ret = qcow2_write_l1_entry(bs, i);
1998                if (ret < 0) {
1999                    res->check_errors++;
2000                    goto fail;
2001                }
2002                res->corruptions--;
2003                res->corruptions_fixed++;
2004            }
2005        }
2006
2007        ret = bdrv_pread(bs->file, l2_offset, s->l2_size * l2_entry_size(s),
2008                         l2_table, 0);
2009        if (ret < 0) {
2010            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
2011                    strerror(-ret));
2012            res->check_errors++;
2013            goto fail;
2014        }
2015
2016        for (j = 0; j < s->l2_size; j++) {
2017            uint64_t l2_entry = get_l2_entry(s, l2_table, j);
2018            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
2019            QCow2ClusterType cluster_type = qcow2_get_cluster_type(bs, l2_entry);
2020
2021            if (cluster_type == QCOW2_CLUSTER_NORMAL ||
2022                cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
2023                if (has_data_file(bs)) {
2024                    refcount = 1;
2025                } else {
2026                    ret = qcow2_get_refcount(bs,
2027                                             data_offset >> s->cluster_bits,
2028                                             &refcount);
2029                    if (ret < 0) {
2030                        /* don't print message nor increment check_errors */
2031                        continue;
2032                    }
2033                }
2034                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
2035                    res->corruptions++;
2036                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
2037                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
2038                            repair ? "Repairing" : "ERROR", l2_entry, refcount);
2039                    if (repair) {
2040                        set_l2_entry(s, l2_table, j,
2041                                     refcount == 1 ?
2042                                     l2_entry |  QCOW_OFLAG_COPIED :
2043                                     l2_entry & ~QCOW_OFLAG_COPIED);
2044                        l2_dirty++;
2045                    }
2046                }
2047            }
2048        }
2049
2050        if (l2_dirty > 0) {
2051            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
2052                                                l2_offset, s->cluster_size,
2053                                                false);
2054            if (ret < 0) {
2055                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
2056                        "overlap check failed: %s\n", strerror(-ret));
2057                res->check_errors++;
2058                goto fail;
2059            }
2060
2061            ret = bdrv_pwrite(bs->file, l2_offset, s->cluster_size, l2_table,
2062                              0);
2063            if (ret < 0) {
2064                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
2065                        strerror(-ret));
2066                res->check_errors++;
2067                goto fail;
2068            }
2069            res->corruptions -= l2_dirty;
2070            res->corruptions_fixed += l2_dirty;
2071        }
2072    }
2073
2074    ret = 0;
2075
2076fail:
2077    qemu_vfree(l2_table);
2078    return ret;
2079}
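/*
 * Illustrative sketch (not upstream code): the OFLAG_COPIED invariant
 * enforced above, as a predicate.  The flag must be set exactly when the
 * referenced cluster's refcount is 1, i.e. when in-place writes are safe.
 */
static inline bool example_oflag_copied_consistent(uint64_t entry,
                                                   uint64_t refcount)
{
    return (refcount == 1) == ((entry & QCOW_OFLAG_COPIED) != 0);
}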
2080
2081/*
2082 * Checks consistency of refblocks and accounts for each refblock in
2083 * *refcount_table.
2084 */
2085static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
2086                           BdrvCheckMode fix, bool *rebuild,
2087                           void **refcount_table, int64_t *nb_clusters)
2088{
2089    BDRVQcow2State *s = bs->opaque;
2090    int64_t i, size;
2091    int ret;
2092
2093    for (i = 0; i < s->refcount_table_size; i++) {
2094        uint64_t offset, cluster;
2095        offset = s->refcount_table[i] & REFT_OFFSET_MASK;
2096        cluster = offset >> s->cluster_bits;
2097
2098        if (s->refcount_table[i] & REFT_RESERVED_MASK) {
2099            fprintf(stderr, "ERROR refcount table entry %" PRId64 " has "
2100                    "reserved bits set\n", i);
2101            res->corruptions++;
2102            *rebuild = true;
2103            continue;
2104        }
2105
2106        /* Refcount blocks are cluster aligned */
2107        if (offset_into_cluster(s, offset)) {
2108            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
2109                "cluster aligned; refcount table entry corrupted\n", i);
2110            res->corruptions++;
2111            *rebuild = true;
2112            continue;
2113        }
2114
2115        if (cluster >= *nb_clusters) {
2116            res->corruptions++;
2117            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
2118                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
2119
2120            if (fix & BDRV_FIX_ERRORS) {
2121                int64_t new_nb_clusters;
2122                Error *local_err = NULL;
2123
2124                if (offset > INT64_MAX - s->cluster_size) {
2125                    ret = -EINVAL;
2126                    goto resize_fail;
2127                }
2128
2129                ret = bdrv_truncate(bs->file, offset + s->cluster_size, false,
2130                                    PREALLOC_MODE_OFF, 0, &local_err);
2131                if (ret < 0) {
2132                    error_report_err(local_err);
2133                    goto resize_fail;
2134                }
2135                size = bdrv_getlength(bs->file->bs);
2136                if (size < 0) {
2137                    ret = size;
2138                    goto resize_fail;
2139                }
2140
2141                new_nb_clusters = size_to_clusters(s, size);
2142                assert(new_nb_clusters >= *nb_clusters);
2143
2144                ret = realloc_refcount_array(s, refcount_table,
2145                                             nb_clusters, new_nb_clusters);
2146                if (ret < 0) {
2147                    res->check_errors++;
2148                    return ret;
2149                }
2150
2151                if (cluster >= *nb_clusters) {
2152                    ret = -EINVAL;
2153                    goto resize_fail;
2154                }
2155
2156                res->corruptions--;
2157                res->corruptions_fixed++;
2158                ret = qcow2_inc_refcounts_imrt(bs, res,
2159                                               refcount_table, nb_clusters,
2160                                               offset, s->cluster_size);
2161                if (ret < 0) {
2162                    return ret;
2163                }
2164                /* No need to check whether the refcount is now greater than 1:
2165                 * This area was just allocated and zeroed, so it can only be
2166                 * exactly 1 after qcow2_inc_refcounts_imrt() */
2167                continue;
2168
2169resize_fail:
2170                *rebuild = true;
2171                fprintf(stderr, "ERROR could not resize image: %s\n",
2172                        strerror(-ret));
2173            }
2174            continue;
2175        }
2176
2177        if (offset != 0) {
2178            ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2179                                           offset, s->cluster_size);
2180            if (ret < 0) {
2181                return ret;
2182            }
2183            if (s->get_refcount(*refcount_table, cluster) != 1) {
2184                fprintf(stderr, "ERROR refcount block %" PRId64
2185                        " refcount=%" PRIu64 "\n", i,
2186                        s->get_refcount(*refcount_table, cluster));
2187                res->corruptions++;
2188                *rebuild = true;
2189            }
2190        }
2191    }
2192
2193    return 0;
2194}
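/*
 * Illustrative sketch (not upstream code): mapping a reftable entry to
 * the cluster index of its refblock, as done at the top of the loop in
 * check_refblocks().  The low bits of an entry are reserved, hence the
 * mask.
 */
static inline uint64_t example_refblock_cluster(uint64_t reftable_entry,
                                                unsigned cluster_bits)
{
    return (reftable_entry & REFT_OFFSET_MASK) >> cluster_bits;
}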
2195
2196/*
2197 * Calculates an in-memory refcount table.
2198 */
2199static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2200                               BdrvCheckMode fix, bool *rebuild,
2201                               void **refcount_table, int64_t *nb_clusters)
2202{
2203    BDRVQcow2State *s = bs->opaque;
2204    int64_t i;
2205    QCowSnapshot *sn;
2206    int ret;
2207
2208    if (!*refcount_table) {
2209        int64_t old_size = 0;
2210        ret = realloc_refcount_array(s, refcount_table,
2211                                     &old_size, *nb_clusters);
2212        if (ret < 0) {
2213            res->check_errors++;
2214            return ret;
2215        }
2216    }
2217
2218    /* header */
2219    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2220                                   0, s->cluster_size);
2221    if (ret < 0) {
2222        return ret;
2223    }
2224
2225    /* current L1 table */
2226    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2227                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
2228                             fix, true);
2229    if (ret < 0) {
2230        return ret;
2231    }
2232
2233    /* snapshots */
2234    if (has_data_file(bs) && s->nb_snapshots) {
2235        fprintf(stderr, "ERROR %d snapshots in image with data file\n",
2236                s->nb_snapshots);
2237        res->corruptions++;
2238    }
2239
2240    for (i = 0; i < s->nb_snapshots; i++) {
2241        sn = s->snapshots + i;
2242        if (offset_into_cluster(s, sn->l1_table_offset)) {
2243            fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": "
2244                    "L1 table is not cluster aligned; snapshot table entry "
2245                    "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset);
2246            res->corruptions++;
2247            continue;
2248        }
2249        if (sn->l1_size > QCOW_MAX_L1_SIZE / L1E_SIZE) {
2250            fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": "
2251                    "L1 table is too large; snapshot table entry corrupted\n",
2252                    sn->id_str, sn->name, sn->l1_size);
2253            res->corruptions++;
2254            continue;
2255        }
2256        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2257                                 sn->l1_table_offset, sn->l1_size, 0, fix,
2258                                 false);
2259        if (ret < 0) {
2260            return ret;
2261        }
2262    }
2263    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2264                                   s->snapshots_offset, s->snapshots_size);
2265    if (ret < 0) {
2266        return ret;
2267    }
2268
2269    /* refcount data */
2270    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2271                                   s->refcount_table_offset,
2272                                   s->refcount_table_size *
2273                                   REFTABLE_ENTRY_SIZE);
2274    if (ret < 0) {
2275        return ret;
2276    }
2277
2278    /* encryption */
2279    if (s->crypto_header.length) {
2280        ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2281                                       s->crypto_header.offset,
2282                                       s->crypto_header.length);
2283        if (ret < 0) {
2284            return ret;
2285        }
2286    }
2287
2288    /* bitmaps */
2289    ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
2290    if (ret < 0) {
2291        return ret;
2292    }
2293
2294    return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
2295}
2296
2297/*
2298 * Compares the actual reference count for each cluster in the image against the
2299 * refcount as reported by the refcount structures on-disk.
2300 */
2301static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2302                              BdrvCheckMode fix, bool *rebuild,
2303                              int64_t *highest_cluster,
2304                              void *refcount_table, int64_t nb_clusters)
2305{
2306    BDRVQcow2State *s = bs->opaque;
2307    int64_t i;
2308    uint64_t refcount1, refcount2;
2309    int ret;
2310
2311    for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2312        ret = qcow2_get_refcount(bs, i, &refcount1);
2313        if (ret < 0) {
2314            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2315                    i, strerror(-ret));
2316            res->check_errors++;
2317            continue;
2318        }
2319
2320        refcount2 = s->get_refcount(refcount_table, i);
2321
2322        if (refcount1 > 0 || refcount2 > 0) {
2323            *highest_cluster = i;
2324        }
2325
2326        if (refcount1 != refcount2) {
2327            /* Check if we're allowed to fix the mismatch */
2328            int *num_fixed = NULL;
2329            if (refcount1 == 0) {
2330                *rebuild = true;
2331            } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2332                num_fixed = &res->leaks_fixed;
2333            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2334                num_fixed = &res->corruptions_fixed;
2335            }
2336
2337            fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2338                    " reference=%" PRIu64 "\n",
2339                   num_fixed != NULL     ? "Repairing" :
2340                   refcount1 < refcount2 ? "ERROR" :
2341                                           "Leaked",
2342                   i, refcount1, refcount2);
2343
2344            if (num_fixed) {
2345                ret = update_refcount(bs, i << s->cluster_bits, 1,
2346                                      refcount_diff(refcount1, refcount2),
2347                                      refcount1 > refcount2,
2348                                      QCOW2_DISCARD_ALWAYS);
2349                if (ret >= 0) {
2350                    (*num_fixed)++;
2351                    continue;
2352                }
2353            }
2354
2355            /* And if we couldn't, print an error */
2356            if (refcount1 < refcount2) {
2357                res->corruptions++;
2358            } else {
2359                res->leaks++;
2360            }
2361        }
2362    }
2363}
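/*
 * Illustrative sketch (not upstream code): the absolute difference that
 * the repair path above passes to update_refcount(), mirroring the
 * driver's refcount_diff() helper; the direction of the change is
 * carried separately via the "decrease" argument.
 */
static inline uint64_t example_refcount_diff(uint64_t r1, uint64_t r2)
{
    return r1 > r2 ? r1 - r2 : r2 - r1;
}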
2364
2365/*
2366 * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2367 * the on-disk refcount structures.
2368 *
2369 * On input, *first_free_cluster tells where to start looking, and need not
2370 * actually be a free cluster; the returned offset will not be before that
2371 * cluster.  On output, *first_free_cluster points to the first gap found, even
2372 * if that gap was too small to be used as the returned offset.
2373 *
2374 * Note that *first_free_cluster is a cluster index whereas the return value is
2375 * an offset.
2376 */
2377static int64_t alloc_clusters_imrt(BlockDriverState *bs,
2378                                   int cluster_count,
2379                                   void **refcount_table,
2380                                   int64_t *imrt_nb_clusters,
2381                                   int64_t *first_free_cluster)
2382{
2383    BDRVQcow2State *s = bs->opaque;
2384    int64_t cluster = *first_free_cluster, i;
2385    bool first_gap = true;
2386    int contiguous_free_clusters;
2387    int ret;
2388
2389    /* Starting at *first_free_cluster, find a range of at least cluster_count
2390     * contiguous free clusters */
2391    for (contiguous_free_clusters = 0;
2392         cluster < *imrt_nb_clusters &&
2393         contiguous_free_clusters < cluster_count;
2394         cluster++)
2395    {
2396        if (!s->get_refcount(*refcount_table, cluster)) {
2397            contiguous_free_clusters++;
2398            if (first_gap) {
2399                /* If this is the first free cluster found, update
2400                 * *first_free_cluster accordingly */
2401                *first_free_cluster = cluster;
2402                first_gap = false;
2403            }
2404        } else if (contiguous_free_clusters) {
2405            contiguous_free_clusters = 0;
2406        }
2407    }
2408
2409    /* If contiguous_free_clusters is greater than zero, it contains the number
2410     * of contiguous free clusters up to the current cluster; the first free
2411     * cluster in the current "gap" is therefore
2412     * cluster - contiguous_free_clusters */
2413
2414    /* If no such range could be found, grow the in-memory refcount table
2415     * accordingly to append free clusters at the end of the image */
2416    if (contiguous_free_clusters < cluster_count) {
2417        /* contiguous_free_clusters clusters are already empty at the image end;
2418         * we need cluster_count clusters; therefore, we have to allocate
2419         * cluster_count - contiguous_free_clusters new clusters at the end of
2420         * the image (which is the current value of cluster; note that cluster
2421         * may exceed the old *imrt_nb_clusters if *first_free_cluster pointed
2422         * beyond the image end) */
2423        ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
2424                                     cluster + cluster_count
2425                                     - contiguous_free_clusters);
2426        if (ret < 0) {
2427            return ret;
2428        }
2429    }
2430
2431    /* Go back to the first free cluster */
2432    cluster -= contiguous_free_clusters;
2433    for (i = 0; i < cluster_count; i++) {
2434        s->set_refcount(*refcount_table, cluster + i, 1);
2435    }
2436
2437    return cluster << s->cluster_bits;
2438}
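/*
 * Usage sketch (hypothetical caller, not upstream code): allocating two
 * contiguous clusters from the in-memory refcount table.  The return
 * value is a byte offset into the image, or -errno on failure.
 */
static int64_t example_alloc_two_clusters(BlockDriverState *bs,
                                          void **refcount_table,
                                          int64_t *imrt_nb_clusters)
{
    int64_t first_free_cluster = 0;

    return alloc_clusters_imrt(bs, 2, refcount_table, imrt_nb_clusters,
                               &first_free_cluster);
}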
2439
2440/*
2441 * Helper function for rebuild_refcount_structure().
2442 *
2443 * Scan the range of clusters [first_cluster, end_cluster) for allocated
2444 * clusters and write all corresponding refblocks to disk.  The refblock
2445 * and allocation data is taken from the in-memory refcount table
2446 * *refcount_table[] (of size *nb_clusters), which is basically one big
2447 * (unlimited size) refblock for the whole image.
2448 *
2449 * For these refblocks, clusters are allocated using said in-memory
2450 * refcount table.  Care is taken that these allocations are reflected
2451 * in the refblocks written to disk.
2452 *
2453 * The refblocks' offsets are written into a reftable, which is
2454 * *on_disk_reftable_ptr[] (of size *on_disk_reftable_entries_ptr).  If
2455 * that reftable is of insufficient size, it will be resized to fit.
2456 * This reftable is not written to disk.
2457 *
2458 * (If *on_disk_reftable_ptr is not NULL, the entries within are assumed
2459 * to point to existing valid refblocks that do not need to be allocated
2460 * again.)
2461 *
2462 * Return whether the on-disk reftable array was resized (true/false),
2463 * or -errno on error.
2464 */
2465static int rebuild_refcounts_write_refblocks(
2466        BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters,
2467        int64_t first_cluster, int64_t end_cluster,
2468        uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr,
2469        Error **errp
2470    )
2471{
2472    BDRVQcow2State *s = bs->opaque;
2473    int64_t cluster;
2474    int64_t refblock_offset, refblock_start, refblock_index;
2475    int64_t first_free_cluster = 0;
2476    uint64_t *on_disk_reftable = *on_disk_reftable_ptr;
2477    uint32_t on_disk_reftable_entries = *on_disk_reftable_entries_ptr;
2478    void *on_disk_refblock;
2479    bool reftable_grown = false;
2480    int ret;
2481
2482    for (cluster = first_cluster; cluster < end_cluster; cluster++) {
2483        /* Check all clusters to find refblocks that contain non-zero entries */
2484        if (!s->get_refcount(*refcount_table, cluster)) {
2485            continue;
2486        }
2487
2488        /*
2489         * This cluster is allocated, so we need to create a refblock
2490         * for it.  The data we will write to disk is just the
2491         * respective slice from *refcount_table, so it will contain
2492         * accurate refcounts for all clusters belonging to this
2493         * refblock.  After we have written it, we will therefore skip
2494         * all remaining clusters in this refblock.
2495         */
2496
2497        refblock_index = cluster >> s->refcount_block_bits;
2498        refblock_start = refblock_index << s->refcount_block_bits;
2499
2500        if (on_disk_reftable_entries > refblock_index &&
2501            on_disk_reftable[refblock_index])
2502        {
2503            /*
2504             * We can get here on a repeated run of this function: we have
2505             * a reftable from a previous run, and the refblock is already
2506             * allocated.  No need to allocate it again.
2507             */
2508            refblock_offset = on_disk_reftable[refblock_index];
2509        } else {
2510            int64_t refblock_cluster_index;
2511
2512            /* Don't allocate a cluster in a refblock already written to disk */
2513            if (first_free_cluster < refblock_start) {
2514                first_free_cluster = refblock_start;
2515            }
2516            refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2517                                                  nb_clusters,
2518                                                  &first_free_cluster);
2519            if (refblock_offset < 0) {
2520                error_setg_errno(errp, -refblock_offset,
2521                                 "ERROR allocating refblock");
2522                return refblock_offset;
2523            }
2524
2525            refblock_cluster_index = refblock_offset / s->cluster_size;
2526            if (refblock_cluster_index >= end_cluster) {
2527                /*
2528                 * We must write the refblock that holds this refblock's
2529                 * refcount
2530                 */
2531                end_cluster = refblock_cluster_index + 1;
2532            }
2533
2534            if (on_disk_reftable_entries <= refblock_index) {
2535                on_disk_reftable_entries =
2536                    ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE,
2537                             s->cluster_size) / REFTABLE_ENTRY_SIZE;
2538                on_disk_reftable =
2539                    g_try_realloc(on_disk_reftable,
2540                                  on_disk_reftable_entries *
2541                                  REFTABLE_ENTRY_SIZE);
2542                if (!on_disk_reftable) {
2543                    error_setg(errp, "ERROR allocating reftable memory");
2544                    return -ENOMEM;
2545                }
2546
2547                memset(on_disk_reftable + *on_disk_reftable_entries_ptr, 0,
2548                       (on_disk_reftable_entries -
2549                        *on_disk_reftable_entries_ptr) *
2550                       REFTABLE_ENTRY_SIZE);
2551
2552                *on_disk_reftable_ptr = on_disk_reftable;
2553                *on_disk_reftable_entries_ptr = on_disk_reftable_entries;
2554
2555                reftable_grown = true;
2556            } else {
2557                assert(on_disk_reftable);
2558            }
2559            on_disk_reftable[refblock_index] = refblock_offset;
2560        }
2561
2562        /* Refblock is allocated, write it to disk */
2563
2564        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2565                                            s->cluster_size, false);
2566        if (ret < 0) {
2567            error_setg_errno(errp, -ret, "ERROR writing refblock");
2568            return ret;
2569        }
2570
2571        /*
2572         * The refblock is simply a slice of *refcount_table.
2573         * Note that the size of *refcount_table is always aligned to
2574         * whole clusters, so the write operation will not result in
2575         * out-of-bounds accesses.
2576         */
2577        on_disk_refblock = (void *)((char *) *refcount_table +
2578                                    refblock_index * s->cluster_size);
2579
2580        ret = bdrv_pwrite(bs->file, refblock_offset, s->cluster_size,
2581                          on_disk_refblock, 0);
2582        if (ret < 0) {
2583            error_setg_errno(errp, -ret, "ERROR writing refblock");
2584            return ret;
2585        }
2586
2587        /* This refblock is done, skip to its end */
2588        cluster = refblock_start + s->refcount_block_size - 1;
2589    }
2590
2591    return reftable_grown;
2592}
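/*
 * Illustrative sketch (not upstream code): which slice of the big
 * in-memory refcount array backs the refblock covering a given cluster,
 * mirroring the pointer arithmetic above.  One refblock is exactly one
 * cluster worth of refcount entries.
 */
static inline const void *example_refblock_slice(const void *refcount_table,
                                                 int64_t cluster,
                                                 unsigned refcount_block_bits,
                                                 size_t cluster_size)
{
    int64_t refblock_index = cluster >> refcount_block_bits;

    return (const char *)refcount_table + refblock_index * cluster_size;
}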
2593
2594/*
2595 * Creates a new refcount structure based solely on the in-memory information
2596 * given through *refcount_table (this in-memory information is basically just
2597 * the concatenation of all refblocks).  All necessary allocations will be
2598 * reflected in that array.
2599 *
2600 * On success, the old refcount structure is leaked (it will be covered by the
2601 * new refcount structure).
2602 */
2603static int rebuild_refcount_structure(BlockDriverState *bs,
2604                                      BdrvCheckResult *res,
2605                                      void **refcount_table,
2606                                      int64_t *nb_clusters,
2607                                      Error **errp)
2608{
2609    BDRVQcow2State *s = bs->opaque;
2610    int64_t reftable_offset = -1;
2611    int64_t reftable_length = 0;
2612    int64_t reftable_clusters;
2613    int64_t refblock_index;
2614    uint32_t on_disk_reftable_entries = 0;
2615    uint64_t *on_disk_reftable = NULL;
2616    int ret = 0;
2617    int reftable_size_changed = 0;
2618    struct {
2619        uint64_t reftable_offset;
2620        uint32_t reftable_clusters;
2621    } QEMU_PACKED reftable_offset_and_clusters;
2622
2623    qcow2_cache_empty(bs, s->refcount_block_cache);
2624
2625    /*
2626     * For each refblock containing entries, we try to allocate a
2627     * cluster (in the in-memory refcount table) and write its offset
2628     * into on_disk_reftable[].  We then write the whole refblock to
2629     * disk (as a slice of the in-memory refcount table).
2630     * This is done by rebuild_refcounts_write_refblocks().
2631     *
2632     * Once we have scanned all clusters, we try to find space for the
2633     * reftable.  This will dirty the in-memory refcount table (i.e.
2634     * make it differ from the refblocks we have already written), so we
2635     * need to run rebuild_refcounts_write_refblocks() again for the
2636     * range of clusters where the reftable has been allocated.
2637     *
2638     * This second run might make the reftable grow again, in which case
2639     * we will need to allocate another space for it, which is why we
2640     * repeat all this until the reftable stops growing.
2641     *
2642     * (This loop will terminate, because with every cluster the
2643     * reftable grows, it can accommodate many more refcounts,
2644     * so that at some point this must be able to cover the reftable
2645     * and all refblocks describing it.)
2646     *
2647     * We then convert the reftable to big-endian and write it to disk.
2648     *
2649     * Note that we never free any reftable allocations.  Doing so would
2650     * needlessly complicate the algorithm: The eventual second check
2651     * run we do will clean up all leaks we have caused.
2652     */
2653
2654    reftable_size_changed =
2655        rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
2656                                          0, *nb_clusters,
2657                                          &on_disk_reftable,
2658                                          &on_disk_reftable_entries, errp);
2659    if (reftable_size_changed < 0) {
2660        res->check_errors++;
2661        ret = reftable_size_changed;
2662        goto fail;
2663    }
2664
2665    /*
2666     * There was no reftable before, so rebuild_refcounts_write_refblocks()
2667     * must have increased its size (from 0 to something).
2668     */
2669    assert(reftable_size_changed);
2670
2671    do {
2672        int64_t reftable_start_cluster, reftable_end_cluster;
2673        int64_t first_free_cluster = 0;
2674
2675        reftable_length = on_disk_reftable_entries * REFTABLE_ENTRY_SIZE;
2676        reftable_clusters = size_to_clusters(s, reftable_length);
2677
2678        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2679                                              refcount_table, nb_clusters,
2680                                              &first_free_cluster);
2681        if (reftable_offset < 0) {
2682            error_setg_errno(errp, -reftable_offset,
2683                             "ERROR allocating reftable");
2684            res->check_errors++;
2685            ret = reftable_offset;
2686            goto fail;
2687        }
2688
2689        /*
2690         * We need to update the affected refblocks, so re-run the
2691         * write_refblocks loop for the reftable's range of clusters.
2692         */
2693        assert(offset_into_cluster(s, reftable_offset) == 0);
2694        reftable_start_cluster = reftable_offset / s->cluster_size;
2695        reftable_end_cluster = reftable_start_cluster + reftable_clusters;
2696        reftable_size_changed =
2697            rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
2698                                              reftable_start_cluster,
2699                                              reftable_end_cluster,
2700                                              &on_disk_reftable,
2701                                              &on_disk_reftable_entries, errp);
2702        if (reftable_size_changed < 0) {
2703            res->check_errors++;
2704            ret = reftable_size_changed;
2705            goto fail;
2706        }
2707
2708        /*
2709         * If the reftable size has changed, we will need to find a new
2710         * allocation, repeating the loop.
2711         */
2712    } while (reftable_size_changed);
2713
2714    /* The above loop must have run at least once */
2715    assert(reftable_offset >= 0);
2716
2717    /*
2718     * All allocations are done, all refblocks are written, convert the
2719     * reftable to big-endian and write it to disk.
2720     */
2721
2722    for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
2723         refblock_index++)
2724    {
2725        cpu_to_be64s(&on_disk_reftable[refblock_index]);
2726    }
2727
2728    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, reftable_length,
2729                                        false);
2730    if (ret < 0) {
2731        error_setg_errno(errp, -ret, "ERROR writing reftable");
2732        goto fail;
2733    }
2734
2735    assert(reftable_length < INT_MAX);
2736    ret = bdrv_pwrite(bs->file, reftable_offset, reftable_length,
2737                      on_disk_reftable, 0);
2738    if (ret < 0) {
2739        error_setg_errno(errp, -ret, "ERROR writing reftable");
2740        goto fail;
2741    }
2742
2743    /* Enter new reftable into the image header */
2744    reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
2745    reftable_offset_and_clusters.reftable_clusters =
2746        cpu_to_be32(reftable_clusters);
2747    ret = bdrv_pwrite_sync(bs->file,
2748                           offsetof(QCowHeader, refcount_table_offset),
2749                           sizeof(reftable_offset_and_clusters),
2750                           &reftable_offset_and_clusters, 0);
2751    if (ret < 0) {
2752        error_setg_errno(errp, -ret, "ERROR setting reftable");
2753        goto fail;
2754    }
2755
2756    for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
2757         refblock_index++)
2758    {
2759        be64_to_cpus(&on_disk_reftable[refblock_index]);
2760    }
2761    s->refcount_table = on_disk_reftable;
2762    s->refcount_table_offset = reftable_offset;
2763    s->refcount_table_size = on_disk_reftable_entries;
2764    update_max_refcount_table_index(s);
2765
2766    return 0;
2767
2768fail:
2769    g_free(on_disk_reftable);
2770    return ret;
2771}
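/*
 * Illustrative sketch (not upstream code): the reftable size written into
 * the header above, i.e. how many clusters a reftable with the given
 * number of entries occupies (each entry being REFTABLE_ENTRY_SIZE, 8
 * bytes).
 */
static inline uint32_t example_reftable_clusters(uint32_t entries,
                                                 size_t cluster_size)
{
    uint64_t bytes = (uint64_t)entries * REFTABLE_ENTRY_SIZE;

    return DIV_ROUND_UP(bytes, cluster_size);
}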
2772
2773/*
2774 * Checks an image for refcount consistency.
2775 *
2776 * Returns 0 when the check has completed (any errors found are recorded in
2777 * res), and -errno when an internal error occurred.
2778 */
2779int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2780                          BdrvCheckMode fix)
2781{
2782    BDRVQcow2State *s = bs->opaque;
2783    BdrvCheckResult pre_compare_res;
2784    int64_t size, highest_cluster, nb_clusters;
2785    void *refcount_table = NULL;
2786    bool rebuild = false;
2787    int ret;
2788
2789    size = bdrv_getlength(bs->file->bs);
2790    if (size < 0) {
2791        res->check_errors++;
2792        return size;
2793    }
2794
2795    nb_clusters = size_to_clusters(s, size);
2796    if (nb_clusters > INT_MAX) {
2797        res->check_errors++;
2798        return -EFBIG;
2799    }
2800
2801    res->bfi.total_clusters =
2802        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2803
2804    ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2805                              &nb_clusters);
2806    if (ret < 0) {
2807        goto fail;
2808    }
2809
2810    /* In case we don't need to rebuild the refcount structure (but want to fix
2811     * something), this function is immediately called again, in which case the
2812     * result should be ignored */
2813    pre_compare_res = *res;
2814    compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2815                      nb_clusters);
2816
2817    if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2818        BdrvCheckResult old_res = *res;
2819        int fresh_leaks = 0;
2820        Error *local_err = NULL;
2821
2822        fprintf(stderr, "Rebuilding refcount structure\n");
2823        ret = rebuild_refcount_structure(bs, res, &refcount_table,
2824                                         &nb_clusters, &local_err);
2825        if (ret < 0) {
2826            error_report_err(local_err);
2827            goto fail;
2828        }
2829
2830        res->corruptions = 0;
2831        res->leaks = 0;
2832
2833        /* Because the old reftable has been exchanged for a new one the
2834         * references have to be recalculated */
2835        rebuild = false;
2836        memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2837        ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2838                                  &nb_clusters);
2839        if (ret < 0) {
2840            goto fail;
2841        }
2842
2843        if (fix & BDRV_FIX_LEAKS) {
2844            /* The old refcount structures are now leaked, fix it; the result
2845             * can be ignored, aside from leaks which were introduced by
2846             * rebuild_refcount_structure() that could not be fixed */
2847            BdrvCheckResult saved_res = *res;
2848            *res = (BdrvCheckResult){ 0 };
2849
2850            compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2851                              &highest_cluster, refcount_table, nb_clusters);
2852            if (rebuild) {
2853                fprintf(stderr, "ERROR rebuilt refcount structure is still "
2854                        "broken\n");
2855            }
2856
2857            /* Any leaks accounted for here were introduced by
2858             * rebuild_refcount_structure() because that function has created a
2859             * new refcount structure from scratch */
2860            fresh_leaks = res->leaks;
2861            *res = saved_res;
2862        }
2863
2864        if (res->corruptions < old_res.corruptions) {
2865            res->corruptions_fixed += old_res.corruptions - res->corruptions;
2866        }
2867        if (res->leaks < old_res.leaks) {
2868            res->leaks_fixed += old_res.leaks - res->leaks;
2869        }
2870        res->leaks += fresh_leaks;
2871    } else if (fix) {
2872        if (rebuild) {
2873            fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2874            res->check_errors++;
2875            ret = -EIO;
2876            goto fail;
2877        }
2878
2879        if (res->leaks || res->corruptions) {
2880            *res = pre_compare_res;
2881            compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2882                              refcount_table, nb_clusters);
2883        }
2884    }
2885
2886    /* check OFLAG_COPIED */
2887    ret = check_oflag_copied(bs, res, fix);
2888    if (ret < 0) {
2889        goto fail;
2890    }
2891
2892    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2893    ret = 0;
2894
2895fail:
2896    g_free(refcount_table);
2897
2898    return ret;
2899}
2900
2901#define overlaps_with(ofs, sz) \
2902    ranges_overlap(offset, size, ofs, sz)
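/*
 * Illustrative sketch (not upstream code): the half-open interval test
 * behind ranges_overlap(), assuming neither offset + length sum
 * overflows.  overlaps_with() above simply captures the function
 * arguments offset/size as the first range.
 */
static inline bool example_ranges_overlap(uint64_t a, uint64_t la,
                                          uint64_t b, uint64_t lb)
{
    return a < b + lb && b < a + la;
}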
2903
2904/*
2905 * Checks if the given offset into the image file is actually free to use by
2906 * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2907 * i.e. a sanity check without relying on the refcount tables.
2908 *
2909 * The ign parameter specifies which checks not to perform (a bitmask of
2910 * QCow2MetadataOverlap values), i.e. which sections to ignore.
2911 *
2912 * Returns:
2913 * - 0 if writing to this offset will not affect the mentioned metadata
2914 * - a positive QCow2MetadataOverlap value indicating one overlapping section
2915 * - a negative value (-errno) indicating an error while performing a check,
2916 *   e.g. when bdrv_pread failed on QCOW2_OL_INACTIVE_L2
2917 */
2918int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2919                                 int64_t size)
2920{
2921    BDRVQcow2State *s = bs->opaque;
2922    int chk = s->overlap_check & ~ign;
2923    int i, j;
2924
2925    if (!size) {
2926        return 0;
2927    }
2928
2929    if (chk & QCOW2_OL_MAIN_HEADER) {
2930        if (offset < s->cluster_size) {
2931            return QCOW2_OL_MAIN_HEADER;
2932        }
2933    }
2934
2935    /* align range to test to cluster boundaries */
2936    size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
2937    offset = start_of_cluster(s, offset);
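        /*
         * Worked example (illustrative values only): with 64 KiB clusters,
         * offset == 0x12345 and size == 0x100 become offset == 0x10000 and
         * size == 0x10000, so the whole cluster containing the original range
         * is checked against the cluster-aligned metadata below.
         */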
2938
2939    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2940        if (overlaps_with(s->l1_table_offset, s->l1_size * L1E_SIZE)) {
2941            return QCOW2_OL_ACTIVE_L1;
2942        }
2943    }
2944
2945    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2946        if (overlaps_with(s->refcount_table_offset,
2947            s->refcount_table_size * REFTABLE_ENTRY_SIZE)) {
2948            return QCOW2_OL_REFCOUNT_TABLE;
2949        }
2950    }
2951
2952    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2953        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2954            return QCOW2_OL_SNAPSHOT_TABLE;
2955        }
2956    }
2957
2958    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2959        for (i = 0; i < s->nb_snapshots; i++) {
2960            if (s->snapshots[i].l1_size &&
2961                overlaps_with(s->snapshots[i].l1_table_offset,
2962                s->snapshots[i].l1_size * L1E_SIZE)) {
2963                return QCOW2_OL_INACTIVE_L1;
2964            }
2965        }
2966    }
2967
2968    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2969        for (i = 0; i < s->l1_size; i++) {
2970            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2971                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2972                s->cluster_size)) {
2973                return QCOW2_OL_ACTIVE_L2;
2974            }
2975        }
2976    }
2977
2978    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2979        unsigned last_entry = s->max_refcount_table_index;
2980        assert(last_entry < s->refcount_table_size);
2981        assert(last_entry + 1 == s->refcount_table_size ||
2982               (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
2983        for (i = 0; i <= last_entry; i++) {
2984            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2985                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2986                s->cluster_size)) {
2987                return QCOW2_OL_REFCOUNT_BLOCK;
2988            }
2989        }
2990    }
2991
2992    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2993        for (i = 0; i < s->nb_snapshots; i++) {
2994            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2995            uint32_t l1_sz  = s->snapshots[i].l1_size;
2996            uint64_t l1_sz2 = l1_sz * L1E_SIZE;
2997            uint64_t *l1;
2998            int ret;
2999
3000            ret = qcow2_validate_table(bs, l1_ofs, l1_sz, L1E_SIZE,
3001                                       QCOW_MAX_L1_SIZE, "", NULL);
3002            if (ret < 0) {
3003                return ret;
3004            }
3005
3006            l1 = g_try_malloc(l1_sz2);
3007
3008            if (l1_sz2 && l1 == NULL) {
3009                return -ENOMEM;
3010            }
3011
3012            ret = bdrv_pread(bs->file, l1_ofs, l1_sz2, l1, 0);
3013            if (ret < 0) {
3014                g_free(l1);
3015                return ret;
3016            }
3017
3018            for (j = 0; j < l1_sz; j++) {
3019                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
3020                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
3021                    g_free(l1);
3022                    return QCOW2_OL_INACTIVE_L2;
3023                }
3024            }
3025
3026            g_free(l1);
3027        }
3028    }
3029
3030    if ((chk & QCOW2_OL_BITMAP_DIRECTORY) &&
3031        (s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS))
3032    {
3033        if (overlaps_with(s->bitmap_directory_offset,
3034                          s->bitmap_directory_size))
3035        {
3036            return QCOW2_OL_BITMAP_DIRECTORY;
3037        }
3038    }
3039
3040    return 0;
3041}
3042
3043static const char *metadata_ol_names[] = {
3044    [QCOW2_OL_MAIN_HEADER_BITNR]        = "qcow2_header",
3045    [QCOW2_OL_ACTIVE_L1_BITNR]          = "active L1 table",
3046    [QCOW2_OL_ACTIVE_L2_BITNR]          = "active L2 table",
3047    [QCOW2_OL_REFCOUNT_TABLE_BITNR]     = "refcount table",
3048    [QCOW2_OL_REFCOUNT_BLOCK_BITNR]     = "refcount block",
3049    [QCOW2_OL_SNAPSHOT_TABLE_BITNR]     = "snapshot table",
3050    [QCOW2_OL_INACTIVE_L1_BITNR]        = "inactive L1 table",
3051    [QCOW2_OL_INACTIVE_L2_BITNR]        = "inactive L2 table",
3052    [QCOW2_OL_BITMAP_DIRECTORY_BITNR]   = "bitmap directory",
3053};
3054QEMU_BUILD_BUG_ON(QCOW2_OL_MAX_BITNR != ARRAY_SIZE(metadata_ol_names));
3055
3056/*
3057 * First performs a check for metadata overlaps (through
3058 * qcow2_check_metadata_overlap); if that fails with a negative value (error
3059 * while performing a check), that value is returned. If an impending overlap
3060 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt,
3061 * and -EIO returned.
3062 *
3063 * Returns 0 if there were neither overlaps nor errors while checking for
3064 * overlaps; or a negative value (-errno) on error.
3065 */
3066int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
3067                                  int64_t size, bool data_file)
3068{
3069    int ret;
3070
3071    if (data_file && has_data_file(bs)) {
3072        return 0;
3073    }
3074
3075    ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
3076    if (ret < 0) {
3077        return ret;
3078    } else if (ret > 0) {
3079        int metadata_ol_bitnr = ctz32(ret);
3080        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
3081
3082        qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
3083                                "write on metadata (overlaps with %s)",
3084                                metadata_ol_names[metadata_ol_bitnr]);
3085        return -EIO;
3086    }
3087
3088    return 0;
3089}
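    /*
     * Illustrative caller pattern (a sketch only, not a verbatim copy of any
     * actual caller; the real users live elsewhere in the qcow2 driver):
     *
     *     ret = qcow2_pre_write_overlap_check(bs, 0, offset, bytes, false);
     *     if (ret < 0) {
     *         return ret; // on overlap, the image was already marked corrupt
     *     }
     *     ret = bdrv_pwrite(bs->file, offset, bytes, buf, 0);
     */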
3090
3091/* A pointer to a function of this type is given to walk_over_reftable(). That
3092 * function will create refblocks and pass each completed refblock
3093 * (@refblock) to a RefblockFinishOp. @refblock_empty is set if the refblock
3094 * is completely empty.
3095 *
3096 * Along with the refblock, a corresponding reftable entry is passed, in the
3097 * reftable @reftable (which may be reallocated) at @reftable_index.
3098 *
3099 * @allocated should be set to true if a new cluster has been allocated.
3100 */
3101typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
3102                               uint64_t reftable_index, uint64_t *reftable_size,
3103                               void *refblock, bool refblock_empty,
3104                               bool *allocated, Error **errp);
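    /* The implementations below are alloc_refblock(), which allocates clusters
     * for non-empty refblocks during a first pass, and flush_refblock(), which
     * writes the completed refblocks to disk during a second pass. */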
3105
3106/**
3107 * This "operation" for walk_over_reftable() allocates the refblock on disk (if
3108 * it is not empty) and inserts its offset into the new reftable. The size of
3109 * this new reftable is increased as required.
3110 */
3111static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
3112                          uint64_t reftable_index, uint64_t *reftable_size,
3113                          void *refblock, bool refblock_empty, bool *allocated,
3114                          Error **errp)
3115{
3116    BDRVQcow2State *s = bs->opaque;
3117    int64_t offset;
3118
3119    if (!refblock_empty && reftable_index >= *reftable_size) {
3120        uint64_t *new_reftable;
3121        uint64_t new_reftable_size;
3122
3123        new_reftable_size = ROUND_UP(reftable_index + 1,
3124                                     s->cluster_size / REFTABLE_ENTRY_SIZE);
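            /*
             * Illustrative example: with 64 KiB clusters, one reftable cluster
             * holds 8192 entries (REFTABLE_ENTRY_SIZE being 8 bytes), so a
             * reftable_index of 10000 yields a new_reftable_size of 16384
             * entries, i.e. two clusters.
             */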
3125        if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / REFTABLE_ENTRY_SIZE) {
3126            error_setg(errp,
3127                       "This operation would make the refcount table grow "
3128                       "beyond the maximum size supported by QEMU, aborting");
3129            return -ENOTSUP;
3130        }
3131
3132        new_reftable = g_try_realloc(*reftable, new_reftable_size *
3133                                                REFTABLE_ENTRY_SIZE);
3134        if (!new_reftable) {
3135            error_setg(errp, "Failed to increase reftable buffer size");
3136            return -ENOMEM;
3137        }
3138
3139        memset(new_reftable + *reftable_size, 0,
3140               (new_reftable_size - *reftable_size) * REFTABLE_ENTRY_SIZE);
3141
3142        *reftable      = new_reftable;
3143        *reftable_size = new_reftable_size;
3144    }
3145
3146    if (!refblock_empty && !(*reftable)[reftable_index]) {
3147        offset = qcow2_alloc_clusters(bs, s->cluster_size);
3148        if (offset < 0) {
3149            error_setg_errno(errp, -offset, "Failed to allocate refblock");
3150            return offset;
3151        }
3152        (*reftable)[reftable_index] = offset;
3153        *allocated = true;
3154    }
3155
3156    return 0;
3157}
3158
3159/**
3160 * This "operation" for walk_over_reftable() writes the refblock to disk at the
3161 * offset specified by the new reftable's entry. It does not modify the new
3162 * reftable or change any refcounts.
3163 */
3164static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
3165                          uint64_t reftable_index, uint64_t *reftable_size,
3166                          void *refblock, bool refblock_empty, bool *allocated,
3167                          Error **errp)
3168{
3169    BDRVQcow2State *s = bs->opaque;
3170    int64_t offset;
3171    int ret;
3172
3173    if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
3174        offset = (*reftable)[reftable_index];
3175
3176        ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size,
3177                                            false);
3178        if (ret < 0) {
3179            error_setg_errno(errp, -ret, "Overlap check failed");
3180            return ret;
3181        }
3182
3183        ret = bdrv_pwrite(bs->file, offset, s->cluster_size, refblock, 0);
3184        if (ret < 0) {
3185            error_setg_errno(errp, -ret, "Failed to write refblock");
3186            return ret;
3187        }
3188    } else {
3189        assert(refblock_empty);
3190    }
3191
3192    return 0;
3193}
3194
3195/**
3196 * This function walks over the existing reftable and every referenced refblock;
3197 * if @new_set_refcount is non-NULL, it is called for every refcount entry to
3198 * write a matching entry into the passed @new_refblock. Once that
3199 * @new_refblock is completely filled, @operation will be called.
3200 *
3201 * @status_cb and @cb_opaque are used for the amend operation's status callback.
3202 * @index is the index of this walk_over_reftable() call and @total is the total
3203 * number of walk_over_reftable() calls per amend operation. Both are used for
3204 * calculating the parameters for the status callback.
3205 *
3206 * @allocated is set to true if a new cluster has been allocated.
3207 */
3208static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
3209                              uint64_t *new_reftable_index,
3210                              uint64_t *new_reftable_size,
3211                              void *new_refblock, int new_refblock_size,
3212                              int new_refcount_bits,
3213                              RefblockFinishOp *operation, bool *allocated,
3214                              Qcow2SetRefcountFunc *new_set_refcount,
3215                              BlockDriverAmendStatusCB *status_cb,
3216                              void *cb_opaque, int index, int total,
3217                              Error **errp)
3218{
3219    BDRVQcow2State *s = bs->opaque;
3220    uint64_t reftable_index;
3221    bool new_refblock_empty = true;
3222    int refblock_index;
3223    int new_refblock_index = 0;
3224    int ret;
3225
3226    for (reftable_index = 0; reftable_index < s->refcount_table_size;
3227         reftable_index++)
3228    {
3229        uint64_t refblock_offset = s->refcount_table[reftable_index]
3230                                 & REFT_OFFSET_MASK;
3231
3232        status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
3233                  (uint64_t)total * s->refcount_table_size, cb_opaque);
3234
3235        if (refblock_offset) {
3236            void *refblock;
3237
3238            if (offset_into_cluster(s, refblock_offset)) {
3239                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
3240                                        PRIx64 " unaligned (reftable index: %#"
3241                                        PRIx64 ")", refblock_offset,
3242                                        reftable_index);
3243                error_setg(errp,
3244                           "Image is corrupt (unaligned refblock offset)");
3245                return -EIO;
3246            }
3247
3248            ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
3249                                  &refblock);
3250            if (ret < 0) {
3251                error_setg_errno(errp, -ret, "Failed to retrieve refblock");
3252                return ret;
3253            }
3254
3255            for (refblock_index = 0; refblock_index < s->refcount_block_size;
3256                 refblock_index++)
3257            {
3258                uint64_t refcount;
3259
3260                if (new_refblock_index >= new_refblock_size) {
3261                    /* new_refblock is now complete */
3262                    ret = operation(bs, new_reftable, *new_reftable_index,
3263                                    new_reftable_size, new_refblock,
3264                                    new_refblock_empty, allocated, errp);
3265                    if (ret < 0) {
3266                        qcow2_cache_put(s->refcount_block_cache, &refblock);
3267                        return ret;
3268                    }
3269
3270                    (*new_reftable_index)++;
3271                    new_refblock_index = 0;
3272                    new_refblock_empty = true;
3273                }
3274
3275                refcount = s->get_refcount(refblock, refblock_index);
3276                if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
3277                    uint64_t offset;
3278
3279                    qcow2_cache_put(s->refcount_block_cache, &refblock);
3280
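                    /*
                     * Reconstruct the host cluster offset this refcount entry
                     * covers. Illustrative example: with 64 KiB clusters and
                     * 16-bit refcounts (s->refcount_block_bits == 15),
                     * reftable_index == 2 and refblock_index == 5 yield
                     * ((2 << 15) + 5) << 16 == 0x100050000.
                     */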
3281                    offset = ((reftable_index << s->refcount_block_bits)
3282                              + refblock_index) << s->cluster_bits;
3283
3284                    error_setg(errp, "Cannot decrease refcount entry width to "
3285                               "%i bits: Cluster at offset %#" PRIx64 " has a "
3286                               "refcount of %" PRIu64, new_refcount_bits,
3287                               offset, refcount);
3288                    return -EINVAL;
3289                }
3290
3291                if (new_set_refcount) {
3292                    new_set_refcount(new_refblock, new_refblock_index++,
3293                                     refcount);
3294                } else {
3295                    new_refblock_index++;
3296                }
3297                new_refblock_empty = new_refblock_empty && refcount == 0;
3298            }
3299
3300            qcow2_cache_put(s->refcount_block_cache, &refblock);
3301        } else {
3302            /* No refblock means every refcount is 0 */
3303            for (refblock_index = 0; refblock_index < s->refcount_block_size;
3304                 refblock_index++)
3305            {
3306                if (new_refblock_index >= new_refblock_size) {
3307                    /* new_refblock is now complete */
3308                    ret = operation(bs, new_reftable, *new_reftable_index,
3309                                    new_reftable_size, new_refblock,
3310                                    new_refblock_empty, allocated, errp);
3311                    if (ret < 0) {
3312                        return ret;
3313                    }
3314
3315                    (*new_reftable_index)++;
3316                    new_refblock_index = 0;
3317                    new_refblock_empty = true;
3318                }
3319
3320                if (new_set_refcount) {
3321                    new_set_refcount(new_refblock, new_refblock_index++, 0);
3322                } else {
3323                    new_refblock_index++;
3324                }
3325            }
3326        }
3327    }
3328
3329    if (new_refblock_index > 0) {
3330        /* Pad and finish the final refblock, which may be only partially filled */
3331        if (new_set_refcount) {
3332            for (; new_refblock_index < new_refblock_size;
3333                 new_refblock_index++)
3334            {
3335                new_set_refcount(new_refblock, new_refblock_index, 0);
3336            }
3337        }
3338
3339        ret = operation(bs, new_reftable, *new_reftable_index,
3340                        new_reftable_size, new_refblock, new_refblock_empty,
3341                        allocated, errp);
3342        if (ret < 0) {
3343            return ret;
3344        }
3345
3346        (*new_reftable_index)++;
3347    }
3348
3349    status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
3350              (uint64_t)total * s->refcount_table_size, cb_opaque);
3351
3352    return 0;
3353}
3354
3355int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
3356                                BlockDriverAmendStatusCB *status_cb,
3357                                void *cb_opaque, Error **errp)
3358{
3359    BDRVQcow2State *s = bs->opaque;
3360    Qcow2GetRefcountFunc *new_get_refcount;
3361    Qcow2SetRefcountFunc *new_set_refcount;
3362    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
3363    uint64_t *new_reftable = NULL, new_reftable_size = 0;
3364    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
3365    uint64_t new_reftable_index = 0;
3366    uint64_t i;
3367    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
3368    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
3369    int old_refcount_order;
3370    int walk_index = 0;
3371    int ret;
3372    bool new_allocation;
3373
3374    assert(s->qcow_version >= 3);
3375    assert(refcount_order >= 0 && refcount_order <= 6);
3376
3377    /* see qcow2_open() */
3378    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
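    /*
     * Illustrative sizes, assuming 64 KiB clusters (cluster_bits == 16):
     * refcount_order == 4 (16-bit entries) gives 1 << 15 == 32768 entries per
     * refblock, while refcount_order == 0 (1-bit entries) gives
     * 1 << 19 == 524288 entries, since each byte then holds eight refcounts.
     */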
3379
3380    new_get_refcount = get_refcount_funcs[refcount_order];
3381    new_set_refcount = set_refcount_funcs[refcount_order];
3382
3383
3384    do {
3385        int total_walks;
3386
3387        new_allocation = false;
3388
3389        /* We always need this walk plus the one that writes the refblocks;
3390         * furthermore, this loop normally runs at least twice: first to do
3391         * the allocations, and then again to verify that everything has
3392         * been allocated correctly, which makes for at least three walks in
3393         * total */
3394        total_walks = MAX(walk_index + 2, 3);
3395
3396        /* First, allocate the new structures so that their clusters are
3397         * themselves accounted for in the refcount structures */
3398        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3399                                 &new_reftable_size, NULL, new_refblock_size,
3400                                 new_refcount_bits, &alloc_refblock,
3401                                 &new_allocation, NULL, status_cb, cb_opaque,
3402                                 walk_index++, total_walks, errp);
3403        if (ret < 0) {
3404            goto done;
3405        }
3406
3407        new_reftable_index = 0;
3408
3409        if (new_allocation) {
3410            if (new_reftable_offset) {
3411                qcow2_free_clusters(
3412                    bs, new_reftable_offset,
3413                    allocated_reftable_size * REFTABLE_ENTRY_SIZE,
3414                    QCOW2_DISCARD_NEVER);
3415            }
3416
3417            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
3418                                                           REFTABLE_ENTRY_SIZE);
3419            if (new_reftable_offset < 0) {
3420                error_setg_errno(errp, -new_reftable_offset,
3421                                 "Failed to allocate the new reftable");
3422                ret = new_reftable_offset;
3423                goto done;
3424            }
3425            allocated_reftable_size = new_reftable_size;
3426        }
3427    } while (new_allocation);
3428
3429    /* Second, write the new refblocks */
3430    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3431                             &new_reftable_size, new_refblock,
3432                             new_refblock_size, new_refcount_bits,
3433                             &flush_refblock, &new_allocation, new_set_refcount,
3434                             status_cb, cb_opaque, walk_index, walk_index + 1,
3435                             errp);
3436    if (ret < 0) {
3437        goto done;
3438    }
3439    assert(!new_allocation);
3440
3441
3442    /* Write the new reftable */
3443    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
3444                                        new_reftable_size * REFTABLE_ENTRY_SIZE,
3445                                        false);
3446    if (ret < 0) {
3447        error_setg_errno(errp, -ret, "Overlap check failed");
3448        goto done;
3449    }
3450
3451    for (i = 0; i < new_reftable_size; i++) {
3452        cpu_to_be64s(&new_reftable[i]);
3453    }
3454
3455    ret = bdrv_pwrite(bs->file, new_reftable_offset,
3456                      new_reftable_size * REFTABLE_ENTRY_SIZE, new_reftable,
3457                      0);
3458
3459    for (i = 0; i < new_reftable_size; i++) {
3460        be64_to_cpus(&new_reftable[i]);
3461    }
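    /* The reftable is big-endian on disk but kept in native byte order in
     * memory; new_reftable is converted back because it is still used below
     * and, on success, installed as s->refcount_table. */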
3462
3463    if (ret < 0) {
3464        error_setg_errno(errp, -ret, "Failed to write the new reftable");
3465        goto done;
3466    }
3467
3468
3469    /* Flush the refcount block cache */
3470    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
3471    if (ret < 0) {
3472        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
3473        goto done;
3474    }
3475
3476    /* Update the image header to point to the new reftable; this only updates
3477     * the fields which are relevant to qcow2_update_header(); other fields
3478     * such as s->refcount_table or s->refcount_bits stay stale for now
3479     * (because we have to restore everything if qcow2_update_header() fails) */
3480    old_refcount_order  = s->refcount_order;
3481    old_reftable_size   = s->refcount_table_size;
3482    old_reftable_offset = s->refcount_table_offset;
3483
3484    s->refcount_order        = refcount_order;
3485    s->refcount_table_size   = new_reftable_size;
3486    s->refcount_table_offset = new_reftable_offset;
3487
3488    ret = qcow2_update_header(bs);
3489    if (ret < 0) {
3490        s->refcount_order        = old_refcount_order;
3491        s->refcount_table_size   = old_reftable_size;
3492        s->refcount_table_offset = old_reftable_offset;
3493        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
3494        goto done;
3495    }
3496
3497    /* Now update the rest of the in-memory information */
3498    old_reftable = s->refcount_table;
3499    s->refcount_table = new_reftable;
3500    update_max_refcount_table_index(s);
3501
3502    s->refcount_bits = 1 << refcount_order;
3503    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
3504    s->refcount_max += s->refcount_max - 1;
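    /* This computes 2^refcount_bits - 1 in two steps so that the shift count
     * never reaches 64, which would be undefined behaviour when
     * refcount_bits == 64; e.g. for 16-bit refcounts:
     * 0x8000 + (0x8000 - 1) == 0xffff. */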
3505
3506    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
3507    s->refcount_block_size = 1 << s->refcount_block_bits;
3508
3509    s->get_refcount = new_get_refcount;
3510    s->set_refcount = new_set_refcount;
3511
3512    /* For cleaning up all old refblocks and the old reftable below the "done"
3513     * label */
3514    new_reftable        = old_reftable;
3515    new_reftable_size   = old_reftable_size;
3516    new_reftable_offset = old_reftable_offset;
3517
3518done:
3519    if (new_reftable) {
3520        /* On success, new_reftable actually points to the old reftable (and
3521         * new_reftable_size is the old reftable's size); but that is just
3522         * fine */
3523        for (i = 0; i < new_reftable_size; i++) {
3524            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
3525            if (offset) {
3526                qcow2_free_clusters(bs, offset, s->cluster_size,
3527                                    QCOW2_DISCARD_OTHER);
3528            }
3529        }
3530        g_free(new_reftable);
3531
3532        if (new_reftable_offset > 0) {
3533            qcow2_free_clusters(bs, new_reftable_offset,
3534                                new_reftable_size * REFTABLE_ENTRY_SIZE,
3535                                QCOW2_DISCARD_OTHER);
3536        }
3537    }
3538
3539    qemu_vfree(new_refblock);
3540    return ret;
3541}
3542
3543static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
3544{
3545    BDRVQcow2State *s = bs->opaque;
3546    uint32_t index = offset_to_reftable_index(s, offset);
3547    int64_t covering_refblock_offset = 0;
3548
3549    if (index < s->refcount_table_size) {
3550        covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3551    }
3552    if (!covering_refblock_offset) {
3553        qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3554                                "not covered by the refcount structures",
3555                                offset);
3556        return -EIO;
3557    }
3558
3559    return covering_refblock_offset;
3560}
3561
3562static int coroutine_fn
3563qcow2_discard_refcount_block(BlockDriverState *bs, uint64_t discard_block_offs)
3564{
3565    BDRVQcow2State *s = bs->opaque;
3566    int64_t refblock_offs;
3567    uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
3568    uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
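    /*
     * Illustrative decode, assuming 64 KiB clusters and 16-bit refcounts
     * (refcount_block_size == 32768): discard_block_offs == 0x100050000 gives
     * cluster_index == 0x10005 and block_index == 5, i.e. entry 5 of the
     * refblock referenced by reftable index 2.
     */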
3569    void *refblock;
3570    int ret;
3571
3572    refblock_offs = get_refblock_offset(bs, discard_block_offs);
3573    if (refblock_offs < 0) {
3574        return refblock_offs;
3575    }
3576
3577    assert(discard_block_offs != 0);
3578
3579    ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3580                          &refblock);
3581    if (ret < 0) {
3582        return ret;
3583    }
3584
3585    if (s->get_refcount(refblock, block_index) != 1) {
3586        qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
3587                                " refblock offset %#" PRIx64
3588                                ", reftable index %u"
3589                                ", block offset %#" PRIx64
3590                                ", refcount %#" PRIx64,
3591                                refblock_offs,
3592                                offset_to_reftable_index(s, discard_block_offs),
3593                                discard_block_offs,
3594                                s->get_refcount(refblock, block_index));
3595        qcow2_cache_put(s->refcount_block_cache, &refblock);
3596        return -EINVAL;
3597    }
3598    s->set_refcount(refblock, block_index, 0);
3599
3600    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
3601
3602    qcow2_cache_put(s->refcount_block_cache, &refblock);
3603
3604    if (cluster_index < s->free_cluster_index) {
3605        s->free_cluster_index = cluster_index;
3606    }
3607
3608    refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
3609                                           discard_block_offs);
3610    if (refblock) {
3611        /* discard the refblock from the cache if it is currently cached */
3612        qcow2_cache_discard(s->refcount_block_cache, refblock);
3613    }
3614    update_refcount_discard(bs, discard_block_offs, s->cluster_size);
3615
3616    return 0;
3617}
3618
3619int coroutine_fn qcow2_shrink_reftable(BlockDriverState *bs)
3620{
3621    BDRVQcow2State *s = bs->opaque;
3622    uint64_t *reftable_tmp =
3623        g_malloc(s->refcount_table_size * REFTABLE_ENTRY_SIZE);
3624    int i, ret;
3625
3626    for (i = 0; i < s->refcount_table_size; i++) {
3627        int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
3628        void *refblock;
3629        bool unused_block;
3630
3631        if (refblock_offs == 0) {
3632            reftable_tmp[i] = 0;
3633            continue;
3634        }
3635        ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3636                              &refblock);
3637        if (ret < 0) {
3638            goto out;
3639        }
3640
3641        /* the refblock covers its own cluster, i.e. it references itself */
3642        if (i == offset_to_reftable_index(s, refblock_offs)) {
3643            uint64_t block_index = (refblock_offs >> s->cluster_bits) &
3644                                   (s->refcount_block_size - 1);
3645            uint64_t refcount = s->get_refcount(refblock, block_index);
3646
3647            s->set_refcount(refblock, block_index, 0);
3648
3649            unused_block = buffer_is_zero(refblock, s->cluster_size);
3650
3651            s->set_refcount(refblock, block_index, refcount);
3652        } else {
3653            unused_block = buffer_is_zero(refblock, s->cluster_size);
3654        }
3655        qcow2_cache_put(s->refcount_block_cache, &refblock);
3656
3657        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
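        /* reftable_tmp is written to disk verbatim below, hence the entries
         * are big-endian; a zero entry drops the now-unused refblock. */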
3658    }
3659
3660    ret = bdrv_co_pwrite_sync(bs->file, s->refcount_table_offset,
3661                              s->refcount_table_size * REFTABLE_ENTRY_SIZE,
3662                              reftable_tmp, 0);
3663    /*
3664     * If the write to the reftable failed, the image may contain a partially
3665     * overwritten reftable. In that case it is better to also clear the
3666     * in-memory reftable to avoid further image corruption.
3667     */
3668    for (i = 0; i < s->refcount_table_size; i++) {
3669        if (s->refcount_table[i] && !reftable_tmp[i]) {
3670            if (ret == 0) {
3671                ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
3672                                                       REFT_OFFSET_MASK);
3673            }
3674            s->refcount_table[i] = 0;
3675        }
3676    }
3677
3678    if (!s->cache_discards) {
3679        qcow2_process_discards(bs, ret);
3680    }
3681
3682out:
3683    g_free(reftable_tmp);
3684    return ret;
3685}
3686
3687int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3688{
3689    BDRVQcow2State *s = bs->opaque;
3690    int64_t i;
3691
3692    for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3693        uint64_t refcount;
3694        int ret = qcow2_get_refcount(bs, i, &refcount);
3695        if (ret < 0) {
3696            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3697                    i, strerror(-ret));
3698            return ret;
3699        }
3700        if (refcount > 0) {
3701            return i;
3702        }
3703    }
3704    qcow2_signal_corruption(bs, true, -1, -1,
3705                            "There are no references in the refcount table.");
3706    return -EIO;
3707}
3708
3709int coroutine_fn qcow2_detect_metadata_preallocation(BlockDriverState *bs)
3710{
3711    BDRVQcow2State *s = bs->opaque;
3712    int64_t i, end_cluster, cluster_count = 0, threshold;
3713    int64_t file_length, real_allocation, real_clusters;
3714
3715    qemu_co_mutex_assert_locked(&s->lock);
3716
3717    file_length = bdrv_getlength(bs->file->bs);
3718    if (file_length < 0) {
3719        return file_length;
3720    }
3721
3722    real_allocation = bdrv_get_allocated_file_size(bs->file->bs);
3723    if (real_allocation < 0) {
3724        return real_allocation;
3725    }
3726
3727    real_clusters = real_allocation / s->cluster_size;
3728    threshold = MAX(real_clusters * 10 / 9, real_clusters + 2);
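    /*
     * Illustrative example: if the file actually occupies 90 MiB in 64 KiB
     * clusters, real_clusters == 1440 and threshold == MAX(1600, 1442) ==
     * 1600; preallocation is assumed if at least 1600 clusters are
     * referenced, i.e. if roughly 10% more clusters are in use than are
     * backed by allocated file space.
     */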
3729
3730    end_cluster = size_to_clusters(s, file_length);
3731    for (i = 0; i < end_cluster && cluster_count < threshold; i++) {
3732        uint64_t refcount;
3733        int ret = qcow2_get_refcount(bs, i, &refcount);
3734        if (ret < 0) {
3735            return ret;
3736        }
3737        cluster_count += !!refcount;
3738    }
3739
3740    return cluster_count >= threshold;
3741}
3742