qemu/block/qcow2-refcount.c
/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qcow2.h"
#include "qemu/range.h"
#include "qemu/bswap.h"
#include "qemu/cutils.h"
#include "qemu/memalign.h"
#include "trace.h"

static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                    uint64_t max);

G_GNUC_WARN_UNUSED_RESULT
static int update_refcount(BlockDriverState *bs,
                           int64_t offset, int64_t length, uint64_t addend,
                           bool decrease, enum qcow2_discard_type type);

static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value);

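/*
 * The *_refcount_roN accessors implement refcount arrays for refcount_order
 * N, i.e. entries of 2^N bits each: ro0 handles 1-bit refcounts, ro4 the
 * default 16-bit ones, ro6 64-bit ones. Entries of 16 bits and more are
 * stored big-endian, matching the on-disk format.
 */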

static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};

static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};


/*********************************************************/
/* refcount handling */

static void update_max_refcount_table_index(BDRVQcow2State *s)
{
    unsigned i = s->refcount_table_size - 1;
    while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
        i--;
    }
    /* Set s->max_refcount_table_index to the index of the last used entry */
    s->max_refcount_table_index = i;
}

int qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    assert(s->refcount_order >= 0 && s->refcount_order <= 6);

    s->get_refcount = get_refcount_funcs[s->refcount_order];
    s->set_refcount = set_refcount_funcs[s->refcount_order];

    assert(s->refcount_table_size <= INT_MAX / REFTABLE_ENTRY_SIZE);
    refcount_table_size2 = s->refcount_table_size * REFTABLE_ENTRY_SIZE;
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_pread(bs->file, s->refcount_table_offset,
                         s->refcount_table, refcount_table_size2);
        if (ret < 0) {
            goto fail;
        }
        for (i = 0; i < s->refcount_table_size; i++) {
            be64_to_cpus(&s->refcount_table[i]);
        }
        update_max_refcount_table_index(s);
    }
    return 0;
 fail:
    return ret;
}

void qcow2_refcount_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    g_free(s->refcount_table);
}


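/* The sub-byte variants below pack entries LSB-first within each byte: e.g.
 * with 2-bit entries (ro1), index 5 lives in byte 1, bits 2..3. */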
static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
}

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 1));
    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
}

static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
           & 0x3;
}

static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 2));
    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
}

static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
           & 0xf;
}

static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 4));
    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
}

static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    return ((const uint8_t *)refcount_array)[index];
}

static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 8));
    ((uint8_t *)refcount_array)[index] = value;
}

static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
}

static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 16));
    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
}

static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
}

static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 32));
    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
}

static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
}

static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
}


static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset,
                               void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                           refcount_block);
}

/*
 * Retrieves the refcount of the cluster given by its index and stores it in
 * *refcount. Returns 0 on success and -errno on failure.
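 *
 * Callers translate a host offset to a cluster index first, e.g.:
 *     qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);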
 */
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
                       uint64_t *refcount)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    void *refcount_block;

    refcount_table_index = cluster_index >> s->refcount_block_bits;
    if (refcount_table_index >= s->refcount_table_size) {
        *refcount = 0;
        return 0;
    }
    refcount_block_offset =
        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    if (!refcount_block_offset) {
        *refcount = 0;
        return 0;
    }

    if (offset_into_cluster(s, refcount_block_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
                                " unaligned (reftable index: %#" PRIx64 ")",
                                refcount_block_offset, refcount_table_index);
        return -EIO;
    }

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                          &refcount_block);
    if (ret < 0) {
        return ret;
    }

    block_index = cluster_index & (s->refcount_block_size - 1);
    *refcount = s->get_refcount(refcount_block, block_index);

    qcow2_cache_put(s->refcount_block_cache, &refcount_block);

    return 0;
}

/* Checks if two offsets are described by the same refcount block */
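/* (E.g., with 64 KiB clusters and 16-bit refcount entries, one refcount block
 * holds 2^15 entries and thus describes 2 GiB of the image file.) */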
static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
    uint64_t offset_b)
{
    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);

    return (block_a == block_b);
}

/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int64_t ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

            return load_refcount_block(bs, refcount_block_offset,
                                       refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe themselves.
     *
     * - We need to consider that at this point we are inside update_refcount
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX);
    if (new_block < 0) {
        return new_block;
    }

    /* The offset must fit in the offset field of the refcount table entry */
    assert((new_block & REFT_OFFSET_MASK) == new_block);

    /* If we're allocating the block at offset 0 then something is wrong */
    if (new_block == 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                "allocation of refcount block at offset 0");
        return -EIO;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
                               refcount_table_index * REFTABLE_ENTRY_SIZE,
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail;
        }

        s->refcount_table[refcount_table_index] = new_block;
        /* If there's a hole in s->refcount_table then it can happen
         * that refcount_table_index < s->max_refcount_table_index */
        s->max_refcount_table_index =
            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
        s->cluster_size;

    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
                              refcount_table_index, new_block);
    if (ret < 0) {
        return ret;
    }

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail:
    if (*refcount_block != NULL) {
        qcow2_cache_put(s->refcount_block_cache, refcount_block);
    }
    return ret;
}

/*
 * Starting at @start_offset, this function creates new self-covering refcount
 * structures: A new refcount table and refcount blocks which cover all of
 * themselves, and a number of @additional_clusters beyond their end.
 * @start_offset must be at the end of the image file, that is, there must be
 * only empty space beyond it.
 * If @exact_size is false, the refcount table will have 50% more entries than
 * necessary so it will not need to grow again soon.
 * If @new_refblock_offset is not zero, it contains the offset of a refcount
 * block that should be entered into the new refcount table at index
 * @new_refblock_index.
 *
 * Returns: The offset after the new refcount structures (i.e. where the
 *          @additional_clusters may be placed) on success, -errno on error.
 */
int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                            uint64_t additional_clusters, bool exact_size,
                            int new_refblock_index,
                            uint64_t new_refblock_offset)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t total_refblock_count_u64, additional_refblock_count;
    int total_refblock_count, table_size, area_reftable_index, table_clusters;
    int i;
    uint64_t table_offset, block_offset, end_offset;
    int ret;
    uint64_t *new_table;

    assert(!(start_offset % s->cluster_size));

    qcow2_refcount_metadata_size(start_offset / s->cluster_size +
                                 additional_clusters,
                                 s->cluster_size, s->refcount_order,
                                 !exact_size, &total_refblock_count_u64);
    if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }
    total_refblock_count = total_refblock_count_u64;

    /* Index in the refcount table of the first refcount block to cover the area
     * of refcount structures we are about to create; we know that
     * @total_refblock_count can cover @start_offset, so this will definitely
     * fit into an int. */
    area_reftable_index = (start_offset / s->cluster_size) /
                          s->refcount_block_size;

    if (exact_size) {
        table_size = total_refblock_count;
    } else {
        table_size = total_refblock_count +
                     DIV_ROUND_UP(total_refblock_count, 2);
    }
    /* The qcow2 file can only store the reftable size in number of clusters */
    table_size = ROUND_UP(table_size, s->cluster_size / REFTABLE_ENTRY_SIZE);
    table_clusters = (table_size * REFTABLE_ENTRY_SIZE) / s->cluster_size;

    if (table_size > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }

    new_table = g_try_new0(uint64_t, table_size);

    assert(table_size > 0);
    if (new_table == NULL) {
        ret = -ENOMEM;
        goto fail;
    }

    /* Fill the new refcount table */
    if (table_size > s->max_refcount_table_index) {
        /* We're actually growing the reftable */
        memcpy(new_table, s->refcount_table,
               (s->max_refcount_table_index + 1) * REFTABLE_ENTRY_SIZE);
    } else {
        /* Improbable case: We're shrinking the reftable. However, the caller
         * has assured us that there is only empty space beyond @start_offset,
         * so we can simply drop all of the refblocks that won't fit into the
         * new reftable. */
        memcpy(new_table, s->refcount_table, table_size * REFTABLE_ENTRY_SIZE);
    }

    if (new_refblock_offset) {
        assert(new_refblock_index < total_refblock_count);
        new_table[new_refblock_index] = new_refblock_offset;
    }

    /* Count how many new refblocks we have to create */
    additional_refblock_count = 0;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        if (!new_table[i]) {
            additional_refblock_count++;
        }
    }

    table_offset = start_offset + additional_refblock_count * s->cluster_size;
    end_offset = table_offset + table_clusters * s->cluster_size;

    /* Fill the refcount blocks, and create new ones, if necessary */
    block_offset = start_offset;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        void *refblock_data;
        uint64_t first_offset_covered;

        /* Reuse an existing refblock if possible, create a new one otherwise */
        if (new_table[i]) {
            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
                                  &refblock_data);
            if (ret < 0) {
                goto fail;
            }
        } else {
            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
                                        block_offset, &refblock_data);
            if (ret < 0) {
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
            block_offset += s->cluster_size;
        }

        /* First host offset covered by this refblock */
        first_offset_covered = (uint64_t)i * s->refcount_block_size *
                               s->cluster_size;
        if (first_offset_covered < end_offset) {
            int j, end_index;

            /* Set the refcount of all of the new refcount structures to 1 */

            if (first_offset_covered < start_offset) {
                assert(i == area_reftable_index);
                j = (start_offset - first_offset_covered) / s->cluster_size;
                assert(j < s->refcount_block_size);
            } else {
                j = 0;
            }

            end_index = MIN((end_offset - first_offset_covered) /
                            s->cluster_size,
                            s->refcount_block_size);

            for (; j < end_index; j++) {
                /* The caller guaranteed us this space would be empty */
                assert(s->get_refcount(refblock_data, j) == 0);
                s->set_refcount(refblock_data, j, 1);
            }

            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);
        }

        qcow2_cache_put(s->refcount_block_cache, &refblock_data);
    }

    assert(block_offset == table_offset);

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Write refcount table to disk */
    for (i = 0; i < total_refblock_count; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
        table_size * REFTABLE_ENTRY_SIZE);
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < total_refblock_count; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset,
                        old_table_size * REFTABLE_ENTRY_SIZE,
                        QCOW2_DISCARD_OTHER);

    return end_offset;

fail:
    g_free(new_table);
    return ret;
}

void qcow2_process_discards(BlockDriverState *bs, int ret)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *next;

    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
        QTAILQ_REMOVE(&s->discards, d, next);

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
            int r2 = bdrv_pdiscard(bs->file, d->offset, d->bytes);
            if (r2 < 0) {
                trace_qcow2_process_discards_failed_region(d->offset, d->bytes,
                                                           r2);
            }
        }

        g_free(d);
    }
}

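/*
 * Queues [offset, offset + length) for discarding, merging it into an
 * already-queued region when the two are adjacent. E.g., with {0x10000,
 * 0x10000} queued, a new request at 0x20000 of length 0x10000 grows that
 * region to {0x10000, 0x20000}.
 */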
static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs     = bs,
        .offset = offset,
        .bytes  = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}

/* XXX: cache several refcount block clusters ? */
/* @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added */
static int update_refcount(BlockDriverState *bs,
                           int64_t offset,
                           int64_t length,
                           uint64_t addend,
                           bool decrease,
                           enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t start, last, cluster_offset;
    void *refcount_block = NULL;
    int64_t old_table_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
            addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    if (decrease) {
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
            s->l2_table_cache);
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
    for (cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size)
    {
        int block_index;
        uint64_t refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t table_index = cluster_index >> s->refcount_block_bits;

        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            if (refcount_block) {
                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
            }
            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            /* If the caller needs to restart the search for free clusters,
             * try the same ones first to see if they're still free. */
            if (ret == -EAGAIN) {
                if (s->free_cluster_index > (start >> s->cluster_bits)) {
                    s->free_cluster_index = (start >> s->cluster_bits);
                }
            }
            if (ret < 0) {
                goto fail;
            }
        }
        old_table_index = table_index;

        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index & (s->refcount_block_size - 1);

        refcount = s->get_refcount(refcount_block, block_index);
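        /* Reject unsigned wraparound: underflow when decreasing, overflow
         * (or exceeding the format's refcount_max) when increasing */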
        if (decrease ? (refcount - addend > refcount)
                     : (refcount + addend < refcount ||
                        refcount + addend > s->refcount_max))
        {
            ret = -EINVAL;
            goto fail;
        }
        if (decrease) {
            refcount -= addend;
        } else {
            refcount += addend;
        }
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->set_refcount(refcount_block, block_index, refcount);

        if (refcount == 0) {
            void *table;

            table = qcow2_cache_is_table_offset(s->refcount_block_cache,
                                                offset);
            if (table != NULL) {
                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
                old_table_index = -1;
                qcow2_cache_discard(s->refcount_block_cache, table);
            }

            table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
            if (table != NULL) {
                qcow2_cache_discard(s->l2_table_cache, table);
            }

            if (s->discard_passthrough[type]) {
                update_refcount_discard(bs, cluster_offset, s->cluster_size);
            }
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        qcow2_cache_put(s->refcount_block_cache, &refcount_block);
    }

    /*
     * Try to undo any updates if an error is returned (this may succeed in
     * some cases, like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
                                !decrease, QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}

/*
 * Increases or decreases the refcount of a given cluster.
 *
 * @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added.
 *
 * On success 0 is returned; on failure -errno is returned.
 */
int qcow2_update_cluster_refcount(BlockDriverState *bs,
                                  int64_t cluster_index,
                                  uint64_t addend, bool decrease,
                                  enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
                          decrease, type);
    if (ret < 0) {
        return ret;
    }

    return 0;
}



/*********************************************************/
/* cluster allocation functions */



/* Returns the host offset of the first allocated cluster, or -errno on error */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                    uint64_t max)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t i, nb_clusters, refcount;
    int ret;

    /* We can't allocate clusters if they may still be queued for discard. */
    if (s->cache_discards) {
        qcow2_process_discards(bs, 0);
    }

    nb_clusters = size_to_clusters(s, size);
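    /* Scan forward from free_cluster_index until nb_clusters consecutive
     * free clusters are found; any in-use cluster restarts the run */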
retry:
    for (i = 0; i < nb_clusters; i++) {
        uint64_t next_cluster_index = s->free_cluster_index++;
        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);

        if (ret < 0) {
            return ret;
        } else if (refcount != 0) {
            goto retry;
        }
    }

    /* Make sure that all offsets in the "allocated" range are representable
     * in the requested max */
    if (s->free_cluster_index > 0 &&
        s->free_cluster_index - 1 > (max >> s->cluster_bits))
    {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
            size,
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
#endif
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
}

int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
{
    int64_t offset;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
    do {
        offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET);
        if (offset < 0) {
            return offset;
        }

        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return offset;
}

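/*
 * Allocates up to @nb_clusters contiguous clusters starting at the cluster
 * containing @offset. Returns the number of contiguous free clusters that
 * were actually allocated (which may be fewer than @nb_clusters if an
 * already-used cluster is encountered first), or -errno on failure.
 */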
int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
                                int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_index, refcount;
    uint64_t i;
    int ret;

    assert(nb_clusters >= 0);
    if (nb_clusters == 0) {
        return 0;
    }

    do {
        /* Check how many clusters there are free */
        cluster_index = offset >> s->cluster_bits;
        for (i = 0; i < nb_clusters; i++) {
            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
            if (ret < 0) {
                return ret;
            } else if (refcount != 0) {
                break;
            }
        }

        /* And then allocate them */
        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
                              QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return i;
}

/* Only used to allocate bytes for compressed data; we try to allocate
 * contiguous bytes. @size must be <= cluster_size */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster;

            new_cluster = alloc_clusters_noref(bs, s->cluster_size,
                                               MIN(s->cluster_offset_mask,
                                                   QCOW_MAX_CLUSTER_OFFSET));
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (new_cluster == 0) {
                qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                        "allocation of compressed cluster "
                                        "at offset 0");
                return -EIO;
            }

            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}
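/* Note: each sub-cluster allocation above increments the refcount of every
 * cluster it touches by one, so a cluster shared by n compressed allocations
 * has refcount n; this is why reuse is refused once refcount_max is hit. */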

void qcow2_free_clusters(BlockDriverState *bs,
                         int64_t offset, int64_t size,
                         enum qcow2_discard_type type)
{
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
    ret = update_refcount(bs, offset, size, 1, true, type);
    if (ret < 0) {
        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
        /* TODO Remember the clusters to free them later and avoid leaking */
    }
}

/*
 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
 * normal cluster, compressed cluster, etc.)
 */
void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
                            enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    QCow2ClusterType ctype = qcow2_get_cluster_type(bs, l2_entry);

    if (has_data_file(bs)) {
        if (s->discard_passthrough[type] &&
            (ctype == QCOW2_CLUSTER_NORMAL ||
             ctype == QCOW2_CLUSTER_ZERO_ALLOC))
        {
            bdrv_pdiscard(s->data_file, l2_entry & L2E_OFFSET_MASK,
                          s->cluster_size);
        }
        return;
    }

    switch (ctype) {
    case QCOW2_CLUSTER_COMPRESSED:
        {
            uint64_t coffset;
            int csize;

            qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
            qcow2_free_clusters(bs, coffset, csize, type);
        }
        break;
    case QCOW2_CLUSTER_NORMAL:
    case QCOW2_CLUSTER_ZERO_ALLOC:
        if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
            qcow2_signal_corruption(bs, false, -1, -1,
                                    "Cannot free unaligned cluster %#llx",
                                    l2_entry & L2E_OFFSET_MASK);
        } else {
            qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
                                s->cluster_size, type);
        }
        break;
    case QCOW2_CLUSTER_ZERO_PLAIN:
    case QCOW2_CLUSTER_UNALLOCATED:
        break;
    default:
        abort();
    }
}

int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    ret = qcow2_cache_write(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    if (qcow2_need_accurate_refcounts(s)) {
        ret = qcow2_cache_write(bs, s->refcount_block_cache);
        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}

int coroutine_fn qcow2_flush_caches(BlockDriverState *bs)
{
    int ret = qcow2_write_caches(bs);
    if (ret < 0) {
        return ret;
    }

    return bdrv_flush(bs->file->bs);
}

/*********************************************************/
/* snapshots and image creation */



/*
 * Updates the refcounts of all clusters referenced by the L1 table at
 * @l1_table_offset by @addend (which must be -1, 0 or 1) and recomputes the
 * QCOW_OFLAG_COPIED flag of every L1 and L2 entry: it is set iff the
 * refcount of the referenced cluster is exactly 1.
 */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_entry, old_l2_offset;
    unsigned slice, slice_size2, n_slices;
    int i, j, l1_modified = 0;
    int ret;

    assert(addend >= -1 && addend <= 1);

    l2_slice = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * L1E_SIZE;
    slice_size2 = s->l2_slice_size * l2_entry_size(s);
    n_slices = s->cluster_size / slice_size2;

    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        l1_table = g_try_malloc0(l1_size2);
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            for (slice = 0; slice < n_slices; slice++) {
                ret = qcow2_cache_get(bs, s->l2_table_cache,
                                      l2_offset + slice * slice_size2,
                                      (void **) &l2_slice);
                if (ret < 0) {
                    goto fail;
                }

                for (j = 0; j < s->l2_slice_size; j++) {
                    uint64_t cluster_index;
                    uint64_t offset;

                    entry = get_l2_entry(s, l2_slice, j);
                    old_entry = entry;
                    entry &= ~QCOW_OFLAG_COPIED;
                    offset = entry & L2E_OFFSET_MASK;

                    switch (qcow2_get_cluster_type(bs, entry)) {
                    case QCOW2_CLUSTER_COMPRESSED:
                        if (addend != 0) {
                            uint64_t coffset;
                            int csize;

                            qcow2_parse_compressed_l2_entry(bs, entry,
                                                            &coffset, &csize);
                            ret = update_refcount(
                                bs, coffset, csize,
                                abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                        break;

                    case QCOW2_CLUSTER_NORMAL:
                    case QCOW2_CLUSTER_ZERO_ALLOC:
                        if (offset_into_cluster(s, offset)) {
                            /* Here l2_index means table (not slice) index */
                            int l2_index = slice * s->l2_slice_size + j;
                            qcow2_signal_corruption(
                                bs, true, -1, -1, "Cluster "
                                "allocation offset %#" PRIx64
                                " unaligned (L2 offset: %#"
                                PRIx64 ", L2 index: %#x)",
                                offset, l2_offset, l2_index);
                            ret = -EIO;
                            goto fail;
                        }

                        cluster_index = offset >> s->cluster_bits;
                        assert(cluster_index);
                        if (addend != 0) {
                            ret = qcow2_update_cluster_refcount(
                                bs, cluster_index, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }

                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                        if (ret < 0) {
                            goto fail;
                        }
                        break;

                    case QCOW2_CLUSTER_ZERO_PLAIN:
                    case QCOW2_CLUSTER_UNALLOCATED:
                        refcount = 0;
                        break;

                    default:
                        abort();
                    }

                    if (refcount == 1) {
                        entry |= QCOW_OFLAG_COPIED;
                    }
                    if (entry != old_entry) {
                        if (addend > 0) {
                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                                       s->refcount_block_cache);
                        }
                        set_l2_entry(s, l2_slice, j, entry);
                        qcow2_cache_entry_mark_dirty(s->l2_table_cache,
                                                     l2_slice);
                    }
                }

                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
            }

            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_slice) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }

    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
                               l1_table, l1_size2);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated) {
        g_free(l1_table);
    }
    return ret;
}




/*********************************************************/
/* refcount checking functions */


static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
{
    /* This assertion holds because there is no way we can address more than
     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
     * offsets have to be representable in bytes); due to every cluster
     * corresponding to one refcount entry, we are well below that limit */
    assert(entries < (UINT64_C(1) << (64 - 9)));

    /* Thanks to the assertion this will not overflow, because
     * s->refcount_order < 7.
     * (note: x << s->refcount_order == x * s->refcount_bits) */
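    /* For example, with the default refcount_order of 4 (16-bit entries),
     * 1000 entries need DIV_ROUND_UP(1000 << 4, 8) = 2000 bytes. */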
1468    return DIV_ROUND_UP(entries << s->refcount_order, 8);
1469}
1470
1471/**
1472 * Reallocates *array so that it can hold new_size entries. *size must contain
1473 * the current number of entries in *array. If the reallocation fails, *array
1474 * and *size will not be modified and -errno will be returned. If the
1475 * reallocation is successful, *array will be set to the new buffer, *size
1476 * will be set to new_size and 0 will be returned. The size of the reallocated
1477 * refcount array buffer will be aligned to a cluster boundary, and the newly
1478 * allocated area will be zeroed.
1479 */
1480static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1481                                  int64_t *size, int64_t new_size)
1482{
1483    int64_t old_byte_size, new_byte_size;
1484    void *new_ptr;
1485
1486    /* Round to clusters so the array can be directly written to disk */
1487    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1488                    * s->cluster_size;
1489    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1490                    * s->cluster_size;
1491
1492    if (new_byte_size == old_byte_size) {
1493        *size = new_size;
1494        return 0;
1495    }
1496
1497    assert(new_byte_size > 0);
1498
1499    if (new_byte_size > SIZE_MAX) {
1500        return -ENOMEM;
1501    }
1502
1503    new_ptr = g_try_realloc(*array, new_byte_size);
1504    if (!new_ptr) {
1505        return -ENOMEM;
1506    }
1507
1508    if (new_byte_size > old_byte_size) {
1509        memset((char *)new_ptr + old_byte_size, 0,
1510               new_byte_size - old_byte_size);
1511    }
1512
1513    *array = new_ptr;
1514    *size  = new_size;
1515
1516    return 0;
1517}
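
/*
 * Usage sketch (illustrative, with hypothetical local variables): growing
 * an initially empty refcount array under the contract documented above:
 *
 *     void *array = NULL;
 *     int64_t size = 0;
 *
 *     int ret = realloc_refcount_array(s, &array, &size, 1024);
 *     if (ret < 0) {
 *         return ret;   // array and size are left untouched on failure
 *     }
 *     // array now holds 1024 zeroed entries in a cluster-aligned buffer
 */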
1518
1519/*
1520 * Increases the refcount for a range of clusters in a given refcount table.
1521 * This is used to construct a temporary refcount table out of L1 and L2 tables
1522 * which can be compared to the refcount table saved in the image.
1523 *
1524 * Modifies the number of errors in res.
1525 */
1526int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
1527                             void **refcount_table,
1528                             int64_t *refcount_table_size,
1529                             int64_t offset, int64_t size)
1530{
1531    BDRVQcow2State *s = bs->opaque;
1532    uint64_t start, last, cluster_offset, k, refcount;
1533    int64_t file_len;
1534    int ret;
1535
1536    if (size <= 0) {
1537        return 0;
1538    }
1539
1540    file_len = bdrv_getlength(bs->file->bs);
1541    if (file_len < 0) {
1542        return file_len;
1543    }
1544
1545    /*
1546     * The last cluster of a qcow2 image may be semi-allocated, so it may be
1547     * OK to reference some space past the end of the file, but that space
1548     * must be less than one cluster.
1549     */
1550    if (offset + size - file_len >= s->cluster_size) {
1551        fprintf(stderr, "ERROR: counting reference for region exceeding the "
1552                "end of the file by one cluster or more: offset 0x%" PRIx64
1553                " size 0x%" PRIx64 "\n", offset, size);
1554        res->corruptions++;
1555        return 0;
1556    }
1557
1558    start = start_of_cluster(s, offset);
1559    last = start_of_cluster(s, offset + size - 1);
1560    for (cluster_offset = start; cluster_offset <= last;
1561        cluster_offset += s->cluster_size) {
1562        k = cluster_offset >> s->cluster_bits;
1563        if (k >= *refcount_table_size) {
1564            ret = realloc_refcount_array(s, refcount_table,
1565                                         refcount_table_size, k + 1);
1566            if (ret < 0) {
1567                res->check_errors++;
1568                return ret;
1569            }
1570        }
1571
1572        refcount = s->get_refcount(*refcount_table, k);
1573        if (refcount == s->refcount_max) {
1574            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1575                    "\n", cluster_offset);
1576            fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1577                    "width or qemu-img convert to create a clean copy if the "
1578                    "image cannot be opened for writing\n");
1579            res->corruptions++;
1580            continue;
1581        }
1582        s->set_refcount(*refcount_table, k, refcount + 1);
1583    }
1584
1585    return 0;
1586}
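
/*
 * Example invocation (mirroring the header accounting done in
 * calculate_refcounts() below): marking the first cluster of the image,
 * which holds the qcow2 header, as referenced once in the IMRT:
 *
 *     ret = qcow2_inc_refcounts_imrt(bs, res, &refcount_table,
 *                                    &refcount_table_size,
 *                                    0, s->cluster_size);
 *
 * Every cluster touched by the byte range [offset, offset + size) has its
 * IMRT entry incremented by one, growing the array on demand.
 */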
1587
1588/* Flags for check_refcounts_l1() and check_refcounts_l2() */
1589enum {
1590    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1591};
1592
1593/*
1594 * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN (or making all its present
1595 * subclusters QCOW2_SUBCLUSTER_ZERO_PLAIN).
1596 *
1597 * This function decrements res->corruptions on success, so the caller is
1598 * responsible for incrementing res->corruptions prior to the call.
1599 *
1600 * On failure in-memory @l2_table may be modified.
1601 */
1602static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
1603                                uint64_t l2_offset,
1604                                uint64_t *l2_table, int l2_index, bool active,
1605                                bool *metadata_overlap)
1606{
1607    BDRVQcow2State *s = bs->opaque;
1608    int ret;
1609    int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t));
1610    uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s);
1611    int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2;
1612
1613    if (has_subclusters(s)) {
1614        uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, l2_index);
1615
1616        /* Allocated subclusters become zero */
1617        l2_bitmap |= l2_bitmap << 32;
1618        l2_bitmap &= QCOW_L2_BITMAP_ALL_ZEROES;
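
        /*
         * Illustration (relying on the extended L2 layout where bits 0-31
         * form the allocation bitmap and bits 32-63 the zero bitmap): a
         * bitmap of 0x0000000000000005 (subclusters 0 and 2 allocated)
         * becomes 0x0000000500000005 after the shift-or above, and
         * 0x0000000500000000 (subclusters 0 and 2 zero, none allocated)
         * after the mask.
         */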
1619
1620        set_l2_bitmap(s, l2_table, l2_index, l2_bitmap);
1621        set_l2_entry(s, l2_table, l2_index, 0);
1622    } else {
1623        set_l2_entry(s, l2_table, l2_index, QCOW_OFLAG_ZERO);
1624    }
1625
1626    ret = qcow2_pre_write_overlap_check(bs, ign, l2e_offset, l2_entry_size(s),
1627                                        false);
1628    if (metadata_overlap) {
1629        *metadata_overlap = ret < 0;
1630    }
1631    if (ret < 0) {
1632        fprintf(stderr, "ERROR: Overlap check failed\n");
1633        goto fail;
1634    }
1635
1636    ret = bdrv_pwrite_sync(bs->file, l2e_offset, &l2_table[idx],
1637                           l2_entry_size(s));
1638    if (ret < 0) {
1639        fprintf(stderr, "ERROR: Failed to overwrite L2 "
1640                "table entry: %s\n", strerror(-ret));
1641        goto fail;
1642    }
1643
1644    res->corruptions--;
1645    res->corruptions_fixed++;
1646    return 0;
1647
1648fail:
1649    res->check_errors++;
1650    return ret;
1651}
1652
1653/*
1654 * Increases the refcount in the given refcount table for all the clusters
1655 * referenced in the L2 table. While doing so, performs some checks on L2
1656 * entries.
1657 *
1658 * Returns the number of errors found by the checks or -errno if an internal
1659 * error occurred.
1660 */
1661static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1662                              void **refcount_table,
1663                              int64_t *refcount_table_size, int64_t l2_offset,
1664                              int flags, BdrvCheckMode fix, bool active)
1665{
1666    BDRVQcow2State *s = bs->opaque;
1667    uint64_t l2_entry, l2_bitmap;
1668    uint64_t next_contiguous_offset = 0;
1669    int i, ret;
1670    size_t l2_size_bytes = s->l2_size * l2_entry_size(s);
1671    g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes);
1672    bool metadata_overlap;
1673
1674    /* Read L2 table from disk */
1675    ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size_bytes);
1676    if (ret < 0) {
1677        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1678        res->check_errors++;
1679        return ret;
1680    }
1681
1682    /* Do the actual checks */
1683    for (i = 0; i < s->l2_size; i++) {
1684        uint64_t coffset;
1685        int csize;
1686        QCow2ClusterType type;
1687
1688        l2_entry = get_l2_entry(s, l2_table, i);
1689        l2_bitmap = get_l2_bitmap(s, l2_table, i);
1690        type = qcow2_get_cluster_type(bs, l2_entry);
1691
1692        if (type != QCOW2_CLUSTER_COMPRESSED) {
1693            /* Check reserved bits of Standard Cluster Descriptor */
1694            if (l2_entry & L2E_STD_RESERVED_MASK) {
1695                fprintf(stderr, "ERROR found l2 entry with reserved bits set: "
1696                        "%" PRIx64 "\n", l2_entry);
1697                res->corruptions++;
1698            }
1699        }
1700
1701        switch (type) {
1702        case QCOW2_CLUSTER_COMPRESSED:
1703            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1704            if (l2_entry & QCOW_OFLAG_COPIED) {
1705                fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
1706                    "copied flag must never be set for compressed "
1707                    "clusters\n", l2_entry & s->cluster_offset_mask);
1708                l2_entry &= ~QCOW_OFLAG_COPIED;
1709                res->corruptions++;
1710            }
1711
1712            if (has_data_file(bs)) {
1713                fprintf(stderr, "ERROR compressed cluster %d with data file, "
1714                        "entry=0x%" PRIx64 "\n", i, l2_entry);
1715                res->corruptions++;
1716                break;
1717            }
1718
1719            if (l2_bitmap) {
1720                fprintf(stderr, "ERROR compressed cluster %d with non-zero "
1721                        "subcluster allocation bitmap, entry=0x%" PRIx64 "\n",
1722                        i, l2_entry);
1723                res->corruptions++;
1724                break;
1725            }
1726
1727            /* Mark cluster as used */
1728            qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
1729            ret = qcow2_inc_refcounts_imrt(
1730                bs, res, refcount_table, refcount_table_size, coffset, csize);
1731            if (ret < 0) {
1732                return ret;
1733            }
1734
1735            if (flags & CHECK_FRAG_INFO) {
1736                res->bfi.allocated_clusters++;
1737                res->bfi.compressed_clusters++;
1738
1739                /*
1740                 * Compressed clusters are fragmented by nature.  Since they
1741                 * take up sub-sector space but we only have sector granularity
1742                 * I/O we need to re-read the same sectors even for adjacent
1743                 * compressed clusters.
1744                 */
1745                res->bfi.fragmented_clusters++;
1746            }
1747            break;
1748
1749        case QCOW2_CLUSTER_ZERO_ALLOC:
1750        case QCOW2_CLUSTER_NORMAL:
1751        {
1752            uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1753
1754            if ((l2_bitmap >> 32) & l2_bitmap) {
1755                res->corruptions++;
1756                fprintf(stderr, "ERROR offset=%" PRIx64 ": Allocated "
1757                        "cluster has corrupted subcluster allocation bitmap\n",
1758                        offset);
1759            }
1760
1761            /* Correct offsets are cluster aligned */
1762            if (offset_into_cluster(s, offset)) {
1763                bool contains_data;
1764                res->corruptions++;
1765
1766                if (has_subclusters(s)) {
1767                    contains_data = (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC);
1768                } else {
1769                    contains_data = !(l2_entry & QCOW_OFLAG_ZERO);
1770                }
1771
1772                if (!contains_data) {
1773                    fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated "
1774                            "cluster is not properly aligned; L2 entry "
1775                            "corrupted.\n",
1776                            fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
1777                            offset);
1778                    if (fix & BDRV_FIX_ERRORS) {
1779                        ret = fix_l2_entry_by_zero(bs, res, l2_offset,
1780                                                   l2_table, i, active,
1781                                                   &metadata_overlap);
1782                        if (metadata_overlap) {
1783                            /*
1784                             * Something is seriously wrong, so abort checking
1785                             * this L2 table.
1786                             */
1787                            return ret;
1788                        }
1789
1790                        if (ret == 0) {
1791                            /*
1792                             * Skip marking the cluster as used
1793                             * (it is unused now).
1794                             */
1795                            continue;
1796                        }
1797
1798                        /*
1799                         * Failed to fix.
1800                         * Do not abort, continue checking the rest of this
1801                         * L2 table's entries.
1802                         */
1803                    }
1804                } else {
1805                    fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
1806                        "not properly aligned; L2 entry corrupted.\n", offset);
1807                }
1808            }
1809
1810            if (flags & CHECK_FRAG_INFO) {
1811                res->bfi.allocated_clusters++;
1812                if (next_contiguous_offset &&
1813                    offset != next_contiguous_offset) {
1814                    res->bfi.fragmented_clusters++;
1815                }
1816                next_contiguous_offset = offset + s->cluster_size;
1817            }
1818
1819            /* Mark cluster as used */
1820            if (!has_data_file(bs)) {
1821                ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table,
1822                                               refcount_table_size,
1823                                               offset, s->cluster_size);
1824                if (ret < 0) {
1825                    return ret;
1826                }
1827            }
1828            break;
1829        }
1830
1831        case QCOW2_CLUSTER_ZERO_PLAIN:
1832            /* Impossible when image has subclusters */
1833            assert(!l2_bitmap);
1834            break;
1835
1836        case QCOW2_CLUSTER_UNALLOCATED:
1837            if (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC) {
1838                res->corruptions++;
1839                fprintf(stderr, "ERROR: Unallocated "
1840                        "cluster has non-zero subcluster allocation map\n");
1841            }
1842            break;
1843
1844        default:
1845            abort();
1846        }
1847    }
1848
1849    return 0;
1850}
1851
1852/*
1853 * Increases the refcount for the L1 table, its L2 tables and all referenced
1854 * clusters in the given refcount table. While doing so, performs some checks
1855 * on L1 and L2 entries.
1856 *
1857 * Returns the number of errors found by the checks or -errno if an internal
1858 * error occurred.
1859 */
1860static int check_refcounts_l1(BlockDriverState *bs,
1861                              BdrvCheckResult *res,
1862                              void **refcount_table,
1863                              int64_t *refcount_table_size,
1864                              int64_t l1_table_offset, int l1_size,
1865                              int flags, BdrvCheckMode fix, bool active)
1866{
1867    BDRVQcow2State *s = bs->opaque;
1868    size_t l1_size_bytes = l1_size * L1E_SIZE;
1869    g_autofree uint64_t *l1_table = NULL;
1870    uint64_t l2_offset;
1871    int i, ret;
1872
1873    if (!l1_size) {
1874        return 0;
1875    }
1876
1877    /* Mark L1 table as used */
1878    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1879                                   l1_table_offset, l1_size_bytes);
1880    if (ret < 0) {
1881        return ret;
1882    }
1883
1884    l1_table = g_try_malloc(l1_size_bytes);
1885    if (l1_table == NULL) {
1886        res->check_errors++;
1887        return -ENOMEM;
1888    }
1889
1890    /* Read L1 table entries from disk */
1891    ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size_bytes);
1892    if (ret < 0) {
1893        fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1894        res->check_errors++;
1895        return ret;
1896    }
1897
1898    for (i = 0; i < l1_size; i++) {
1899        be64_to_cpus(&l1_table[i]);
1900    }
1901
1902    /* Do the actual checks */
1903    for (i = 0; i < l1_size; i++) {
1904        if (!l1_table[i]) {
1905            continue;
1906        }
1907
1908        if (l1_table[i] & L1E_RESERVED_MASK) {
1909            fprintf(stderr, "ERROR found L1 entry with reserved bits set: "
1910                    "%" PRIx64 "\n", l1_table[i]);
1911            res->corruptions++;
1912        }
1913
1914        l2_offset = l1_table[i] & L1E_OFFSET_MASK;
1915
1916        /* Mark L2 table as used */
1917        ret = qcow2_inc_refcounts_imrt(bs, res,
1918                                       refcount_table, refcount_table_size,
1919                                       l2_offset, s->cluster_size);
1920        if (ret < 0) {
1921            return ret;
1922        }
1923
1924        /* L2 tables are cluster aligned */
1925        if (offset_into_cluster(s, l2_offset)) {
1926            fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1927                "cluster aligned; L1 entry corrupted\n", l2_offset);
1928            res->corruptions++;
1929        }
1930
1931        /* Process and check L2 entries */
1932        ret = check_refcounts_l2(bs, res, refcount_table,
1933                                 refcount_table_size, l2_offset, flags,
1934                                 fix, active);
1935        if (ret < 0) {
1936            return ret;
1937        }
1938    }
1939
1940    return 0;
1941}
1942
1943/*
1944 * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1945 *
1946 * This function does not print an error message nor does it increment
1947 * check_errors if qcow2_get_refcount fails, because such an error will
1948 * already have been detected and sufficiently signaled by the caller
1949 * (qcow2_check_refcounts) by the time this function is called.
1950 */
1951static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1952                              BdrvCheckMode fix)
1953{
1954    BDRVQcow2State *s = bs->opaque;
1955    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1956    int ret;
1957    uint64_t refcount;
1958    int i, j;
1959    bool repair;
1960
1961    if (fix & BDRV_FIX_ERRORS) {
1962        /* Always repair */
1963        repair = true;
1964    } else if (fix & BDRV_FIX_LEAKS) {
1965        /* Repair only if that seems safe: This function is always
1966         * called after the refcounts have been fixed, so the refcount
1967         * is accurate if that repair was successful */
1968        repair = !res->check_errors && !res->corruptions && !res->leaks;
1969    } else {
1970        repair = false;
1971    }
1972
1973    for (i = 0; i < s->l1_size; i++) {
1974        uint64_t l1_entry = s->l1_table[i];
1975        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1976        int l2_dirty = 0;
1977
1978        if (!l2_offset) {
1979            continue;
1980        }
1981
1982        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1983                                 &refcount);
1984        if (ret < 0) {
1985            /* don't print message nor increment check_errors */
1986            continue;
1987        }
1988        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1989            res->corruptions++;
1990            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1991                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1992                    repair ? "Repairing" : "ERROR", i, l1_entry, refcount);
1993            if (repair) {
1994                s->l1_table[i] = refcount == 1
1995                               ? l1_entry |  QCOW_OFLAG_COPIED
1996                               : l1_entry & ~QCOW_OFLAG_COPIED;
1997                ret = qcow2_write_l1_entry(bs, i);
1998                if (ret < 0) {
1999                    res->check_errors++;
2000                    goto fail;
2001                }
2002                res->corruptions--;
2003                res->corruptions_fixed++;
2004            }
2005        }
2006
2007        ret = bdrv_pread(bs->file, l2_offset, l2_table,
2008                         s->l2_size * l2_entry_size(s));
2009        if (ret < 0) {
2010            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
2011                    strerror(-ret));
2012            res->check_errors++;
2013            goto fail;
2014        }
2015
2016        for (j = 0; j < s->l2_size; j++) {
2017            uint64_t l2_entry = get_l2_entry(s, l2_table, j);
2018            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
2019            QCow2ClusterType cluster_type = qcow2_get_cluster_type(bs, l2_entry);
2020
2021            if (cluster_type == QCOW2_CLUSTER_NORMAL ||
2022                cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
2023                if (has_data_file(bs)) {
2024                    refcount = 1;
2025                } else {
2026                    ret = qcow2_get_refcount(bs,
2027                                             data_offset >> s->cluster_bits,
2028                                             &refcount);
2029                    if (ret < 0) {
2030                        /* don't print message nor increment check_errors */
2031                        continue;
2032                    }
2033                }
2034                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
2035                    res->corruptions++;
2036                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
2037                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
2038                            repair ? "Repairing" : "ERROR", l2_entry, refcount);
2039                    if (repair) {
2040                        set_l2_entry(s, l2_table, j,
2041                                     refcount == 1 ?
2042                                     l2_entry |  QCOW_OFLAG_COPIED :
2043                                     l2_entry & ~QCOW_OFLAG_COPIED);
2044                        l2_dirty++;
2045                    }
2046                }
2047            }
2048        }
2049
2050        if (l2_dirty > 0) {
2051            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
2052                                                l2_offset, s->cluster_size,
2053                                                false);
2054            if (ret < 0) {
2055                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
2056                        "overlap check failed: %s\n", strerror(-ret));
2057                res->check_errors++;
2058                goto fail;
2059            }
2060
2061            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
2062                              s->cluster_size);
2063            if (ret < 0) {
2064                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
2065                        strerror(-ret));
2066                res->check_errors++;
2067                goto fail;
2068            }
2069            res->corruptions -= l2_dirty;
2070            res->corruptions_fixed += l2_dirty;
2071        }
2072    }
2073
2074    ret = 0;
2075
2076fail:
2077    qemu_vfree(l2_table);
2078    return ret;
2079}
2080
2081/*
2082 * Checks consistency of refblocks and accounts for each refblock in
2083 * *refcount_table.
2084 */
2085static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
2086                           BdrvCheckMode fix, bool *rebuild,
2087                           void **refcount_table, int64_t *nb_clusters)
2088{
2089    BDRVQcow2State *s = bs->opaque;
2090    int64_t i, size;
2091    int ret;
2092
2093    for (i = 0; i < s->refcount_table_size; i++) {
2094        uint64_t offset, cluster;
2095        offset = s->refcount_table[i] & REFT_OFFSET_MASK;
2096        cluster = offset >> s->cluster_bits;
2097
2098        if (s->refcount_table[i] & REFT_RESERVED_MASK) {
2099            fprintf(stderr, "ERROR refcount table entry %" PRId64 " has "
2100                    "reserved bits set\n", i);
2101            res->corruptions++;
2102            *rebuild = true;
2103            continue;
2104        }
2105
2106        /* Refcount blocks are cluster aligned */
2107        if (offset_into_cluster(s, offset)) {
2108            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
2109                "cluster aligned; refcount table entry corrupted\n", i);
2110            res->corruptions++;
2111            *rebuild = true;
2112            continue;
2113        }
2114
2115        if (cluster >= *nb_clusters) {
2116            res->corruptions++;
2117            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
2118                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
2119
2120            if (fix & BDRV_FIX_ERRORS) {
2121                int64_t new_nb_clusters;
2122                Error *local_err = NULL;
2123
2124                if (offset > INT64_MAX - s->cluster_size) {
2125                    ret = -EINVAL;
2126                    goto resize_fail;
2127                }
2128
2129                ret = bdrv_truncate(bs->file, offset + s->cluster_size, false,
2130                                    PREALLOC_MODE_OFF, 0, &local_err);
2131                if (ret < 0) {
2132                    error_report_err(local_err);
2133                    goto resize_fail;
2134                }
2135                size = bdrv_getlength(bs->file->bs);
2136                if (size < 0) {
2137                    ret = size;
2138                    goto resize_fail;
2139                }
2140
2141                new_nb_clusters = size_to_clusters(s, size);
2142                assert(new_nb_clusters >= *nb_clusters);
2143
2144                ret = realloc_refcount_array(s, refcount_table,
2145                                             nb_clusters, new_nb_clusters);
2146                if (ret < 0) {
2147                    res->check_errors++;
2148                    return ret;
2149                }
2150
2151                if (cluster >= *nb_clusters) {
2152                    ret = -EINVAL;
2153                    goto resize_fail;
2154                }
2155
2156                res->corruptions--;
2157                res->corruptions_fixed++;
2158                ret = qcow2_inc_refcounts_imrt(bs, res,
2159                                               refcount_table, nb_clusters,
2160                                               offset, s->cluster_size);
2161                if (ret < 0) {
2162                    return ret;
2163                }
2164                /* No need to check whether the refcount is now greater than 1:
2165                 * This area was just allocated and zeroed, so it can only be
2166                 * exactly 1 after qcow2_inc_refcounts_imrt() */
2167                continue;
2168
2169resize_fail:
2170                *rebuild = true;
2171                fprintf(stderr, "ERROR could not resize image: %s\n",
2172                        strerror(-ret));
2173            }
2174            continue;
2175        }
2176
2177        if (offset != 0) {
2178            ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2179                                           offset, s->cluster_size);
2180            if (ret < 0) {
2181                return ret;
2182            }
2183            if (s->get_refcount(*refcount_table, cluster) != 1) {
2184                fprintf(stderr, "ERROR refcount block %" PRId64
2185                        " refcount=%" PRIu64 "\n", i,
2186                        s->get_refcount(*refcount_table, cluster));
2187                res->corruptions++;
2188                *rebuild = true;
2189            }
2190        }
2191    }
2192
2193    return 0;
2194}
2195
2196/*
2197 * Calculates an in-memory refcount table.
2198 */
2199static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2200                               BdrvCheckMode fix, bool *rebuild,
2201                               void **refcount_table, int64_t *nb_clusters)
2202{
2203    BDRVQcow2State *s = bs->opaque;
2204    int64_t i;
2205    QCowSnapshot *sn;
2206    int ret;
2207
2208    if (!*refcount_table) {
2209        int64_t old_size = 0;
2210        ret = realloc_refcount_array(s, refcount_table,
2211                                     &old_size, *nb_clusters);
2212        if (ret < 0) {
2213            res->check_errors++;
2214            return ret;
2215        }
2216    }
2217
2218    /* header */
2219    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2220                                   0, s->cluster_size);
2221    if (ret < 0) {
2222        return ret;
2223    }
2224
2225    /* current L1 table */
2226    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2227                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
2228                             fix, true);
2229    if (ret < 0) {
2230        return ret;
2231    }
2232
2233    /* snapshots */
2234    if (has_data_file(bs) && s->nb_snapshots) {
2235        fprintf(stderr, "ERROR %d snapshots in image with data file\n",
2236                s->nb_snapshots);
2237        res->corruptions++;
2238    }
2239
2240    for (i = 0; i < s->nb_snapshots; i++) {
2241        sn = s->snapshots + i;
2242        if (offset_into_cluster(s, sn->l1_table_offset)) {
2243            fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": "
2244                    "L1 table is not cluster aligned; snapshot table entry "
2245                    "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset);
2246            res->corruptions++;
2247            continue;
2248        }
2249        if (sn->l1_size > QCOW_MAX_L1_SIZE / L1E_SIZE) {
2250            fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": "
2251                    "L1 table is too large; snapshot table entry corrupted\n",
2252                    sn->id_str, sn->name, sn->l1_size);
2253            res->corruptions++;
2254            continue;
2255        }
2256        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2257                                 sn->l1_table_offset, sn->l1_size, 0, fix,
2258                                 false);
2259        if (ret < 0) {
2260            return ret;
2261        }
2262    }
2263    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2264                                   s->snapshots_offset, s->snapshots_size);
2265    if (ret < 0) {
2266        return ret;
2267    }
2268
2269    /* refcount data */
2270    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2271                                   s->refcount_table_offset,
2272                                   s->refcount_table_size *
2273                                   REFTABLE_ENTRY_SIZE);
2274    if (ret < 0) {
2275        return ret;
2276    }
2277
2278    /* encryption */
2279    if (s->crypto_header.length) {
2280        ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2281                                       s->crypto_header.offset,
2282                                       s->crypto_header.length);
2283        if (ret < 0) {
2284            return ret;
2285        }
2286    }
2287
2288    /* bitmaps */
2289    ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
2290    if (ret < 0) {
2291        return ret;
2292    }
2293
2294    return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
2295}
2296
2297/*
2298 * Compares the actual reference count for each cluster in the image against the
2299 * refcount as reported by the refcount structures on-disk.
2300 */
2301static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2302                              BdrvCheckMode fix, bool *rebuild,
2303                              int64_t *highest_cluster,
2304                              void *refcount_table, int64_t nb_clusters)
2305{
2306    BDRVQcow2State *s = bs->opaque;
2307    int64_t i;
2308    uint64_t refcount1, refcount2;
2309    int ret;
2310
2311    for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2312        ret = qcow2_get_refcount(bs, i, &refcount1);
2313        if (ret < 0) {
2314            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2315                    i, strerror(-ret));
2316            res->check_errors++;
2317            continue;
2318        }
2319
2320        refcount2 = s->get_refcount(refcount_table, i);
2321
2322        if (refcount1 > 0 || refcount2 > 0) {
2323            *highest_cluster = i;
2324        }
2325
2326        if (refcount1 != refcount2) {
2327            /* Check if we're allowed to fix the mismatch */
2328            int *num_fixed = NULL;
2329            if (refcount1 == 0) {
2330                *rebuild = true;
2331            } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2332                num_fixed = &res->leaks_fixed;
2333            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2334                num_fixed = &res->corruptions_fixed;
2335            }
2336
2337            fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2338                    " reference=%" PRIu64 "\n",
2339                   num_fixed != NULL     ? "Repairing" :
2340                   refcount1 < refcount2 ? "ERROR" :
2341                                           "Leaked",
2342                   i, refcount1, refcount2);
2343
2344            if (num_fixed) {
2345                ret = update_refcount(bs, i << s->cluster_bits, 1,
2346                                      refcount_diff(refcount1, refcount2),
2347                                      refcount1 > refcount2,
2348                                      QCOW2_DISCARD_ALWAYS);
2349                if (ret >= 0) {
2350                    (*num_fixed)++;
2351                    continue;
2352                }
2353            }
2354
2355            /* And if we couldn't, print an error */
2356            if (refcount1 < refcount2) {
2357                res->corruptions++;
2358            } else {
2359                res->leaks++;
2360            }
2361        }
2362    }
2363}
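
/*
 * Repair example (hypothetical numbers): if cluster 5 has an on-disk
 * refcount of 3 but only one actual reference was counted (refcount1 = 3,
 * refcount2 = 1), then with BDRV_FIX_LEAKS set the mismatch is repaired by
 *
 *     update_refcount(bs, 5 << s->cluster_bits, 1,
 *                     refcount_diff(3, 1), true, QCOW2_DISCARD_ALWAYS);
 *
 * i.e. the on-disk refcount is decreased by 2 and the leak is counted in
 * res->leaks_fixed.
 */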
2364
2365/*
2366 * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2367 * the on-disk refcount structures.
2368 *
2369 * On input, *first_free_cluster tells where to start looking, and need not
2370 * actually be a free cluster; the returned offset will not be before that
2371 * cluster.  On output, *first_free_cluster points to the first gap found, even
2372 * if that gap was too small to be used as the returned offset.
2373 *
2374 * Note that *first_free_cluster is a cluster index whereas the return value is
2375 * an offset.
2376 */
2377static int64_t alloc_clusters_imrt(BlockDriverState *bs,
2378                                   int cluster_count,
2379                                   void **refcount_table,
2380                                   int64_t *imrt_nb_clusters,
2381                                   int64_t *first_free_cluster)
2382{
2383    BDRVQcow2State *s = bs->opaque;
2384    int64_t cluster = *first_free_cluster, i;
2385    bool first_gap = true;
2386    int contiguous_free_clusters;
2387    int ret;
2388
2389    /* Starting at *first_free_cluster, find a range of at least cluster_count
2390     * continuously free clusters */
2391    for (contiguous_free_clusters = 0;
2392         cluster < *imrt_nb_clusters &&
2393         contiguous_free_clusters < cluster_count;
2394         cluster++)
2395    {
2396        if (!s->get_refcount(*refcount_table, cluster)) {
2397            contiguous_free_clusters++;
2398            if (first_gap) {
2399                /* If this is the first free cluster found, update
2400                 * *first_free_cluster accordingly */
2401                *first_free_cluster = cluster;
2402                first_gap = false;
2403            }
2404        } else if (contiguous_free_clusters) {
2405            contiguous_free_clusters = 0;
2406        }
2407    }
2408
2409    /* If contiguous_free_clusters is greater than zero, it contains the number
2410     * of continuously free clusters until the current cluster; the first free
2411     * cluster in the current "gap" is therefore
2412     * cluster - contiguous_free_clusters */
2413
2414    /* If no such range could be found, grow the in-memory refcount table
2415     * accordingly to append free clusters at the end of the image */
2416    if (contiguous_free_clusters < cluster_count) {
2417        /* contiguous_free_clusters clusters are already empty at the image end;
2418         * we need cluster_count clusters; therefore, we have to allocate
2419         * cluster_count - contiguous_free_clusters new clusters at the end of
2420         * the image (which is the current value of cluster; note that cluster
2421         * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
2422         * the image end) */
2423        ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
2424                                     cluster + cluster_count
2425                                     - contiguous_free_clusters);
2426        if (ret < 0) {
2427            return ret;
2428        }
2429    }
2430
2431    /* Go back to the first free cluster */
2432    cluster -= contiguous_free_clusters;
2433    for (i = 0; i < cluster_count; i++) {
2434        s->set_refcount(*refcount_table, cluster + i, 1);
2435    }
2436
2437    return cluster << s->cluster_bits;
2438}
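
/*
 * Usage sketch (illustrative; matches the refblock allocation performed in
 * rebuild_refcount_structure() below):
 *
 *     int64_t first_free_cluster = 0;
 *     int64_t offset = alloc_clusters_imrt(bs, 1, &refcount_table,
 *                                          &nb_clusters,
 *                                          &first_free_cluster);
 *     if (offset < 0) {
 *         // -errno, e.g. -ENOMEM if the IMRT could not be grown
 *     }
 *     // offset is a byte offset; offset >> s->cluster_bits is the index
 *     // of the first newly reserved cluster
 */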
2439
2440/*
2441 * Creates a new refcount structure based solely on the in-memory information
2442 * given through *refcount_table. All necessary allocations will be reflected
2443 * in that array.
2444 *
2445 * On success, the old refcount structure is leaked (it will be covered by the
2446 * new refcount structure).
2447 */
2448static int rebuild_refcount_structure(BlockDriverState *bs,
2449                                      BdrvCheckResult *res,
2450                                      void **refcount_table,
2451                                      int64_t *nb_clusters)
2452{
2453    BDRVQcow2State *s = bs->opaque;
2454    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
2455    int64_t refblock_offset, refblock_start, refblock_index;
2456    uint32_t reftable_size = 0;
2457    uint64_t *on_disk_reftable = NULL;
2458    void *on_disk_refblock;
2459    int ret = 0;
2460    struct {
2461        uint64_t reftable_offset;
2462        uint32_t reftable_clusters;
2463    } QEMU_PACKED reftable_offset_and_clusters;
2464
2465    qcow2_cache_empty(bs, s->refcount_block_cache);
2466
2467write_refblocks:
2468    for (; cluster < *nb_clusters; cluster++) {
2469        if (!s->get_refcount(*refcount_table, cluster)) {
2470            continue;
2471        }
2472
2473        refblock_index = cluster >> s->refcount_block_bits;
2474        refblock_start = refblock_index << s->refcount_block_bits;
2475
2476        /* Don't allocate a cluster in a refblock already written to disk */
2477        if (first_free_cluster < refblock_start) {
2478            first_free_cluster = refblock_start;
2479        }
2480        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2481                                              nb_clusters, &first_free_cluster);
2482        if (refblock_offset < 0) {
2483            fprintf(stderr, "ERROR allocating refblock: %s\n",
2484                    strerror(-refblock_offset));
2485            res->check_errors++;
2486            ret = refblock_offset;
2487            goto fail;
2488        }
2489
2490        if (reftable_size <= refblock_index) {
2491            uint32_t old_reftable_size = reftable_size;
2492            uint64_t *new_on_disk_reftable;
2493
2494            reftable_size = ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE,
2495                                     s->cluster_size) / REFTABLE_ENTRY_SIZE;
2496            new_on_disk_reftable = g_try_realloc(on_disk_reftable,
2497                                                 reftable_size *
2498                                                 REFTABLE_ENTRY_SIZE);
2499            if (!new_on_disk_reftable) {
2500                res->check_errors++;
2501                ret = -ENOMEM;
2502                goto fail;
2503            }
2504            on_disk_reftable = new_on_disk_reftable;
2505
2506            memset(on_disk_reftable + old_reftable_size, 0,
2507                   (reftable_size - old_reftable_size) * REFTABLE_ENTRY_SIZE);
2508
2509            /* The offset we have for the reftable is now no longer valid;
2510             * this will leak that range, but we can easily fix that by running
2511             * a leak-fixing check after this rebuild operation */
2512            reftable_offset = -1;
2513        } else {
2514            assert(on_disk_reftable);
2515        }
2516        on_disk_reftable[refblock_index] = refblock_offset;
2517
2518        /* If this is apparently the last refblock (for now), try to squeeze the
2519         * reftable in */
2520        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
2521            reftable_offset < 0)
2522        {
2523            uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
2524                                                          REFTABLE_ENTRY_SIZE);
2525            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2526                                                  refcount_table, nb_clusters,
2527                                                  &first_free_cluster);
2528            if (reftable_offset < 0) {
2529                fprintf(stderr, "ERROR allocating reftable: %s\n",
2530                        strerror(-reftable_offset));
2531                res->check_errors++;
2532                ret = reftable_offset;
2533                goto fail;
2534            }
2535        }
2536
2537        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2538                                            s->cluster_size, false);
2539        if (ret < 0) {
2540            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2541            goto fail;
2542        }
2543
2544        /* The size of *refcount_table is always cluster-aligned, therefore the
2545         * write operation will not overflow */
2546        on_disk_refblock = (void *)((char *) *refcount_table +
2547                                    refblock_index * s->cluster_size);
2548
2549        ret = bdrv_pwrite(bs->file, refblock_offset, on_disk_refblock,
2550                          s->cluster_size);
2551        if (ret < 0) {
2552            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2553            goto fail;
2554        }
2555
2556        /* Go to the end of this refblock */
2557        cluster = refblock_start + s->refcount_block_size - 1;
2558    }
2559
2560    if (reftable_offset < 0) {
2561        uint64_t post_refblock_start, reftable_clusters;
2562
2563        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
2564        reftable_clusters =
2565            size_to_clusters(s, reftable_size * REFTABLE_ENTRY_SIZE);
2566        /* Not pretty but simple */
2567        if (first_free_cluster < post_refblock_start) {
2568            first_free_cluster = post_refblock_start;
2569        }
2570        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2571                                              refcount_table, nb_clusters,
2572                                              &first_free_cluster);
2573        if (reftable_offset < 0) {
2574            fprintf(stderr, "ERROR allocating reftable: %s\n",
2575                    strerror(-reftable_offset));
2576            res->check_errors++;
2577            ret = reftable_offset;
2578            goto fail;
2579        }
2580
2581        goto write_refblocks;
2582    }
2583
2584    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2585        cpu_to_be64s(&on_disk_reftable[refblock_index]);
2586    }
2587
2588    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
2589                                        reftable_size * REFTABLE_ENTRY_SIZE,
2590                                        false);
2591    if (ret < 0) {
2592        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2593        goto fail;
2594    }
2595
2596    assert(reftable_size < INT_MAX / REFTABLE_ENTRY_SIZE);
2597    ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
2598                      reftable_size * REFTABLE_ENTRY_SIZE);
2599    if (ret < 0) {
2600        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2601        goto fail;
2602    }
2603
2604    /* Enter new reftable into the image header */
2605    reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
2606    reftable_offset_and_clusters.reftable_clusters =
2607        cpu_to_be32(size_to_clusters(s, reftable_size * REFTABLE_ENTRY_SIZE));
2608    ret = bdrv_pwrite_sync(bs->file,
2609                           offsetof(QCowHeader, refcount_table_offset),
2610                           &reftable_offset_and_clusters,
2611                           sizeof(reftable_offset_and_clusters));
2612    if (ret < 0) {
2613        fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
2614        goto fail;
2615    }
2616
2617    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2618        be64_to_cpus(&on_disk_reftable[refblock_index]);
2619    }
2620    s->refcount_table = on_disk_reftable;
2621    s->refcount_table_offset = reftable_offset;
2622    s->refcount_table_size = reftable_size;
2623    update_max_refcount_table_index(s);
2624
2625    return 0;
2626
2627fail:
2628    g_free(on_disk_reftable);
2629    return ret;
2630}
2631
2632/*
2633 * Checks an image for refcount consistency.
2634 *
2635 * Returns 0 when the check has completed (the errors and leaks found are
2636 * reported in *res), and -errno when an internal error occurred.
2637 */
2638int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2639                          BdrvCheckMode fix)
2640{
2641    BDRVQcow2State *s = bs->opaque;
2642    BdrvCheckResult pre_compare_res;
2643    int64_t size, highest_cluster, nb_clusters;
2644    void *refcount_table = NULL;
2645    bool rebuild = false;
2646    int ret;
2647
2648    size = bdrv_getlength(bs->file->bs);
2649    if (size < 0) {
2650        res->check_errors++;
2651        return size;
2652    }
2653
2654    nb_clusters = size_to_clusters(s, size);
2655    if (nb_clusters > INT_MAX) {
2656        res->check_errors++;
2657        return -EFBIG;
2658    }
2659
2660    res->bfi.total_clusters =
2661        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2662
2663    ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2664                              &nb_clusters);
2665    if (ret < 0) {
2666        goto fail;
2667    }
2668
2669    /* In case we don't need to rebuild the refcount structure (but want to fix
2670     * something), compare_refcounts() is called again further down, in which
2671     * case the result of this first call should be ignored */
2672    pre_compare_res = *res;
2673    compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2674                      nb_clusters);
2675
2676    if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2677        BdrvCheckResult old_res = *res;
2678        int fresh_leaks = 0;
2679
2680        fprintf(stderr, "Rebuilding refcount structure\n");
2681        ret = rebuild_refcount_structure(bs, res, &refcount_table,
2682                                         &nb_clusters);
2683        if (ret < 0) {
2684            goto fail;
2685        }
2686
2687        res->corruptions = 0;
2688        res->leaks = 0;
2689
2690        /* Because the old reftable has been exchanged for a new one the
2691         * references have to be recalculated */
2692        rebuild = false;
2693        memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2694        ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2695                                  &nb_clusters);
2696        if (ret < 0) {
2697            goto fail;
2698        }
2699
2700        if (fix & BDRV_FIX_LEAKS) {
2701            /* The old refcount structures are now leaked, fix it; the result
2702             * can be ignored, aside from leaks which were introduced by
2703             * rebuild_refcount_structure() that could not be fixed */
2704            BdrvCheckResult saved_res = *res;
2705            *res = (BdrvCheckResult){ 0 };
2706
2707            compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2708                              &highest_cluster, refcount_table, nb_clusters);
2709            if (rebuild) {
2710                fprintf(stderr, "ERROR rebuilt refcount structure is still "
2711                        "broken\n");
2712            }
2713
2714            /* Any leaks accounted for here were introduced by
2715             * rebuild_refcount_structure() because that function has created a
2716             * new refcount structure from scratch */
2717            fresh_leaks = res->leaks;
2718            *res = saved_res;
2719        }
2720
2721        if (res->corruptions < old_res.corruptions) {
2722            res->corruptions_fixed += old_res.corruptions - res->corruptions;
2723        }
2724        if (res->leaks < old_res.leaks) {
2725            res->leaks_fixed += old_res.leaks - res->leaks;
2726        }
2727        res->leaks += fresh_leaks;
2728    } else if (fix) {
2729        if (rebuild) {
2730            fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2731            res->check_errors++;
2732            ret = -EIO;
2733            goto fail;
2734        }
2735
2736        if (res->leaks || res->corruptions) {
2737            *res = pre_compare_res;
2738            compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2739                              refcount_table, nb_clusters);
2740        }
2741    }
2742
2743    /* check OFLAG_COPIED */
2744    ret = check_oflag_copied(bs, res, fix);
2745    if (ret < 0) {
2746        goto fail;
2747    }
2748
2749    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2750    ret = 0;
2751
2752fail:
2753    g_free(refcount_table);
2754
2755    return ret;
2756}
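
/*
 * Sketch of the typical caller (simplified; the qcow2 driver invokes this
 * from its image-check callback in qcow2.c):
 *
 *     BdrvCheckResult res = {0};
 *     ret = qcow2_check_refcounts(bs, &res, fix);
 *     // on ret == 0, res.corruptions, res.leaks and res.check_errors
 *     // describe the image; with fix flags set, res.corruptions_fixed and
 *     // res.leaks_fixed count successful repairs
 */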
2757
2758#define overlaps_with(ofs, sz) \
2759    ranges_overlap(offset, size, ofs, sz)
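
/*
 * Example (illustrative): while checking a write to offset = 0x20000 with
 * size = 0x1000, overlaps_with(0x20800, 0x800) is true because the byte
 * ranges [0x20000, 0x21000) and [0x20800, 0x21000) intersect.
 */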
2760
2761/*
2762 * Checks if the given offset into the image file is actually free to use by
2763 * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2764 * i.e. a sanity check without relying on the refcount tables.
2765 *
2766 * The ign parameter specifies what checks not to perform (being a bitmask of
2767 * QCow2MetadataOverlap values), i.e., what sections to ignore.
2768 *
2769 * Returns:
2770 * - 0 if writing to this offset will not affect the mentioned metadata
2771 * - a positive QCow2MetadataOverlap value indicating one overlapping section
2772 * - a negative value (-errno) indicating an error while performing a check,
2773 *   e.g. when bdrv_pread failed on QCOW2_OL_INACTIVE_L2
2774 */
2775int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2776                                 int64_t size)
2777{
2778    BDRVQcow2State *s = bs->opaque;
2779    int chk = s->overlap_check & ~ign;
2780    int i, j;
2781
2782    if (!size) {
2783        return 0;
2784    }
2785
2786    if (chk & QCOW2_OL_MAIN_HEADER) {
2787        if (offset < s->cluster_size) {
2788            return QCOW2_OL_MAIN_HEADER;
2789        }
2790    }
2791
2792    /* align range to test to cluster boundaries */
2793    size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
2794    offset = start_of_cluster(s, offset);
2795
2796    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2797        if (overlaps_with(s->l1_table_offset, s->l1_size * L1E_SIZE)) {
2798            return QCOW2_OL_ACTIVE_L1;
2799        }
2800    }
2801
2802    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2803        if (overlaps_with(s->refcount_table_offset,
2804            s->refcount_table_size * REFTABLE_ENTRY_SIZE)) {
2805            return QCOW2_OL_REFCOUNT_TABLE;
2806        }
2807    }
2808
2809    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2810        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2811            return QCOW2_OL_SNAPSHOT_TABLE;
2812        }
2813    }
2814
2815    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2816        for (i = 0; i < s->nb_snapshots; i++) {
2817            if (s->snapshots[i].l1_size &&
2818                overlaps_with(s->snapshots[i].l1_table_offset,
2819                s->snapshots[i].l1_size * L1E_SIZE)) {
2820                return QCOW2_OL_INACTIVE_L1;
2821            }
2822        }
2823    }
2824
2825    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2826        for (i = 0; i < s->l1_size; i++) {
2827            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2828                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2829                s->cluster_size)) {
2830                return QCOW2_OL_ACTIVE_L2;
2831            }
2832        }
2833    }
2834
2835    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2836        unsigned last_entry = s->max_refcount_table_index;
2837        assert(last_entry < s->refcount_table_size);
2838        assert(last_entry + 1 == s->refcount_table_size ||
2839               (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
2840        for (i = 0; i <= last_entry; i++) {
2841            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2842                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2843                s->cluster_size)) {
2844                return QCOW2_OL_REFCOUNT_BLOCK;
2845            }
2846        }
2847    }
2848
2849    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2850        for (i = 0; i < s->nb_snapshots; i++) {
2851            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2852            uint32_t l1_sz  = s->snapshots[i].l1_size;
2853            uint64_t l1_sz2 = l1_sz * L1E_SIZE;
2854            uint64_t *l1;
2855            int ret;
2856
2857            ret = qcow2_validate_table(bs, l1_ofs, l1_sz, L1E_SIZE,
2858                                       QCOW_MAX_L1_SIZE, "", NULL);
2859            if (ret < 0) {
2860                return ret;
2861            }
2862
2863            l1 = g_try_malloc(l1_sz2);
2864
2865            if (l1_sz2 && l1 == NULL) {
2866                return -ENOMEM;
2867            }
2868
2869            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
2870            if (ret < 0) {
2871                g_free(l1);
2872                return ret;
2873            }
2874
2875            for (j = 0; j < l1_sz; j++) {
2876                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2877                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2878                    g_free(l1);
2879                    return QCOW2_OL_INACTIVE_L2;
2880                }
2881            }
2882
2883            g_free(l1);
2884        }
2885    }
2886
2887    if ((chk & QCOW2_OL_BITMAP_DIRECTORY) &&
2888        (s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS))
2889    {
2890        if (overlaps_with(s->bitmap_directory_offset,
2891                          s->bitmap_directory_size))
2892        {
2893            return QCOW2_OL_BITMAP_DIRECTORY;
2894        }
2895    }
2896
2897    return 0;
2898}
2899
2900static const char *metadata_ol_names[] = {
2901    [QCOW2_OL_MAIN_HEADER_BITNR]        = "qcow2_header",
2902    [QCOW2_OL_ACTIVE_L1_BITNR]          = "active L1 table",
2903    [QCOW2_OL_ACTIVE_L2_BITNR]          = "active L2 table",
2904    [QCOW2_OL_REFCOUNT_TABLE_BITNR]     = "refcount table",
2905    [QCOW2_OL_REFCOUNT_BLOCK_BITNR]     = "refcount block",
2906    [QCOW2_OL_SNAPSHOT_TABLE_BITNR]     = "snapshot table",
2907    [QCOW2_OL_INACTIVE_L1_BITNR]        = "inactive L1 table",
2908    [QCOW2_OL_INACTIVE_L2_BITNR]        = "inactive L2 table",
2909    [QCOW2_OL_BITMAP_DIRECTORY_BITNR]   = "bitmap directory",
2910};
2911QEMU_BUILD_BUG_ON(QCOW2_OL_MAX_BITNR != ARRAY_SIZE(metadata_ol_names));
2912
2913/*
2914 * First performs a check for metadata overlaps (through
2915 * qcow2_check_metadata_overlap); if that fails with a negative value (error
2916 * while performing a check), that value is returned. If an impending overlap
2917 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2918 * and -EIO returned.
2919 *
2920 * Returns 0 if there were neither overlaps nor errors while checking for
2921 * overlaps; or a negative value (-errno) on error.
2922 */
2923int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2924                                  int64_t size, bool data_file)
2925{
2926    int ret;
2927
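        /*
         * With an external data file, guest data writes (data_file == true)
         * go to that file, which contains no qcow2 metadata, so they cannot
         * overlap with anything and the check can be skipped.
         */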
2928    if (data_file && has_data_file(bs)) {
2929        return 0;
2930    }
2931
2932    ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2933    if (ret < 0) {
2934        return ret;
2935    } else if (ret > 0) {
2936        int metadata_ol_bitnr = ctz32(ret);
2937        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2938
2939        qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2940                                "write on metadata (overlaps with %s)",
2941                                metadata_ol_names[metadata_ol_bitnr]);
2942        return -EIO;
2943    }
2944
2945    return 0;
2946}
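
    /*
     * Illustrative example: if qcow2_check_metadata_overlap() detects an
     * overlap with an active L2 table, it returns QCOW2_OL_ACTIVE_L2, i.e.
     * bit QCOW2_OL_ACTIVE_L2_BITNR (2) is set; ctz32(1 << 2) == 2 then
     * indexes metadata_ol_names[] to yield "active L2 table" in the
     * corruption message above.
     */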
2947
2948/* A pointer to a function of this type is given to walk_over_reftable(). That
2949 * function will create refblocks and pass them to a RefblockFinishOp once they
2950 * are completed (@refblock). @refblock_empty is set if the refblock is
2951 * completely empty.
2952 *
2953 * Along with the refblock, a corresponding reftable entry is passed, in the
2954 * reftable @reftable (which may be reallocated) at @reftable_index.
2955 *
2956 * @allocated should be set to true if a new cluster has been allocated.
2957 */
2958typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
2959                               uint64_t reftable_index, uint64_t *reftable_size,
2960                               void *refblock, bool refblock_empty,
2961                               bool *allocated, Error **errp);
2962
2963/**
2964 * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2965 * it is not empty) and inserts its offset into the new reftable. The size of
2966 * this new reftable is increased as required.
2967 */
2968static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2969                          uint64_t reftable_index, uint64_t *reftable_size,
2970                          void *refblock, bool refblock_empty, bool *allocated,
2971                          Error **errp)
2972{
2973    BDRVQcow2State *s = bs->opaque;
2974    int64_t offset;
2975
2976    if (!refblock_empty && reftable_index >= *reftable_size) {
2977        uint64_t *new_reftable;
2978        uint64_t new_reftable_size;
2979
2980        new_reftable_size = ROUND_UP(reftable_index + 1,
2981                                     s->cluster_size / REFTABLE_ENTRY_SIZE);
2982        if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / REFTABLE_ENTRY_SIZE) {
2983            error_setg(errp,
2984                       "This operation would make the refcount table grow "
2985                       "beyond the maximum size supported by QEMU, aborting");
2986            return -ENOTSUP;
2987        }
2988
2989        new_reftable = g_try_realloc(*reftable, new_reftable_size *
2990                                                REFTABLE_ENTRY_SIZE);
2991        if (!new_reftable) {
2992            error_setg(errp, "Failed to increase reftable buffer size");
2993            return -ENOMEM;
2994        }
2995
2996        memset(new_reftable + *reftable_size, 0,
2997               (new_reftable_size - *reftable_size) * REFTABLE_ENTRY_SIZE);
2998
2999        *reftable      = new_reftable;
3000        *reftable_size = new_reftable_size;
3001    }
3002
3003    if (!refblock_empty && !(*reftable)[reftable_index]) {
3004        offset = qcow2_alloc_clusters(bs, s->cluster_size);
3005        if (offset < 0) {
3006            error_setg_errno(errp, -offset, "Failed to allocate refblock");
3007            return offset;
3008        }
3009        (*reftable)[reftable_index] = offset;
3010        *allocated = true;
3011    }
3012
3013    return 0;
3014}
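
    /*
     * Sizing illustration (assuming the default 64 KiB clusters): one
     * reftable cluster holds cluster_size / REFTABLE_ENTRY_SIZE
     * = 65536 / 8 = 8192 entries, so the ROUND_UP() above grows the
     * reftable in steps of 8192 entries; e.g. reftable_index == 8192
     * yields a new_reftable_size of 16384 entries.
     */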
3015
3016/**
3017 * This "operation" for walk_over_reftable() writes the refblock to disk at the
3018 * offset specified by the new reftable's entry. It does not modify the new
3019 * reftable or change any refcounts.
3020 */
3021static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
3022                          uint64_t reftable_index, uint64_t *reftable_size,
3023                          void *refblock, bool refblock_empty, bool *allocated,
3024                          Error **errp)
3025{
3026    BDRVQcow2State *s = bs->opaque;
3027    int64_t offset;
3028    int ret;
3029
3030    if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
3031        offset = (*reftable)[reftable_index];
3032
3033        ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size,
3034                                            false);
3035        if (ret < 0) {
3036            error_setg_errno(errp, -ret, "Overlap check failed");
3037            return ret;
3038        }
3039
3040        ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
3041        if (ret < 0) {
3042            error_setg_errno(errp, -ret, "Failed to write refblock");
3043            return ret;
3044        }
3045    } else {
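            /* A refblock that stayed completely empty never received a
             * reftable entry; that is fine, because a missing refblock is
             * read back as all-zero refcounts. */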
3046        assert(refblock_empty);
3047    }
3048
3049    return 0;
3050}
3051
3052/**
3053 * This function walks over the existing reftable and every referenced refblock;
3054 * if @new_set_refcount is non-NULL, it is called for every refcount entry to
3055 * create a matching new entry in the passed @new_refblock. Once that
3056 * @new_refblock is completely filled, @operation will be called.
3057 *
3058 * @status_cb and @cb_opaque are used for the amend operation's status callback.
3059 * @index is the index of this walk_over_reftable() call and @total is the
3060 * total number of walk_over_reftable() calls per amend operation. Both are
3061 * used to calculate the parameters for the status callback.
3062 *
3063 * @allocated is set to true if a new cluster has been allocated.
3064 */
3065static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
3066                              uint64_t *new_reftable_index,
3067                              uint64_t *new_reftable_size,
3068                              void *new_refblock, int new_refblock_size,
3069                              int new_refcount_bits,
3070                              RefblockFinishOp *operation, bool *allocated,
3071                              Qcow2SetRefcountFunc *new_set_refcount,
3072                              BlockDriverAmendStatusCB *status_cb,
3073                              void *cb_opaque, int index, int total,
3074                              Error **errp)
3075{
3076    BDRVQcow2State *s = bs->opaque;
3077    uint64_t reftable_index;
3078    bool new_refblock_empty = true;
3079    int refblock_index;
3080    int new_refblock_index = 0;
3081    int ret;
3082
3083    for (reftable_index = 0; reftable_index < s->refcount_table_size;
3084         reftable_index++)
3085    {
3086        uint64_t refblock_offset = s->refcount_table[reftable_index]
3087                                 & REFT_OFFSET_MASK;
3088
3089        status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
3090                  (uint64_t)total * s->refcount_table_size, cb_opaque);
3091
3092        if (refblock_offset) {
3093            void *refblock;
3094
3095            if (offset_into_cluster(s, refblock_offset)) {
3096                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
3097                                        PRIx64 " unaligned (reftable index: %#"
3098                                        PRIx64 ")", refblock_offset,
3099                                        reftable_index);
3100                error_setg(errp,
3101                           "Image is corrupt (unaligned refblock offset)");
3102                return -EIO;
3103            }
3104
3105            ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
3106                                  &refblock);
3107            if (ret < 0) {
3108                error_setg_errno(errp, -ret, "Failed to retrieve refblock");
3109                return ret;
3110            }
3111
3112            for (refblock_index = 0; refblock_index < s->refcount_block_size;
3113                 refblock_index++)
3114            {
3115                uint64_t refcount;
3116
3117                if (new_refblock_index >= new_refblock_size) {
3118                    /* new_refblock is now complete */
3119                    ret = operation(bs, new_reftable, *new_reftable_index,
3120                                    new_reftable_size, new_refblock,
3121                                    new_refblock_empty, allocated, errp);
3122                    if (ret < 0) {
3123                        qcow2_cache_put(s->refcount_block_cache, &refblock);
3124                        return ret;
3125                    }
3126
3127                    (*new_reftable_index)++;
3128                    new_refblock_index = 0;
3129                    new_refblock_empty = true;
3130                }
3131
3132                refcount = s->get_refcount(refblock, refblock_index);
3133                if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
3134                    uint64_t offset;
3135
3136                    qcow2_cache_put(s->refcount_block_cache, &refblock);
3137
3138                    offset = ((reftable_index << s->refcount_block_bits)
3139                              + refblock_index) << s->cluster_bits;
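                        /*
                         * Illustration: with 64 KiB clusters and 16-bit
                         * refcounts (refcount_block_bits == 15),
                         * reftable_index == 1 and refblock_index == 10 give
                         * ((1 << 15) + 10) << 16 == 0x800a0000, the image
                         * file offset reported in the error below.
                         */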
3140
3141                    error_setg(errp, "Cannot decrease refcount entry width to "
3142                               "%i bits: Cluster at offset %#" PRIx64 " has a "
3143                               "refcount of %" PRIu64, new_refcount_bits,
3144                               offset, refcount);
3145                    return -EINVAL;
3146                }
3147
3148                if (new_set_refcount) {
3149                    new_set_refcount(new_refblock, new_refblock_index++,
3150                                     refcount);
3151                } else {
3152                    new_refblock_index++;
3153                }
3154                new_refblock_empty = new_refblock_empty && refcount == 0;
3155            }
3156
3157            qcow2_cache_put(s->refcount_block_cache, &refblock);
3158        } else {
3159            /* No refblock means every refcount is 0 */
3160            for (refblock_index = 0; refblock_index < s->refcount_block_size;
3161                 refblock_index++)
3162            {
3163                if (new_refblock_index >= new_refblock_size) {
3164                    /* new_refblock is now complete */
3165                    ret = operation(bs, new_reftable, *new_reftable_index,
3166                                    new_reftable_size, new_refblock,
3167                                    new_refblock_empty, allocated, errp);
3168                    if (ret < 0) {
3169                        return ret;
3170                    }
3171
3172                    (*new_reftable_index)++;
3173                    new_refblock_index = 0;
3174                    new_refblock_empty = true;
3175                }
3176
3177                if (new_set_refcount) {
3178                    new_set_refcount(new_refblock, new_refblock_index++, 0);
3179                } else {
3180                    new_refblock_index++;
3181                }
3182            }
3183        }
3184    }
3185
3186    if (new_refblock_index > 0) {
3187        /* Complete the final refblock, which may be only partially filled */
3188        if (new_set_refcount) {
3189            for (; new_refblock_index < new_refblock_size;
3190                 new_refblock_index++)
3191            {
3192                new_set_refcount(new_refblock, new_refblock_index, 0);
3193            }
3194        }
3195
3196        ret = operation(bs, new_reftable, *new_reftable_index,
3197                        new_reftable_size, new_refblock, new_refblock_empty,
3198                        allocated, errp);
3199        if (ret < 0) {
3200            return ret;
3201        }
3202
3203        (*new_reftable_index)++;
3204    }
3205
3206    status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
3207              (uint64_t)total * s->refcount_table_size, cb_opaque);
3208
3209    return 0;
3210}
3211
3212int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
3213                                BlockDriverAmendStatusCB *status_cb,
3214                                void *cb_opaque, Error **errp)
3215{
3216    BDRVQcow2State *s = bs->opaque;
3217    Qcow2GetRefcountFunc *new_get_refcount;
3218    Qcow2SetRefcountFunc *new_set_refcount;
3219    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
3220    uint64_t *new_reftable = NULL, new_reftable_size = 0;
3221    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
3222    uint64_t new_reftable_index = 0;
3223    uint64_t i;
3224    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
3225    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
3226    int old_refcount_order;
3227    int walk_index = 0;
3228    int ret;
3229    bool new_allocation;
3230
3231    assert(s->qcow_version >= 3);
3232    assert(refcount_order >= 0 && refcount_order <= 6);
3233
3234    /* see qcow2_open() */
3235    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
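    /*
     * For example (illustrative): with 64 KiB clusters (cluster_bits == 16)
     * and refcount_order == 4 (16-bit entries), this gives
     * 1 << (16 - 1) == 32768 entries per refblock, i.e. 65536 bytes at
     * 2 bytes per entry.
     */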
3236
3237    new_get_refcount = get_refcount_funcs[refcount_order];
3238    new_set_refcount = set_refcount_funcs[refcount_order];
3239
3240
3241    do {
3242        int total_walks;
3243
3244        new_allocation = false;
3245
3246        /* We have to do at least this allocation walk and the final walk
3247         * that writes the refblocks; furthermore, this loop normally runs
3248         * at least twice, first to do the allocations and then to verify
3249         * that everything is correctly allocated, which makes three walks
3250         * in total */
3251        total_walks = MAX(walk_index + 2, 3);
3252
3253        /* First, allocate the structures so they are present in the refcount
3254         * structures */
3255        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3256                                 &new_reftable_size, NULL, new_refblock_size,
3257                                 new_refcount_bits, &alloc_refblock,
3258                                 &new_allocation, NULL, status_cb, cb_opaque,
3259                                 walk_index++, total_walks, errp);
3260        if (ret < 0) {
3261            goto done;
3262        }
3263
3264        new_reftable_index = 0;
3265
3266        if (new_allocation) {
3267            if (new_reftable_offset) {
3268                qcow2_free_clusters(
3269                    bs, new_reftable_offset,
3270                    allocated_reftable_size * REFTABLE_ENTRY_SIZE,
3271                    QCOW2_DISCARD_NEVER);
3272            }
3273
3274            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
3275                                                           REFTABLE_ENTRY_SIZE);
3276            if (new_reftable_offset < 0) {
3277                error_setg_errno(errp, -new_reftable_offset,
3278                                 "Failed to allocate the new reftable");
3279                ret = new_reftable_offset;
3280                goto done;
3281            }
3282            allocated_reftable_size = new_reftable_size;
3283        }
3284    } while (new_allocation);
3285
3286    /* Second, write the new refblocks */
3287    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3288                             &new_reftable_size, new_refblock,
3289                             new_refblock_size, new_refcount_bits,
3290                             &flush_refblock, &new_allocation, new_set_refcount,
3291                             status_cb, cb_opaque, walk_index, walk_index + 1,
3292                             errp);
3293    if (ret < 0) {
3294        goto done;
3295    }
3296    assert(!new_allocation);
3297
3298
3299    /* Write the new reftable */
3300    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
3301                                        new_reftable_size * REFTABLE_ENTRY_SIZE,
3302                                        false);
3303    if (ret < 0) {
3304        error_setg_errno(errp, -ret, "Overlap check failed");
3305        goto done;
3306    }
3307
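    /*
     * qcow2 stores all integers big-endian on disk, so convert the in-memory
     * reftable (kept in host byte order) only for the write and convert it
     * back afterwards.
     */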
3308    for (i = 0; i < new_reftable_size; i++) {
3309        cpu_to_be64s(&new_reftable[i]);
3310    }
3311
3312    ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
3313                      new_reftable_size * REFTABLE_ENTRY_SIZE);
3314
3315    for (i = 0; i < new_reftable_size; i++) {
3316        be64_to_cpus(&new_reftable[i]);
3317    }
3318
3319    if (ret < 0) {
3320        error_setg_errno(errp, -ret, "Failed to write the new reftable");
3321        goto done;
3322    }
3323
3324
3325    /* Flush the refcount block cache */
3326    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
3327    if (ret < 0) {
3328        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
3329        goto done;
3330    }
3331
3332    /* Update the image header to point to the new reftable; this only updates
3333     * the fields which are relevant to qcow2_update_header(); other fields
3334     * such as s->refcount_table or s->refcount_bits stay stale for now
3335     * (because we have to restore everything if qcow2_update_header() fails) */
3336    old_refcount_order  = s->refcount_order;
3337    old_reftable_size   = s->refcount_table_size;
3338    old_reftable_offset = s->refcount_table_offset;
3339
3340    s->refcount_order        = refcount_order;
3341    s->refcount_table_size   = new_reftable_size;
3342    s->refcount_table_offset = new_reftable_offset;
3343
3344    ret = qcow2_update_header(bs);
3345    if (ret < 0) {
3346        s->refcount_order        = old_refcount_order;
3347        s->refcount_table_size   = old_reftable_size;
3348        s->refcount_table_offset = old_reftable_offset;
3349        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
3350        goto done;
3351    }
3352
3353    /* Now update the rest of the in-memory information */
3354    old_reftable = s->refcount_table;
3355    s->refcount_table = new_reftable;
3356    update_max_refcount_table_index(s);
3357
3358    s->refcount_bits = 1 << refcount_order;
3359    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
3360    s->refcount_max += s->refcount_max - 1;
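    /*
     * The two-step computation above yields 2^refcount_bits - 1 without
     * ever shifting by 64, which would be undefined behavior when
     * refcount_bits == 64; e.g. for 16-bit refcounts:
     * 0x8000 + (0x8000 - 1) == 0xffff.
     */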
3361
3362    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
3363    s->refcount_block_size = 1 << s->refcount_block_bits;
3364
3365    s->get_refcount = new_get_refcount;
3366    s->set_refcount = new_set_refcount;
3367
3368    /* For cleaning up all old refblocks and the old reftable below the "done"
3369     * label */
3370    new_reftable        = old_reftable;
3371    new_reftable_size   = old_reftable_size;
3372    new_reftable_offset = old_reftable_offset;
3373
3374done:
3375    if (new_reftable) {
3376        /* On success, new_reftable actually points to the old reftable (and
3377         * new_reftable_size is the old reftable's size); but that is just
3378         * fine */
3379        for (i = 0; i < new_reftable_size; i++) {
3380            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
3381            if (offset) {
3382                qcow2_free_clusters(bs, offset, s->cluster_size,
3383                                    QCOW2_DISCARD_OTHER);
3384            }
3385        }
3386        g_free(new_reftable);
3387
3388        if (new_reftable_offset > 0) {
3389            qcow2_free_clusters(bs, new_reftable_offset,
3390                                new_reftable_size * REFTABLE_ENTRY_SIZE,
3391                                QCOW2_DISCARD_OTHER);
3392        }
3393    }
3394
3395    qemu_vfree(new_refblock);
3396    return ret;
3397}
3398
3399static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
3400{
3401    BDRVQcow2State *s = bs->opaque;
3402    uint32_t index = offset_to_reftable_index(s, offset);
3403    int64_t covering_refblock_offset = 0;
3404
3405    if (index < s->refcount_table_size) {
3406        covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3407    }
3408    if (!covering_refblock_offset) {
3409        qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3410                                "not covered by the refcount structures",
3411                                offset);
3412        return -EIO;
3413    }
3414
3415    return covering_refblock_offset;
3416}
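
    /*
     * Coverage illustration (assuming offset_to_reftable_index() from
     * qcow2.h, i.e. offset >> (cluster_bits + refcount_block_bits)): with
     * 64 KiB clusters and 16-bit refcounts, one refblock covers
     * 2^(16 + 15) bytes == 2 GiB of the image file, so an offset of 5 GiB
     * falls into reftable index 2.
     */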
3417
3418static int qcow2_discard_refcount_block(BlockDriverState *bs,
3419                                        uint64_t discard_block_offs)
3420{
3421    BDRVQcow2State *s = bs->opaque;
3422    int64_t refblock_offs;
3423    uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
3424    uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
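    /* e.g. (64 KiB clusters, 16-bit refcounts): discard_block_offs ==
     * 0x800a0000 gives cluster_index == 0x800a and
     * block_index == 0x800a & 0x7fff == 10, i.e. entry 10 of the covering
     * refblock holds the refcount of the block being discarded. */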
3425    void *refblock;
3426    int ret;
3427
3428    refblock_offs = get_refblock_offset(bs, discard_block_offs);
3429    if (refblock_offs < 0) {
3430        return refblock_offs;
3431    }
3432
3433    assert(discard_block_offs != 0);
3434
3435    ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3436                          &refblock);
3437    if (ret < 0) {
3438        return ret;
3439    }
3440
3441    if (s->get_refcount(refblock, block_index) != 1) {
3442        qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
3443                                " refblock offset %#" PRIx64
3444                                ", reftable index %u"
3445                                ", block offset %#" PRIx64
3446                                ", refcount %#" PRIx64,
3447                                refblock_offs,
3448                                offset_to_reftable_index(s, discard_block_offs),
3449                                discard_block_offs,
3450                                s->get_refcount(refblock, block_index));
3451        qcow2_cache_put(s->refcount_block_cache, &refblock);
3452        return -EINVAL;
3453    }
3454    s->set_refcount(refblock, block_index, 0);
3455
3456    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
3457
3458    qcow2_cache_put(s->refcount_block_cache, &refblock);
3459
3460    if (cluster_index < s->free_cluster_index) {
3461        s->free_cluster_index = cluster_index;
3462    }
3463
3464    refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
3465                                           discard_block_offs);
3466    if (refblock) {
3467        /* discard the refblock from the cache if it is cached */
3468        qcow2_cache_discard(s->refcount_block_cache, refblock);
3469    }
3470    update_refcount_discard(bs, discard_block_offs, s->cluster_size);
3471
3472    return 0;
3473}
3474
3475int qcow2_shrink_reftable(BlockDriverState *bs)
3476{
3477    BDRVQcow2State *s = bs->opaque;
3478    uint64_t *reftable_tmp =
3479        g_malloc(s->refcount_table_size * REFTABLE_ENTRY_SIZE);
3480    int i, ret;
3481
3482    for (i = 0; i < s->refcount_table_size; i++) {
3483        int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
3484        void *refblock;
3485        bool unused_block;
3486
3487        if (refblock_offs == 0) {
3488            reftable_tmp[i] = 0;
3489            continue;
3490        }
3491        ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3492                              &refblock);
3493        if (ret < 0) {
3494            goto out;
3495        }
3496
3497        /* the refblock covers the cluster it occupies, so it holds its own refcount */
3498        if (i == offset_to_reftable_index(s, refblock_offs)) {
3499            uint64_t block_index = (refblock_offs >> s->cluster_bits) &
3500                                   (s->refcount_block_size - 1);
3501            uint64_t refcount = s->get_refcount(refblock, block_index);
3502
3503            s->set_refcount(refblock, block_index, 0);
3504
3505            unused_block = buffer_is_zero(refblock, s->cluster_size);
3506
3507            s->set_refcount(refblock, block_index, refcount);
3508        } else {
3509            unused_block = buffer_is_zero(refblock, s->cluster_size);
3510        }
3511        qcow2_cache_put(s->refcount_block_cache, &refblock);
3512
3513        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
3514    }
3515
3516    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
3517                           s->refcount_table_size * REFTABLE_ENTRY_SIZE);
3518    /*
3519     * If the write to the reftable failed, the image may contain a partially
3520     * overwritten reftable. In that case it is better to also clear the
3521     * in-memory reftable, to avoid possible further image corruption.
3522     */
3523    for (i = 0; i < s->refcount_table_size; i++) {
3524        if (s->refcount_table[i] && !reftable_tmp[i]) {
3525            if (ret == 0) {
3526                ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
3527                                                       REFT_OFFSET_MASK);
3528            }
3529            s->refcount_table[i] = 0;
3530        }
3531    }
3532
3533    if (!s->cache_discards) {
3534        qcow2_process_discards(bs, ret);
3535    }
3536
3537out:
3538    g_free(reftable_tmp);
3539    return ret;
3540}
3541
3542int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3543{
3544    BDRVQcow2State *s = bs->opaque;
3545    int64_t i;
3546
3547    for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3548        uint64_t refcount;
3549        int ret = qcow2_get_refcount(bs, i, &refcount);
3550        if (ret < 0) {
3551            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3552                    i, strerror(-ret));
3553            return ret;
3554        }
3555        if (refcount > 0) {
3556            return i;
3557        }
3558    }
3559    qcow2_signal_corruption(bs, true, -1, -1,
3560                            "There are no references in the refcount table.");
3561    return -EIO;
3562}
3563
3564int qcow2_detect_metadata_preallocation(BlockDriverState *bs)
3565{
3566    BDRVQcow2State *s = bs->opaque;
3567    int64_t i, end_cluster, cluster_count = 0, threshold;
3568    int64_t file_length, real_allocation, real_clusters;
3569
3570    qemu_co_mutex_assert_locked(&s->lock);
3571
3572    file_length = bdrv_getlength(bs->file->bs);
3573    if (file_length < 0) {
3574        return file_length;
3575    }
3576
3577    real_allocation = bdrv_get_allocated_file_size(bs->file->bs);
3578    if (real_allocation < 0) {
3579        return real_allocation;
3580    }
3581
3582    real_clusters = real_allocation / s->cluster_size;
3583    threshold = MAX(real_clusters * 10 / 9, real_clusters + 2);
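    /*
     * Illustration: with 900 clusters allocated in the underlying file the
     * threshold is MAX(1000, 902) == 1000; counting at least that many
     * referenced clusters means the qcow2 file references noticeably more
     * clusters than are physically allocated, indicating preallocated
     * metadata.
     */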
3584
3585    end_cluster = size_to_clusters(s, file_length);
3586    for (i = 0; i < end_cluster && cluster_count < threshold; i++) {
3587        uint64_t refcount;
3588        int ret = qcow2_get_refcount(bs, i, &refcount);
3589        if (ret < 0) {
3590            return ret;
3591        }
3592        cluster_count += !!refcount;
3593    }
3594
3595    return cluster_count >= threshold;
3596}
3597