qemu/block/qcow2-refcount.c
<<
>>
Prefs
   1/*
   2 * Block driver for the QCOW version 2 format
   3 *
   4 * Copyright (c) 2004-2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qapi/error.h"
  27#include "qcow2.h"
  28#include "qemu/range.h"
  29#include "qemu/bswap.h"
  30#include "qemu/cutils.h"
  31#include "trace.h"
  32
  33static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
  34                                    uint64_t max);
  35static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
  36                            int64_t offset, int64_t length, uint64_t addend,
  37                            bool decrease, enum qcow2_discard_type type);
  38
  39static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
  40static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
  41static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
  42static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
  43static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
  44static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
  45static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
  46
  47static void set_refcount_ro0(void *refcount_array, uint64_t index,
  48                             uint64_t value);
  49static void set_refcount_ro1(void *refcount_array, uint64_t index,
  50                             uint64_t value);
  51static void set_refcount_ro2(void *refcount_array, uint64_t index,
  52                             uint64_t value);
  53static void set_refcount_ro3(void *refcount_array, uint64_t index,
  54                             uint64_t value);
  55static void set_refcount_ro4(void *refcount_array, uint64_t index,
  56                             uint64_t value);
  57static void set_refcount_ro5(void *refcount_array, uint64_t index,
  58                             uint64_t value);
  59static void set_refcount_ro6(void *refcount_array, uint64_t index,
  60                             uint64_t value);
  61
  62
  63static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
  64    &get_refcount_ro0,
  65    &get_refcount_ro1,
  66    &get_refcount_ro2,
  67    &get_refcount_ro3,
  68    &get_refcount_ro4,
  69    &get_refcount_ro5,
  70    &get_refcount_ro6
  71};
  72
  73static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
  74    &set_refcount_ro0,
  75    &set_refcount_ro1,
  76    &set_refcount_ro2,
  77    &set_refcount_ro3,
  78    &set_refcount_ro4,
  79    &set_refcount_ro5,
  80    &set_refcount_ro6
  81};
  82
  83
  84/*********************************************************/
  85/* refcount handling */
  86
  87static void update_max_refcount_table_index(BDRVQcow2State *s)
  88{
  89    unsigned i = s->refcount_table_size - 1;
  90    while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
  91        i--;
  92    }
  93    /* Set s->max_refcount_table_index to the index of the last used entry */
  94    s->max_refcount_table_index = i;
  95}
  96
  97int qcow2_refcount_init(BlockDriverState *bs)
  98{
  99    BDRVQcow2State *s = bs->opaque;
 100    unsigned int refcount_table_size2, i;
 101    int ret;
 102
 103    assert(s->refcount_order >= 0 && s->refcount_order <= 6);
 104
 105    s->get_refcount = get_refcount_funcs[s->refcount_order];
 106    s->set_refcount = set_refcount_funcs[s->refcount_order];
 107
 108    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
 109    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
 110    s->refcount_table = g_try_malloc(refcount_table_size2);
 111
 112    if (s->refcount_table_size > 0) {
 113        if (s->refcount_table == NULL) {
 114            ret = -ENOMEM;
 115            goto fail;
 116        }
 117        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
 118        ret = bdrv_pread(bs->file, s->refcount_table_offset,
 119                         s->refcount_table, refcount_table_size2);
 120        if (ret < 0) {
 121            goto fail;
 122        }
 123        for(i = 0; i < s->refcount_table_size; i++)
 124            be64_to_cpus(&s->refcount_table[i]);
 125        update_max_refcount_table_index(s);
 126    }
 127    return 0;
 128 fail:
 129    return ret;
 130}
 131
 132void qcow2_refcount_close(BlockDriverState *bs)
 133{
 134    BDRVQcow2State *s = bs->opaque;
 135    g_free(s->refcount_table);
 136}
 137
 138
 139static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
 140{
 141    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
 142}
 143
 144static void set_refcount_ro0(void *refcount_array, uint64_t index,
 145                             uint64_t value)
 146{
 147    assert(!(value >> 1));
 148    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
 149    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
 150}
 151
 152static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
 153{
 154    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
 155           & 0x3;
 156}
 157
 158static void set_refcount_ro1(void *refcount_array, uint64_t index,
 159                             uint64_t value)
 160{
 161    assert(!(value >> 2));
 162    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
 163    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
 164}
 165
 166static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
 167{
 168    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
 169           & 0xf;
 170}
 171
 172static void set_refcount_ro2(void *refcount_array, uint64_t index,
 173                             uint64_t value)
 174{
 175    assert(!(value >> 4));
 176    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
 177    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
 178}
 179
 180static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
 181{
 182    return ((const uint8_t *)refcount_array)[index];
 183}
 184
 185static void set_refcount_ro3(void *refcount_array, uint64_t index,
 186                             uint64_t value)
 187{
 188    assert(!(value >> 8));
 189    ((uint8_t *)refcount_array)[index] = value;
 190}
 191
 192static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
 193{
 194    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
 195}
 196
 197static void set_refcount_ro4(void *refcount_array, uint64_t index,
 198                             uint64_t value)
 199{
 200    assert(!(value >> 16));
 201    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
 202}
 203
 204static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
 205{
 206    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
 207}
 208
 209static void set_refcount_ro5(void *refcount_array, uint64_t index,
 210                             uint64_t value)
 211{
 212    assert(!(value >> 32));
 213    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
 214}
 215
 216static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
 217{
 218    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
 219}
 220
 221static void set_refcount_ro6(void *refcount_array, uint64_t index,
 222                             uint64_t value)
 223{
 224    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
 225}
 226
 227
 228static int load_refcount_block(BlockDriverState *bs,
 229                               int64_t refcount_block_offset,
 230                               void **refcount_block)
 231{
 232    BDRVQcow2State *s = bs->opaque;
 233
 234    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
 235    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
 236                           refcount_block);
 237}
 238
 239/*
 240 * Retrieves the refcount of the cluster given by its index and stores it in
 241 * *refcount. Returns 0 on success and -errno on failure.
 242 */
 243int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
 244                       uint64_t *refcount)
 245{
 246    BDRVQcow2State *s = bs->opaque;
 247    uint64_t refcount_table_index, block_index;
 248    int64_t refcount_block_offset;
 249    int ret;
 250    void *refcount_block;
 251
 252    refcount_table_index = cluster_index >> s->refcount_block_bits;
 253    if (refcount_table_index >= s->refcount_table_size) {
 254        *refcount = 0;
 255        return 0;
 256    }
 257    refcount_block_offset =
 258        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
 259    if (!refcount_block_offset) {
 260        *refcount = 0;
 261        return 0;
 262    }
 263
 264    if (offset_into_cluster(s, refcount_block_offset)) {
 265        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
 266                                " unaligned (reftable index: %#" PRIx64 ")",
 267                                refcount_block_offset, refcount_table_index);
 268        return -EIO;
 269    }
 270
 271    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
 272                          &refcount_block);
 273    if (ret < 0) {
 274        return ret;
 275    }
 276
 277    block_index = cluster_index & (s->refcount_block_size - 1);
 278    *refcount = s->get_refcount(refcount_block, block_index);
 279
 280    qcow2_cache_put(s->refcount_block_cache, &refcount_block);
 281
 282    return 0;
 283}
 284
 285/* Checks if two offsets are described by the same refcount block */
 286static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
 287    uint64_t offset_b)
 288{
 289    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
 290    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
 291
 292    return (block_a == block_b);
 293}
 294
 295/*
 296 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 297 * (including growing the refcount table if needed).
 298 *
 299 * Returns 0 on success or -errno in error case
 300 */
 301static int alloc_refcount_block(BlockDriverState *bs,
 302                                int64_t cluster_index, void **refcount_block)
 303{
 304    BDRVQcow2State *s = bs->opaque;
 305    unsigned int refcount_table_index;
 306    int64_t ret;
 307
 308    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
 309
 310    /* Find the refcount block for the given cluster */
 311    refcount_table_index = cluster_index >> s->refcount_block_bits;
 312
 313    if (refcount_table_index < s->refcount_table_size) {
 314
 315        uint64_t refcount_block_offset =
 316            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
 317
 318        /* If it's already there, we're done */
 319        if (refcount_block_offset) {
 320            if (offset_into_cluster(s, refcount_block_offset)) {
 321                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
 322                                        PRIx64 " unaligned (reftable index: "
 323                                        "%#x)", refcount_block_offset,
 324                                        refcount_table_index);
 325                return -EIO;
 326            }
 327
 328             return load_refcount_block(bs, refcount_block_offset,
 329                                        refcount_block);
 330        }
 331    }
 332
 333    /*
 334     * If we came here, we need to allocate something. Something is at least
 335     * a cluster for the new refcount block. It may also include a new refcount
 336     * table if the old refcount table is too small.
 337     *
 338     * Note that allocating clusters here needs some special care:
 339     *
 340     * - We can't use the normal qcow2_alloc_clusters(), it would try to
 341     *   increase the refcount and very likely we would end up with an endless
 342     *   recursion. Instead we must place the refcount blocks in a way that
 343     *   they can describe them themselves.
 344     *
 345     * - We need to consider that at this point we are inside update_refcounts
 346     *   and potentially doing an initial refcount increase. This means that
 347     *   some clusters have already been allocated by the caller, but their
 348     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
 349     *   need to return -EAGAIN to signal the caller that it needs to restart
 350     *   the search for free clusters.
 351     *
 352     * - alloc_clusters_noref and qcow2_free_clusters may load a different
 353     *   refcount block into the cache
 354     */
 355
 356    *refcount_block = NULL;
 357
 358    /* We write to the refcount table, so we might depend on L2 tables */
 359    ret = qcow2_cache_flush(bs, s->l2_table_cache);
 360    if (ret < 0) {
 361        return ret;
 362    }
 363
 364    /* Allocate the refcount block itself and mark it as used */
 365    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX);
 366    if (new_block < 0) {
 367        return new_block;
 368    }
 369
 370    /* The offset must fit in the offset field of the refcount table entry */
 371    assert((new_block & REFT_OFFSET_MASK) == new_block);
 372
 373    /* If we're allocating the block at offset 0 then something is wrong */
 374    if (new_block == 0) {
 375        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
 376                                "allocation of refcount block at offset 0");
 377        return -EIO;
 378    }
 379
 380#ifdef DEBUG_ALLOC2
 381    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
 382        " at %" PRIx64 "\n",
 383        refcount_table_index, cluster_index << s->cluster_bits, new_block);
 384#endif
 385
 386    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
 387        /* Zero the new refcount block before updating it */
 388        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
 389                                    refcount_block);
 390        if (ret < 0) {
 391            goto fail;
 392        }
 393
 394        memset(*refcount_block, 0, s->cluster_size);
 395
 396        /* The block describes itself, need to update the cache */
 397        int block_index = (new_block >> s->cluster_bits) &
 398            (s->refcount_block_size - 1);
 399        s->set_refcount(*refcount_block, block_index, 1);
 400    } else {
 401        /* Described somewhere else. This can recurse at most twice before we
 402         * arrive at a block that describes itself. */
 403        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
 404                              QCOW2_DISCARD_NEVER);
 405        if (ret < 0) {
 406            goto fail;
 407        }
 408
 409        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 410        if (ret < 0) {
 411            goto fail;
 412        }
 413
 414        /* Initialize the new refcount block only after updating its refcount,
 415         * update_refcount uses the refcount cache itself */
 416        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
 417                                    refcount_block);
 418        if (ret < 0) {
 419            goto fail;
 420        }
 421
 422        memset(*refcount_block, 0, s->cluster_size);
 423    }
 424
 425    /* Now the new refcount block needs to be written to disk */
 426    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
 427    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
 428    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 429    if (ret < 0) {
 430        goto fail;
 431    }
 432
 433    /* If the refcount table is big enough, just hook the block up there */
 434    if (refcount_table_index < s->refcount_table_size) {
 435        uint64_t data64 = cpu_to_be64(new_block);
 436        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
 437        ret = bdrv_pwrite_sync(bs->file,
 438            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
 439            &data64, sizeof(data64));
 440        if (ret < 0) {
 441            goto fail;
 442        }
 443
 444        s->refcount_table[refcount_table_index] = new_block;
 445        /* If there's a hole in s->refcount_table then it can happen
 446         * that refcount_table_index < s->max_refcount_table_index */
 447        s->max_refcount_table_index =
 448            MAX(s->max_refcount_table_index, refcount_table_index);
 449
 450        /* The new refcount block may be where the caller intended to put its
 451         * data, so let it restart the search. */
 452        return -EAGAIN;
 453    }
 454
 455    qcow2_cache_put(s->refcount_block_cache, refcount_block);
 456
 457    /*
 458     * If we come here, we need to grow the refcount table. Again, a new
 459     * refcount table needs some space and we can't simply allocate to avoid
 460     * endless recursion.
 461     *
 462     * Therefore let's grab new refcount blocks at the end of the image, which
 463     * will describe themselves and the new refcount table. This way we can
 464     * reference them only in the new table and do the switch to the new
 465     * refcount table at once without producing an inconsistent state in
 466     * between.
 467     */
 468    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
 469
 470    /* Calculate the number of refcount blocks needed so far; this will be the
 471     * basis for calculating the index of the first cluster used for the
 472     * self-describing refcount structures which we are about to create.
 473     *
 474     * Because we reached this point, there cannot be any refcount entries for
 475     * cluster_index or higher indices yet. However, because new_block has been
 476     * allocated to describe that cluster (and it will assume this role later
 477     * on), we cannot use that index; also, new_block may actually have a higher
 478     * cluster index than cluster_index, so it needs to be taken into account
 479     * here (and 1 needs to be added to its value because that cluster is used).
 480     */
 481    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
 482                                            (new_block >> s->cluster_bits) + 1),
 483                                        s->refcount_block_size);
 484
 485    /* Create the new refcount table and blocks */
 486    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
 487        s->cluster_size;
 488
 489    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
 490                              refcount_table_index, new_block);
 491    if (ret < 0) {
 492        return ret;
 493    }
 494
 495    ret = load_refcount_block(bs, new_block, refcount_block);
 496    if (ret < 0) {
 497        return ret;
 498    }
 499
 500    /* If we were trying to do the initial refcount update for some cluster
 501     * allocation, we might have used the same clusters to store newly
 502     * allocated metadata. Make the caller search some new space. */
 503    return -EAGAIN;
 504
 505fail:
 506    if (*refcount_block != NULL) {
 507        qcow2_cache_put(s->refcount_block_cache, refcount_block);
 508    }
 509    return ret;
 510}
 511
 512/*
 513 * Starting at @start_offset, this function creates new self-covering refcount
 514 * structures: A new refcount table and refcount blocks which cover all of
 515 * themselves, and a number of @additional_clusters beyond their end.
 516 * @start_offset must be at the end of the image file, that is, there must be
 517 * only empty space beyond it.
 518 * If @exact_size is false, the refcount table will have 50 % more entries than
 519 * necessary so it will not need to grow again soon.
 520 * If @new_refblock_offset is not zero, it contains the offset of a refcount
 521 * block that should be entered into the new refcount table at index
 522 * @new_refblock_index.
 523 *
 524 * Returns: The offset after the new refcount structures (i.e. where the
 525 *          @additional_clusters may be placed) on success, -errno on error.
 526 */
 527int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
 528                            uint64_t additional_clusters, bool exact_size,
 529                            int new_refblock_index,
 530                            uint64_t new_refblock_offset)
 531{
 532    BDRVQcow2State *s = bs->opaque;
 533    uint64_t total_refblock_count_u64, additional_refblock_count;
 534    int total_refblock_count, table_size, area_reftable_index, table_clusters;
 535    int i;
 536    uint64_t table_offset, block_offset, end_offset;
 537    int ret;
 538    uint64_t *new_table;
 539
 540    assert(!(start_offset % s->cluster_size));
 541
 542    qcow2_refcount_metadata_size(start_offset / s->cluster_size +
 543                                 additional_clusters,
 544                                 s->cluster_size, s->refcount_order,
 545                                 !exact_size, &total_refblock_count_u64);
 546    if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
 547        return -EFBIG;
 548    }
 549    total_refblock_count = total_refblock_count_u64;
 550
 551    /* Index in the refcount table of the first refcount block to cover the area
 552     * of refcount structures we are about to create; we know that
 553     * @total_refblock_count can cover @start_offset, so this will definitely
 554     * fit into an int. */
 555    area_reftable_index = (start_offset / s->cluster_size) /
 556                          s->refcount_block_size;
 557
 558    if (exact_size) {
 559        table_size = total_refblock_count;
 560    } else {
 561        table_size = total_refblock_count +
 562                     DIV_ROUND_UP(total_refblock_count, 2);
 563    }
 564    /* The qcow2 file can only store the reftable size in number of clusters */
 565    table_size = ROUND_UP(table_size, s->cluster_size / sizeof(uint64_t));
 566    table_clusters = (table_size * sizeof(uint64_t)) / s->cluster_size;
 567
 568    if (table_size > QCOW_MAX_REFTABLE_SIZE) {
 569        return -EFBIG;
 570    }
 571
 572    new_table = g_try_new0(uint64_t, table_size);
 573
 574    assert(table_size > 0);
 575    if (new_table == NULL) {
 576        ret = -ENOMEM;
 577        goto fail;
 578    }
 579
 580    /* Fill the new refcount table */
 581    if (table_size > s->max_refcount_table_index) {
 582        /* We're actually growing the reftable */
 583        memcpy(new_table, s->refcount_table,
 584               (s->max_refcount_table_index + 1) * sizeof(uint64_t));
 585    } else {
 586        /* Improbable case: We're shrinking the reftable. However, the caller
 587         * has assured us that there is only empty space beyond @start_offset,
 588         * so we can simply drop all of the refblocks that won't fit into the
 589         * new reftable. */
 590        memcpy(new_table, s->refcount_table, table_size * sizeof(uint64_t));
 591    }
 592
 593    if (new_refblock_offset) {
 594        assert(new_refblock_index < total_refblock_count);
 595        new_table[new_refblock_index] = new_refblock_offset;
 596    }
 597
 598    /* Count how many new refblocks we have to create */
 599    additional_refblock_count = 0;
 600    for (i = area_reftable_index; i < total_refblock_count; i++) {
 601        if (!new_table[i]) {
 602            additional_refblock_count++;
 603        }
 604    }
 605
 606    table_offset = start_offset + additional_refblock_count * s->cluster_size;
 607    end_offset = table_offset + table_clusters * s->cluster_size;
 608
 609    /* Fill the refcount blocks, and create new ones, if necessary */
 610    block_offset = start_offset;
 611    for (i = area_reftable_index; i < total_refblock_count; i++) {
 612        void *refblock_data;
 613        uint64_t first_offset_covered;
 614
 615        /* Reuse an existing refblock if possible, create a new one otherwise */
 616        if (new_table[i]) {
 617            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
 618                                  &refblock_data);
 619            if (ret < 0) {
 620                goto fail;
 621            }
 622        } else {
 623            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
 624                                        block_offset, &refblock_data);
 625            if (ret < 0) {
 626                goto fail;
 627            }
 628            memset(refblock_data, 0, s->cluster_size);
 629            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
 630                                         refblock_data);
 631
 632            new_table[i] = block_offset;
 633            block_offset += s->cluster_size;
 634        }
 635
 636        /* First host offset covered by this refblock */
 637        first_offset_covered = (uint64_t)i * s->refcount_block_size *
 638                               s->cluster_size;
 639        if (first_offset_covered < end_offset) {
 640            int j, end_index;
 641
 642            /* Set the refcount of all of the new refcount structures to 1 */
 643
 644            if (first_offset_covered < start_offset) {
 645                assert(i == area_reftable_index);
 646                j = (start_offset - first_offset_covered) / s->cluster_size;
 647                assert(j < s->refcount_block_size);
 648            } else {
 649                j = 0;
 650            }
 651
 652            end_index = MIN((end_offset - first_offset_covered) /
 653                            s->cluster_size,
 654                            s->refcount_block_size);
 655
 656            for (; j < end_index; j++) {
 657                /* The caller guaranteed us this space would be empty */
 658                assert(s->get_refcount(refblock_data, j) == 0);
 659                s->set_refcount(refblock_data, j, 1);
 660            }
 661
 662            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
 663                                         refblock_data);
 664        }
 665
 666        qcow2_cache_put(s->refcount_block_cache, &refblock_data);
 667    }
 668
 669    assert(block_offset == table_offset);
 670
 671    /* Write refcount blocks to disk */
 672    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
 673    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 674    if (ret < 0) {
 675        goto fail;
 676    }
 677
 678    /* Write refcount table to disk */
 679    for (i = 0; i < total_refblock_count; i++) {
 680        cpu_to_be64s(&new_table[i]);
 681    }
 682
 683    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
 684    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
 685        table_size * sizeof(uint64_t));
 686    if (ret < 0) {
 687        goto fail;
 688    }
 689
 690    for (i = 0; i < total_refblock_count; i++) {
 691        be64_to_cpus(&new_table[i]);
 692    }
 693
 694    /* Hook up the new refcount table in the qcow2 header */
 695    struct QEMU_PACKED {
 696        uint64_t d64;
 697        uint32_t d32;
 698    } data;
 699    data.d64 = cpu_to_be64(table_offset);
 700    data.d32 = cpu_to_be32(table_clusters);
 701    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
 702    ret = bdrv_pwrite_sync(bs->file,
 703                           offsetof(QCowHeader, refcount_table_offset),
 704                           &data, sizeof(data));
 705    if (ret < 0) {
 706        goto fail;
 707    }
 708
 709    /* And switch it in memory */
 710    uint64_t old_table_offset = s->refcount_table_offset;
 711    uint64_t old_table_size = s->refcount_table_size;
 712
 713    g_free(s->refcount_table);
 714    s->refcount_table = new_table;
 715    s->refcount_table_size = table_size;
 716    s->refcount_table_offset = table_offset;
 717    update_max_refcount_table_index(s);
 718
 719    /* Free old table. */
 720    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
 721                        QCOW2_DISCARD_OTHER);
 722
 723    return end_offset;
 724
 725fail:
 726    g_free(new_table);
 727    return ret;
 728}
 729
 730void qcow2_process_discards(BlockDriverState *bs, int ret)
 731{
 732    BDRVQcow2State *s = bs->opaque;
 733    Qcow2DiscardRegion *d, *next;
 734
 735    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
 736        QTAILQ_REMOVE(&s->discards, d, next);
 737
 738        /* Discard is optional, ignore the return value */
 739        if (ret >= 0) {
 740            int r2 = bdrv_pdiscard(bs->file, d->offset, d->bytes);
 741            if (r2 < 0) {
 742                trace_qcow2_process_discards_failed_region(d->offset, d->bytes,
 743                                                           r2);
 744            }
 745        }
 746
 747        g_free(d);
 748    }
 749}
 750
 751static void update_refcount_discard(BlockDriverState *bs,
 752                                    uint64_t offset, uint64_t length)
 753{
 754    BDRVQcow2State *s = bs->opaque;
 755    Qcow2DiscardRegion *d, *p, *next;
 756
 757    QTAILQ_FOREACH(d, &s->discards, next) {
 758        uint64_t new_start = MIN(offset, d->offset);
 759        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
 760
 761        if (new_end - new_start <= length + d->bytes) {
 762            /* There can't be any overlap, areas ending up here have no
 763             * references any more and therefore shouldn't get freed another
 764             * time. */
 765            assert(d->bytes + length == new_end - new_start);
 766            d->offset = new_start;
 767            d->bytes = new_end - new_start;
 768            goto found;
 769        }
 770    }
 771
 772    d = g_malloc(sizeof(*d));
 773    *d = (Qcow2DiscardRegion) {
 774        .bs     = bs,
 775        .offset = offset,
 776        .bytes  = length,
 777    };
 778    QTAILQ_INSERT_TAIL(&s->discards, d, next);
 779
 780found:
 781    /* Merge discard requests if they are adjacent now */
 782    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
 783        if (p == d
 784            || p->offset > d->offset + d->bytes
 785            || d->offset > p->offset + p->bytes)
 786        {
 787            continue;
 788        }
 789
 790        /* Still no overlap possible */
 791        assert(p->offset == d->offset + d->bytes
 792            || d->offset == p->offset + p->bytes);
 793
 794        QTAILQ_REMOVE(&s->discards, p, next);
 795        d->offset = MIN(d->offset, p->offset);
 796        d->bytes += p->bytes;
 797        g_free(p);
 798    }
 799}
 800
 801/* XXX: cache several refcount block clusters ? */
 802/* @addend is the absolute value of the addend; if @decrease is set, @addend
 803 * will be subtracted from the current refcount, otherwise it will be added */
 804static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
 805                                                   int64_t offset,
 806                                                   int64_t length,
 807                                                   uint64_t addend,
 808                                                   bool decrease,
 809                                                   enum qcow2_discard_type type)
 810{
 811    BDRVQcow2State *s = bs->opaque;
 812    int64_t start, last, cluster_offset;
 813    void *refcount_block = NULL;
 814    int64_t old_table_index = -1;
 815    int ret;
 816
 817#ifdef DEBUG_ALLOC2
 818    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
 819            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
 820            addend);
 821#endif
 822    if (length < 0) {
 823        return -EINVAL;
 824    } else if (length == 0) {
 825        return 0;
 826    }
 827
 828    if (decrease) {
 829        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
 830            s->l2_table_cache);
 831    }
 832
 833    start = start_of_cluster(s, offset);
 834    last = start_of_cluster(s, offset + length - 1);
 835    for(cluster_offset = start; cluster_offset <= last;
 836        cluster_offset += s->cluster_size)
 837    {
 838        int block_index;
 839        uint64_t refcount;
 840        int64_t cluster_index = cluster_offset >> s->cluster_bits;
 841        int64_t table_index = cluster_index >> s->refcount_block_bits;
 842
 843        /* Load the refcount block and allocate it if needed */
 844        if (table_index != old_table_index) {
 845            if (refcount_block) {
 846                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
 847            }
 848            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
 849            /* If the caller needs to restart the search for free clusters,
 850             * try the same ones first to see if they're still free. */
 851            if (ret == -EAGAIN) {
 852                if (s->free_cluster_index > (start >> s->cluster_bits)) {
 853                    s->free_cluster_index = (start >> s->cluster_bits);
 854                }
 855            }
 856            if (ret < 0) {
 857                goto fail;
 858            }
 859        }
 860        old_table_index = table_index;
 861
 862        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
 863
 864        /* we can update the count and save it */
 865        block_index = cluster_index & (s->refcount_block_size - 1);
 866
 867        refcount = s->get_refcount(refcount_block, block_index);
 868        if (decrease ? (refcount - addend > refcount)
 869                     : (refcount + addend < refcount ||
 870                        refcount + addend > s->refcount_max))
 871        {
 872            ret = -EINVAL;
 873            goto fail;
 874        }
 875        if (decrease) {
 876            refcount -= addend;
 877        } else {
 878            refcount += addend;
 879        }
 880        if (refcount == 0 && cluster_index < s->free_cluster_index) {
 881            s->free_cluster_index = cluster_index;
 882        }
 883        s->set_refcount(refcount_block, block_index, refcount);
 884
 885        if (refcount == 0) {
 886            void *table;
 887
 888            table = qcow2_cache_is_table_offset(s->refcount_block_cache,
 889                                                offset);
 890            if (table != NULL) {
 891                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
 892                qcow2_cache_discard(s->refcount_block_cache, table);
 893            }
 894
 895            table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
 896            if (table != NULL) {
 897                qcow2_cache_discard(s->l2_table_cache, table);
 898            }
 899
 900            if (s->discard_passthrough[type]) {
 901                update_refcount_discard(bs, cluster_offset, s->cluster_size);
 902            }
 903        }
 904    }
 905
 906    ret = 0;
 907fail:
 908    if (!s->cache_discards) {
 909        qcow2_process_discards(bs, ret);
 910    }
 911
 912    /* Write last changed block to disk */
 913    if (refcount_block) {
 914        qcow2_cache_put(s->refcount_block_cache, &refcount_block);
 915    }
 916
 917    /*
 918     * Try do undo any updates if an error is returned (This may succeed in
 919     * some cases like ENOSPC for allocating a new refcount block)
 920     */
 921    if (ret < 0) {
 922        int dummy;
 923        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
 924                                !decrease, QCOW2_DISCARD_NEVER);
 925        (void)dummy;
 926    }
 927
 928    return ret;
 929}
 930
 931/*
 932 * Increases or decreases the refcount of a given cluster.
 933 *
 934 * @addend is the absolute value of the addend; if @decrease is set, @addend
 935 * will be subtracted from the current refcount, otherwise it will be added.
 936 *
 937 * On success 0 is returned; on failure -errno is returned.
 938 */
 939int qcow2_update_cluster_refcount(BlockDriverState *bs,
 940                                  int64_t cluster_index,
 941                                  uint64_t addend, bool decrease,
 942                                  enum qcow2_discard_type type)
 943{
 944    BDRVQcow2State *s = bs->opaque;
 945    int ret;
 946
 947    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
 948                          decrease, type);
 949    if (ret < 0) {
 950        return ret;
 951    }
 952
 953    return 0;
 954}
 955
 956
 957
 958/*********************************************************/
 959/* cluster allocation functions */
 960
 961
 962
 963/* return < 0 if error */
 964static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
 965                                    uint64_t max)
 966{
 967    BDRVQcow2State *s = bs->opaque;
 968    uint64_t i, nb_clusters, refcount;
 969    int ret;
 970
 971    /* We can't allocate clusters if they may still be queued for discard. */
 972    if (s->cache_discards) {
 973        qcow2_process_discards(bs, 0);
 974    }
 975
 976    nb_clusters = size_to_clusters(s, size);
 977retry:
 978    for(i = 0; i < nb_clusters; i++) {
 979        uint64_t next_cluster_index = s->free_cluster_index++;
 980        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
 981
 982        if (ret < 0) {
 983            return ret;
 984        } else if (refcount != 0) {
 985            goto retry;
 986        }
 987    }
 988
 989    /* Make sure that all offsets in the "allocated" range are representable
 990     * in the requested max */
 991    if (s->free_cluster_index > 0 &&
 992        s->free_cluster_index - 1 > (max >> s->cluster_bits))
 993    {
 994        return -EFBIG;
 995    }
 996
 997#ifdef DEBUG_ALLOC2
 998    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
 999            size,
1000            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
1001#endif
1002    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
1003}
1004
1005int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
1006{
1007    int64_t offset;
1008    int ret;
1009
1010    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
1011    do {
1012        offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET);
1013        if (offset < 0) {
1014            return offset;
1015        }
1016
1017        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1018    } while (ret == -EAGAIN);
1019
1020    if (ret < 0) {
1021        return ret;
1022    }
1023
1024    return offset;
1025}
1026
1027int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
1028                                int64_t nb_clusters)
1029{
1030    BDRVQcow2State *s = bs->opaque;
1031    uint64_t cluster_index, refcount;
1032    uint64_t i;
1033    int ret;
1034
1035    assert(nb_clusters >= 0);
1036    if (nb_clusters == 0) {
1037        return 0;
1038    }
1039
1040    do {
1041        /* Check how many clusters there are free */
1042        cluster_index = offset >> s->cluster_bits;
1043        for(i = 0; i < nb_clusters; i++) {
1044            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
1045            if (ret < 0) {
1046                return ret;
1047            } else if (refcount != 0) {
1048                break;
1049            }
1050        }
1051
1052        /* And then allocate them */
1053        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
1054                              QCOW2_DISCARD_NEVER);
1055    } while (ret == -EAGAIN);
1056
1057    if (ret < 0) {
1058        return ret;
1059    }
1060
1061    return i;
1062}
1063
1064/* only used to allocate compressed sectors. We try to allocate
1065   contiguous sectors. size must be <= cluster_size */
1066int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
1067{
1068    BDRVQcow2State *s = bs->opaque;
1069    int64_t offset;
1070    size_t free_in_cluster;
1071    int ret;
1072
1073    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
1074    assert(size > 0 && size <= s->cluster_size);
1075    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));
1076
1077    offset = s->free_byte_offset;
1078
1079    if (offset) {
1080        uint64_t refcount;
1081        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
1082        if (ret < 0) {
1083            return ret;
1084        }
1085
1086        if (refcount == s->refcount_max) {
1087            offset = 0;
1088        }
1089    }
1090
1091    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
1092    do {
1093        if (!offset || free_in_cluster < size) {
1094            int64_t new_cluster;
1095
1096            new_cluster = alloc_clusters_noref(bs, s->cluster_size,
1097                                               MIN(s->cluster_offset_mask,
1098                                                   QCOW_MAX_CLUSTER_OFFSET));
1099            if (new_cluster < 0) {
1100                return new_cluster;
1101            }
1102
1103            if (new_cluster == 0) {
1104                qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
1105                                        "allocation of compressed cluster "
1106                                        "at offset 0");
1107                return -EIO;
1108            }
1109
1110            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
1111                offset = new_cluster;
1112                free_in_cluster = s->cluster_size;
1113            } else {
1114                free_in_cluster += s->cluster_size;
1115            }
1116        }
1117
1118        assert(offset);
1119        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1120        if (ret < 0) {
1121            offset = 0;
1122        }
1123    } while (ret == -EAGAIN);
1124    if (ret < 0) {
1125        return ret;
1126    }
1127
1128    /* The cluster refcount was incremented; refcount blocks must be flushed
1129     * before the caller's L2 table updates. */
1130    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
1131
1132    s->free_byte_offset = offset + size;
1133    if (!offset_into_cluster(s, s->free_byte_offset)) {
1134        s->free_byte_offset = 0;
1135    }
1136
1137    return offset;
1138}
1139
1140void qcow2_free_clusters(BlockDriverState *bs,
1141                          int64_t offset, int64_t size,
1142                          enum qcow2_discard_type type)
1143{
1144    int ret;
1145
1146    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
1147    ret = update_refcount(bs, offset, size, 1, true, type);
1148    if (ret < 0) {
1149        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
1150        /* TODO Remember the clusters to free them later and avoid leaking */
1151    }
1152}
1153
1154/*
1155 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1156 * normal cluster, compressed cluster, etc.)
1157 */
1158void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
1159                             int nb_clusters, enum qcow2_discard_type type)
1160{
1161    BDRVQcow2State *s = bs->opaque;
1162    QCow2ClusterType ctype = qcow2_get_cluster_type(bs, l2_entry);
1163
1164    if (has_data_file(bs)) {
1165        if (s->discard_passthrough[type] &&
1166            (ctype == QCOW2_CLUSTER_NORMAL ||
1167             ctype == QCOW2_CLUSTER_ZERO_ALLOC))
1168        {
1169            bdrv_pdiscard(s->data_file, l2_entry & L2E_OFFSET_MASK,
1170                          nb_clusters << s->cluster_bits);
1171        }
1172        return;
1173    }
1174
1175    switch (ctype) {
1176    case QCOW2_CLUSTER_COMPRESSED:
1177        {
1178            int64_t offset = (l2_entry & s->cluster_offset_mask)
1179                & QCOW2_COMPRESSED_SECTOR_MASK;
1180            int size = QCOW2_COMPRESSED_SECTOR_SIZE *
1181                (((l2_entry >> s->csize_shift) & s->csize_mask) + 1);
1182            qcow2_free_clusters(bs, offset, size, type);
1183        }
1184        break;
1185    case QCOW2_CLUSTER_NORMAL:
1186    case QCOW2_CLUSTER_ZERO_ALLOC:
1187        if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1188            qcow2_signal_corruption(bs, false, -1, -1,
1189                                    "Cannot free unaligned cluster %#llx",
1190                                    l2_entry & L2E_OFFSET_MASK);
1191        } else {
1192            qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1193                                nb_clusters << s->cluster_bits, type);
1194        }
1195        break;
1196    case QCOW2_CLUSTER_ZERO_PLAIN:
1197    case QCOW2_CLUSTER_UNALLOCATED:
1198        break;
1199    default:
1200        abort();
1201    }
1202}
1203
1204int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
1205{
1206    BDRVQcow2State *s = bs->opaque;
1207    int ret;
1208
1209    ret = qcow2_cache_write(bs, s->l2_table_cache);
1210    if (ret < 0) {
1211        return ret;
1212    }
1213
1214    if (qcow2_need_accurate_refcounts(s)) {
1215        ret = qcow2_cache_write(bs, s->refcount_block_cache);
1216        if (ret < 0) {
1217            return ret;
1218        }
1219    }
1220
1221    return 0;
1222}
1223
1224int coroutine_fn qcow2_flush_caches(BlockDriverState *bs)
1225{
1226    int ret = qcow2_write_caches(bs);
1227    if (ret < 0) {
1228        return ret;
1229    }
1230
1231    return bdrv_flush(bs->file->bs);
1232}
1233
1234/*********************************************************/
1235/* snapshots and image creation */
1236
1237
1238
1239/* update the refcounts of snapshots and the copied flag */
1240int qcow2_update_snapshot_refcount(BlockDriverState *bs,
1241    int64_t l1_table_offset, int l1_size, int addend)
1242{
1243    BDRVQcow2State *s = bs->opaque;
1244    uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
1245    bool l1_allocated = false;
1246    int64_t old_entry, old_l2_offset;
1247    unsigned slice, slice_size2, n_slices;
1248    int i, j, l1_modified = 0, nb_csectors;
1249    int ret;
1250
1251    assert(addend >= -1 && addend <= 1);
1252
1253    l2_slice = NULL;
1254    l1_table = NULL;
1255    l1_size2 = l1_size * sizeof(uint64_t);
1256    slice_size2 = s->l2_slice_size * sizeof(uint64_t);
1257    n_slices = s->cluster_size / slice_size2;
1258
1259    s->cache_discards = true;
1260
1261    /* WARNING: qcow2_snapshot_goto relies on this function not using the
1262     * l1_table_offset when it is the current s->l1_table_offset! Be careful
1263     * when changing this! */
1264    if (l1_table_offset != s->l1_table_offset) {
1265        l1_table = g_try_malloc0(ROUND_UP(l1_size2, 512));
1266        if (l1_size2 && l1_table == NULL) {
1267            ret = -ENOMEM;
1268            goto fail;
1269        }
1270        l1_allocated = true;
1271
1272        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1273        if (ret < 0) {
1274            goto fail;
1275        }
1276
1277        for (i = 0; i < l1_size; i++) {
1278            be64_to_cpus(&l1_table[i]);
1279        }
1280    } else {
1281        assert(l1_size == s->l1_size);
1282        l1_table = s->l1_table;
1283        l1_allocated = false;
1284    }
1285
1286    for (i = 0; i < l1_size; i++) {
1287        l2_offset = l1_table[i];
1288        if (l2_offset) {
1289            old_l2_offset = l2_offset;
1290            l2_offset &= L1E_OFFSET_MASK;
1291
1292            if (offset_into_cluster(s, l2_offset)) {
1293                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
1294                                        PRIx64 " unaligned (L1 index: %#x)",
1295                                        l2_offset, i);
1296                ret = -EIO;
1297                goto fail;
1298            }
1299
1300            for (slice = 0; slice < n_slices; slice++) {
1301                ret = qcow2_cache_get(bs, s->l2_table_cache,
1302                                      l2_offset + slice * slice_size2,
1303                                      (void **) &l2_slice);
1304                if (ret < 0) {
1305                    goto fail;
1306                }
1307
1308                for (j = 0; j < s->l2_slice_size; j++) {
1309                    uint64_t cluster_index;
1310                    uint64_t offset;
1311
1312                    entry = be64_to_cpu(l2_slice[j]);
1313                    old_entry = entry;
1314                    entry &= ~QCOW_OFLAG_COPIED;
1315                    offset = entry & L2E_OFFSET_MASK;
1316
1317                    switch (qcow2_get_cluster_type(bs, entry)) {
1318                    case QCOW2_CLUSTER_COMPRESSED:
1319                        nb_csectors = ((entry >> s->csize_shift) &
1320                                       s->csize_mask) + 1;
1321                        if (addend != 0) {
1322                            uint64_t coffset = (entry & s->cluster_offset_mask)
1323                                & QCOW2_COMPRESSED_SECTOR_MASK;
1324                            ret = update_refcount(
1325                                bs, coffset,
1326                                nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE,
1327                                abs(addend), addend < 0,
1328                                QCOW2_DISCARD_SNAPSHOT);
1329                            if (ret < 0) {
1330                                goto fail;
1331                            }
1332                        }
1333                        /* compressed clusters are never modified */
1334                        refcount = 2;
1335                        break;
1336
1337                    case QCOW2_CLUSTER_NORMAL:
1338                    case QCOW2_CLUSTER_ZERO_ALLOC:
1339                        if (offset_into_cluster(s, offset)) {
1340                            /* Here l2_index means table (not slice) index */
1341                            int l2_index = slice * s->l2_slice_size + j;
1342                            qcow2_signal_corruption(
1343                                bs, true, -1, -1, "Cluster "
1344                                "allocation offset %#" PRIx64
1345                                " unaligned (L2 offset: %#"
1346                                PRIx64 ", L2 index: %#x)",
1347                                offset, l2_offset, l2_index);
1348                            ret = -EIO;
1349                            goto fail;
1350                        }
1351
1352                        cluster_index = offset >> s->cluster_bits;
1353                        assert(cluster_index);
1354                        if (addend != 0) {
1355                            ret = qcow2_update_cluster_refcount(
1356                                bs, cluster_index, abs(addend), addend < 0,
1357                                QCOW2_DISCARD_SNAPSHOT);
1358                            if (ret < 0) {
1359                                goto fail;
1360                            }
1361                        }
1362
1363                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
1364                        if (ret < 0) {
1365                            goto fail;
1366                        }
1367                        break;
1368
1369                    case QCOW2_CLUSTER_ZERO_PLAIN:
1370                    case QCOW2_CLUSTER_UNALLOCATED:
1371                        refcount = 0;
1372                        break;
1373
1374                    default:
1375                        abort();
1376                    }
1377
1378                    if (refcount == 1) {
1379                        entry |= QCOW_OFLAG_COPIED;
1380                    }
1381                    if (entry != old_entry) {
1382                        if (addend > 0) {
1383                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
1384                                                       s->refcount_block_cache);
1385                        }
1386                        l2_slice[j] = cpu_to_be64(entry);
1387                        qcow2_cache_entry_mark_dirty(s->l2_table_cache,
1388                                                     l2_slice);
1389                    }
1390                }
1391
1392                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1393            }
1394
1395            if (addend != 0) {
1396                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
1397                                                        s->cluster_bits,
1398                                                    abs(addend), addend < 0,
1399                                                    QCOW2_DISCARD_SNAPSHOT);
1400                if (ret < 0) {
1401                    goto fail;
1402                }
1403            }
1404            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1405                                     &refcount);
1406            if (ret < 0) {
1407                goto fail;
1408            } else if (refcount == 1) {
1409                l2_offset |= QCOW_OFLAG_COPIED;
1410            }
1411            if (l2_offset != old_l2_offset) {
1412                l1_table[i] = l2_offset;
1413                l1_modified = 1;
1414            }
1415        }
1416    }
1417
1418    ret = bdrv_flush(bs);
1419fail:
1420    if (l2_slice) {
1421        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1422    }
1423
1424    s->cache_discards = false;
1425    qcow2_process_discards(bs, ret);
1426
1427    /* Update L1 only if it isn't deleted anyway (addend = -1) */
1428    if (ret == 0 && addend >= 0 && l1_modified) {
1429        for (i = 0; i < l1_size; i++) {
1430            cpu_to_be64s(&l1_table[i]);
1431        }
1432
1433        ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
1434                               l1_table, l1_size2);
1435
1436        for (i = 0; i < l1_size; i++) {
1437            be64_to_cpus(&l1_table[i]);
1438        }
1439    }
1440    if (l1_allocated)
1441        g_free(l1_table);
1442    return ret;
1443}
1444
1445
1446
1447
1448/*********************************************************/
1449/* refcount checking functions */
1450
1451
1452static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1453{
1454    /* This assertion holds because there is no way we can address more than
1455     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1456     * offsets have to be representable in bytes); due to every cluster
1457     * corresponding to one refcount entry, we are well below that limit */
1458    assert(entries < (UINT64_C(1) << (64 - 9)));
1459
1460    /* Thanks to the assertion this will not overflow, because
1461     * s->refcount_order < 7.
1462     * (note: x << s->refcount_order == x * s->refcount_bits) */
1463    return DIV_ROUND_UP(entries << s->refcount_order, 8);
1464}
1465
1466/**
1467 * Reallocates *array so that it can hold new_size entries. *size must contain
1468 * the current number of entries in *array. If the reallocation fails, *array
1469 * and *size will not be modified and -errno will be returned. If the
1470 * reallocation is successful, *array will be set to the new buffer, *size
1471 * will be set to new_size and 0 will be returned. The size of the reallocated
1472 * refcount array buffer will be aligned to a cluster boundary, and the newly
1473 * allocated area will be zeroed.
1474 */
1475static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1476                                  int64_t *size, int64_t new_size)
1477{
1478    int64_t old_byte_size, new_byte_size;
1479    void *new_ptr;
1480
1481    /* Round to clusters so the array can be directly written to disk */
1482    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1483                    * s->cluster_size;
1484    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1485                    * s->cluster_size;
1486
1487    if (new_byte_size == old_byte_size) {
1488        *size = new_size;
1489        return 0;
1490    }
1491
1492    assert(new_byte_size > 0);
1493
1494    if (new_byte_size > SIZE_MAX) {
1495        return -ENOMEM;
1496    }
1497
1498    new_ptr = g_try_realloc(*array, new_byte_size);
1499    if (!new_ptr) {
1500        return -ENOMEM;
1501    }
1502
1503    if (new_byte_size > old_byte_size) {
1504        memset((char *)new_ptr + old_byte_size, 0,
1505               new_byte_size - old_byte_size);
1506    }
1507
1508    *array = new_ptr;
1509    *size  = new_size;
1510
1511    return 0;
1512}
1513
1514/*
1515 * Increases the refcount for a range of clusters in a given refcount table.
1516 * This is used to construct a temporary refcount table out of L1 and L2 tables
1517 * which can be compared to the refcount table saved in the image.
1518 *
1519 * Modifies the number of errors in res.
1520 */
1521int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
1522                             void **refcount_table,
1523                             int64_t *refcount_table_size,
1524                             int64_t offset, int64_t size)
1525{
1526    BDRVQcow2State *s = bs->opaque;
1527    uint64_t start, last, cluster_offset, k, refcount;
1528    int64_t file_len;
1529    int ret;
1530
1531    if (size <= 0) {
1532        return 0;
1533    }
1534
1535    file_len = bdrv_getlength(bs->file->bs);
1536    if (file_len < 0) {
1537        return file_len;
1538    }
1539
1540    /*
1541     * Last cluster of qcow2 image may be semi-allocated, so it may be OK to
1542     * reference some space after file end but it should be less than one
1543     * cluster.
1544     */
1545    if (offset + size - file_len >= s->cluster_size) {
1546        fprintf(stderr, "ERROR: counting reference for region exceeding the "
1547                "end of the file by one cluster or more: offset 0x%" PRIx64
1548                " size 0x%" PRIx64 "\n", offset, size);
1549        res->corruptions++;
1550        return 0;
1551    }
1552
1553    start = start_of_cluster(s, offset);
1554    last = start_of_cluster(s, offset + size - 1);
1555    for(cluster_offset = start; cluster_offset <= last;
1556        cluster_offset += s->cluster_size) {
1557        k = cluster_offset >> s->cluster_bits;
1558        if (k >= *refcount_table_size) {
1559            ret = realloc_refcount_array(s, refcount_table,
1560                                         refcount_table_size, k + 1);
1561            if (ret < 0) {
1562                res->check_errors++;
1563                return ret;
1564            }
1565        }
1566
1567        refcount = s->get_refcount(*refcount_table, k);
1568        if (refcount == s->refcount_max) {
1569            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1570                    "\n", cluster_offset);
1571            fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1572                    "width or qemu-img convert to create a clean copy if the "
1573                    "image cannot be opened for writing\n");
1574            res->corruptions++;
1575            continue;
1576        }
1577        s->set_refcount(*refcount_table, k, refcount + 1);
1578    }
1579
1580    return 0;
1581}
1582
1583/* Flags for check_refcounts_l1() and check_refcounts_l2() */
1584enum {
1585    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1586};
1587
1588/*
1589 * Increases the refcount in the given refcount table for the all clusters
1590 * referenced in the L2 table. While doing so, performs some checks on L2
1591 * entries.
1592 *
1593 * Returns the number of errors found by the checks or -errno if an internal
1594 * error occurred.
1595 */
1596static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1597                              void **refcount_table,
1598                              int64_t *refcount_table_size, int64_t l2_offset,
1599                              int flags, BdrvCheckMode fix, bool active)
1600{
1601    BDRVQcow2State *s = bs->opaque;
1602    uint64_t *l2_table, l2_entry;
1603    uint64_t next_contiguous_offset = 0;
1604    int i, l2_size, nb_csectors, ret;
1605
1606    /* Read L2 table from disk */
1607    l2_size = s->l2_size * sizeof(uint64_t);
1608    l2_table = g_malloc(l2_size);
1609
1610    ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
1611    if (ret < 0) {
1612        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1613        res->check_errors++;
1614        goto fail;
1615    }
1616
1617    /* Do the actual checks */
1618    for(i = 0; i < s->l2_size; i++) {
1619        l2_entry = be64_to_cpu(l2_table[i]);
1620
1621        switch (qcow2_get_cluster_type(bs, l2_entry)) {
1622        case QCOW2_CLUSTER_COMPRESSED:
1623            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1624            if (l2_entry & QCOW_OFLAG_COPIED) {
1625                fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
1626                    "copied flag must never be set for compressed "
1627                    "clusters\n", l2_entry & s->cluster_offset_mask);
1628                l2_entry &= ~QCOW_OFLAG_COPIED;
1629                res->corruptions++;
1630            }
1631
1632            if (has_data_file(bs)) {
1633                fprintf(stderr, "ERROR compressed cluster %d with data file, "
1634                        "entry=0x%" PRIx64 "\n", i, l2_entry);
1635                res->corruptions++;
1636                break;
1637            }
1638
1639            /* Mark cluster as used */
1640            nb_csectors = ((l2_entry >> s->csize_shift) &
1641                           s->csize_mask) + 1;
1642            l2_entry &= s->cluster_offset_mask;
1643            ret = qcow2_inc_refcounts_imrt(
1644                bs, res, refcount_table, refcount_table_size,
1645                l2_entry & QCOW2_COMPRESSED_SECTOR_MASK,
1646                nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE);
1647            if (ret < 0) {
1648                goto fail;
1649            }
1650
1651            if (flags & CHECK_FRAG_INFO) {
1652                res->bfi.allocated_clusters++;
1653                res->bfi.compressed_clusters++;
1654
1655                /* Compressed clusters are fragmented by nature.  Since they
1656                 * take up sub-sector space but we only have sector granularity
1657                 * I/O we need to re-read the same sectors even for adjacent
1658                 * compressed clusters.
1659                 */
1660                res->bfi.fragmented_clusters++;
1661            }
1662            break;
1663
1664        case QCOW2_CLUSTER_ZERO_ALLOC:
1665        case QCOW2_CLUSTER_NORMAL:
1666        {
1667            uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1668
1669            /* Correct offsets are cluster aligned */
1670            if (offset_into_cluster(s, offset)) {
1671                res->corruptions++;
1672
1673                if (qcow2_get_cluster_type(bs, l2_entry) ==
1674                    QCOW2_CLUSTER_ZERO_ALLOC)
1675                {
1676                    fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated zero "
1677                            "cluster is not properly aligned; L2 entry "
1678                            "corrupted.\n",
1679                            fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
1680                            offset);
1681                    if (fix & BDRV_FIX_ERRORS) {
1682                        uint64_t l2e_offset =
1683                            l2_offset + (uint64_t)i * sizeof(uint64_t);
1684                        int ign = active ? QCOW2_OL_ACTIVE_L2 :
1685                                           QCOW2_OL_INACTIVE_L2;
1686
1687                        l2_entry = QCOW_OFLAG_ZERO;
1688                        l2_table[i] = cpu_to_be64(l2_entry);
1689                        ret = qcow2_pre_write_overlap_check(bs, ign,
1690                                l2e_offset, sizeof(uint64_t), false);
1691                        if (ret < 0) {
1692                            fprintf(stderr, "ERROR: Overlap check failed\n");
1693                            res->check_errors++;
1694                            /* Something is seriously wrong, so abort checking
1695                             * this L2 table */
1696                            goto fail;
1697                        }
1698
1699                        ret = bdrv_pwrite_sync(bs->file, l2e_offset,
1700                                               &l2_table[i], sizeof(uint64_t));
1701                        if (ret < 0) {
1702                            fprintf(stderr, "ERROR: Failed to overwrite L2 "
1703                                    "table entry: %s\n", strerror(-ret));
1704                            res->check_errors++;
1705                            /* Do not abort, continue checking the rest of this
1706                             * L2 table's entries */
1707                        } else {
1708                            res->corruptions--;
1709                            res->corruptions_fixed++;
1710                            /* Skip marking the cluster as used
1711                             * (it is unused now) */
1712                            continue;
1713                        }
1714                    }
1715                } else {
1716                    fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
1717                        "not properly aligned; L2 entry corrupted.\n", offset);
1718                }
1719            }
1720
1721            if (flags & CHECK_FRAG_INFO) {
1722                res->bfi.allocated_clusters++;
1723                if (next_contiguous_offset &&
1724                    offset != next_contiguous_offset) {
1725                    res->bfi.fragmented_clusters++;
1726                }
1727                next_contiguous_offset = offset + s->cluster_size;
1728            }
1729
1730            /* Mark cluster as used */
1731            if (!has_data_file(bs)) {
1732                ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table,
1733                                               refcount_table_size,
1734                                               offset, s->cluster_size);
1735                if (ret < 0) {
1736                    goto fail;
1737                }
1738            }
1739            break;
1740        }
1741
1742        case QCOW2_CLUSTER_ZERO_PLAIN:
1743        case QCOW2_CLUSTER_UNALLOCATED:
1744            break;
1745
1746        default:
1747            abort();
1748        }
1749    }
1750
1751    g_free(l2_table);
1752    return 0;
1753
1754fail:
1755    g_free(l2_table);
1756    return ret;
1757}
1758
1759/*
1760 * Increases the refcount for the L1 table, its L2 tables and all referenced
1761 * clusters in the given refcount table. While doing so, performs some checks
1762 * on L1 and L2 entries.
1763 *
1764 * Returns the number of errors found by the checks or -errno if an internal
1765 * error occurred.
1766 */
1767static int check_refcounts_l1(BlockDriverState *bs,
1768                              BdrvCheckResult *res,
1769                              void **refcount_table,
1770                              int64_t *refcount_table_size,
1771                              int64_t l1_table_offset, int l1_size,
1772                              int flags, BdrvCheckMode fix, bool active)
1773{
1774    BDRVQcow2State *s = bs->opaque;
1775    uint64_t *l1_table = NULL, l2_offset, l1_size2;
1776    int i, ret;
1777
1778    l1_size2 = l1_size * sizeof(uint64_t);
1779
1780    /* Mark L1 table as used */
1781    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1782                                   l1_table_offset, l1_size2);
1783    if (ret < 0) {
1784        goto fail;
1785    }
1786
1787    /* Read L1 table entries from disk */
1788    if (l1_size2 > 0) {
1789        l1_table = g_try_malloc(l1_size2);
1790        if (l1_table == NULL) {
1791            ret = -ENOMEM;
1792            res->check_errors++;
1793            goto fail;
1794        }
1795        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1796        if (ret < 0) {
1797            fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1798            res->check_errors++;
1799            goto fail;
1800        }
1801        for(i = 0;i < l1_size; i++)
1802            be64_to_cpus(&l1_table[i]);
1803    }
1804
1805    /* Do the actual checks */
1806    for(i = 0; i < l1_size; i++) {
1807        l2_offset = l1_table[i];
1808        if (l2_offset) {
1809            /* Mark L2 table as used */
1810            l2_offset &= L1E_OFFSET_MASK;
1811            ret = qcow2_inc_refcounts_imrt(bs, res,
1812                                           refcount_table, refcount_table_size,
1813                                           l2_offset, s->cluster_size);
1814            if (ret < 0) {
1815                goto fail;
1816            }
1817
1818            /* L2 tables are cluster aligned */
1819            if (offset_into_cluster(s, l2_offset)) {
1820                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1821                    "cluster aligned; L1 entry corrupted\n", l2_offset);
1822                res->corruptions++;
1823            }
1824
1825            /* Process and check L2 entries */
1826            ret = check_refcounts_l2(bs, res, refcount_table,
1827                                     refcount_table_size, l2_offset, flags,
1828                                     fix, active);
1829            if (ret < 0) {
1830                goto fail;
1831            }
1832        }
1833    }
1834    g_free(l1_table);
1835    return 0;
1836
1837fail:
1838    g_free(l1_table);
1839    return ret;
1840}
1841
1842/*
1843 * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1844 *
1845 * This function does not print an error message nor does it increment
1846 * check_errors if qcow2_get_refcount fails (this is because such an error will
1847 * have been already detected and sufficiently signaled by the calling function
1848 * (qcow2_check_refcounts) by the time this function is called).
1849 */
1850static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1851                              BdrvCheckMode fix)
1852{
1853    BDRVQcow2State *s = bs->opaque;
1854    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1855    int ret;
1856    uint64_t refcount;
1857    int i, j;
1858    bool repair;
1859
1860    if (fix & BDRV_FIX_ERRORS) {
1861        /* Always repair */
1862        repair = true;
1863    } else if (fix & BDRV_FIX_LEAKS) {
1864        /* Repair only if that seems safe: This function is always
1865         * called after the refcounts have been fixed, so the refcount
1866         * is accurate if that repair was successful */
1867        repair = !res->check_errors && !res->corruptions && !res->leaks;
1868    } else {
1869        repair = false;
1870    }
1871
1872    for (i = 0; i < s->l1_size; i++) {
1873        uint64_t l1_entry = s->l1_table[i];
1874        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1875        int l2_dirty = 0;
1876
1877        if (!l2_offset) {
1878            continue;
1879        }
1880
1881        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1882                                 &refcount);
1883        if (ret < 0) {
1884            /* don't print message nor increment check_errors */
1885            continue;
1886        }
1887        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1888            res->corruptions++;
1889            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1890                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1891                    repair ? "Repairing" : "ERROR", i, l1_entry, refcount);
1892            if (repair) {
1893                s->l1_table[i] = refcount == 1
1894                               ? l1_entry |  QCOW_OFLAG_COPIED
1895                               : l1_entry & ~QCOW_OFLAG_COPIED;
1896                ret = qcow2_write_l1_entry(bs, i);
1897                if (ret < 0) {
1898                    res->check_errors++;
1899                    goto fail;
1900                }
1901                res->corruptions--;
1902                res->corruptions_fixed++;
1903            }
1904        }
1905
1906        ret = bdrv_pread(bs->file, l2_offset, l2_table,
1907                         s->l2_size * sizeof(uint64_t));
1908        if (ret < 0) {
1909            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1910                    strerror(-ret));
1911            res->check_errors++;
1912            goto fail;
1913        }
1914
1915        for (j = 0; j < s->l2_size; j++) {
1916            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1917            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1918            QCow2ClusterType cluster_type = qcow2_get_cluster_type(bs, l2_entry);
1919
1920            if (cluster_type == QCOW2_CLUSTER_NORMAL ||
1921                cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
1922                if (has_data_file(bs)) {
1923                    refcount = 1;
1924                } else {
1925                    ret = qcow2_get_refcount(bs,
1926                                             data_offset >> s->cluster_bits,
1927                                             &refcount);
1928                    if (ret < 0) {
1929                        /* don't print message nor increment check_errors */
1930                        continue;
1931                    }
1932                }
1933                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1934                    res->corruptions++;
1935                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1936                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1937                            repair ? "Repairing" : "ERROR", l2_entry, refcount);
1938                    if (repair) {
1939                        l2_table[j] = cpu_to_be64(refcount == 1
1940                                    ? l2_entry |  QCOW_OFLAG_COPIED
1941                                    : l2_entry & ~QCOW_OFLAG_COPIED);
1942                        l2_dirty++;
1943                    }
1944                }
1945            }
1946        }
1947
1948        if (l2_dirty > 0) {
1949            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1950                                                l2_offset, s->cluster_size,
1951                                                false);
1952            if (ret < 0) {
1953                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1954                        "overlap check failed: %s\n", strerror(-ret));
1955                res->check_errors++;
1956                goto fail;
1957            }
1958
1959            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
1960                              s->cluster_size);
1961            if (ret < 0) {
1962                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1963                        strerror(-ret));
1964                res->check_errors++;
1965                goto fail;
1966            }
1967            res->corruptions -= l2_dirty;
1968            res->corruptions_fixed += l2_dirty;
1969        }
1970    }
1971
1972    ret = 0;
1973
1974fail:
1975    qemu_vfree(l2_table);
1976    return ret;
1977}
1978
1979/*
1980 * Checks consistency of refblocks and accounts for each refblock in
1981 * *refcount_table.
1982 */
1983static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
1984                           BdrvCheckMode fix, bool *rebuild,
1985                           void **refcount_table, int64_t *nb_clusters)
1986{
1987    BDRVQcow2State *s = bs->opaque;
1988    int64_t i, size;
1989    int ret;
1990
1991    for(i = 0; i < s->refcount_table_size; i++) {
1992        uint64_t offset, cluster;
1993        offset = s->refcount_table[i];
1994        cluster = offset >> s->cluster_bits;
1995
1996        /* Refcount blocks are cluster aligned */
1997        if (offset_into_cluster(s, offset)) {
1998            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1999                "cluster aligned; refcount table entry corrupted\n", i);
2000            res->corruptions++;
2001            *rebuild = true;
2002            continue;
2003        }
2004
2005        if (cluster >= *nb_clusters) {
2006            res->corruptions++;
2007            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
2008                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
2009
2010            if (fix & BDRV_FIX_ERRORS) {
2011                int64_t new_nb_clusters;
2012                Error *local_err = NULL;
2013
2014                if (offset > INT64_MAX - s->cluster_size) {
2015                    ret = -EINVAL;
2016                    goto resize_fail;
2017                }
2018
2019                ret = bdrv_truncate(bs->file, offset + s->cluster_size,
2020                                    PREALLOC_MODE_OFF, &local_err);
2021                if (ret < 0) {
2022                    error_report_err(local_err);
2023                    goto resize_fail;
2024                }
2025                size = bdrv_getlength(bs->file->bs);
2026                if (size < 0) {
2027                    ret = size;
2028                    goto resize_fail;
2029                }
2030
2031                new_nb_clusters = size_to_clusters(s, size);
2032                assert(new_nb_clusters >= *nb_clusters);
2033
2034                ret = realloc_refcount_array(s, refcount_table,
2035                                             nb_clusters, new_nb_clusters);
2036                if (ret < 0) {
2037                    res->check_errors++;
2038                    return ret;
2039                }
2040
2041                if (cluster >= *nb_clusters) {
2042                    ret = -EINVAL;
2043                    goto resize_fail;
2044                }
2045
2046                res->corruptions--;
2047                res->corruptions_fixed++;
2048                ret = qcow2_inc_refcounts_imrt(bs, res,
2049                                               refcount_table, nb_clusters,
2050                                               offset, s->cluster_size);
2051                if (ret < 0) {
2052                    return ret;
2053                }
2054                /* No need to check whether the refcount is now greater than 1:
2055                 * This area was just allocated and zeroed, so it can only be
2056                 * exactly 1 after qcow2_inc_refcounts_imrt() */
2057                continue;
2058
2059resize_fail:
2060                *rebuild = true;
2061                fprintf(stderr, "ERROR could not resize image: %s\n",
2062                        strerror(-ret));
2063            }
2064            continue;
2065        }
2066
2067        if (offset != 0) {
2068            ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2069                                           offset, s->cluster_size);
2070            if (ret < 0) {
2071                return ret;
2072            }
2073            if (s->get_refcount(*refcount_table, cluster) != 1) {
2074                fprintf(stderr, "ERROR refcount block %" PRId64
2075                        " refcount=%" PRIu64 "\n", i,
2076                        s->get_refcount(*refcount_table, cluster));
2077                res->corruptions++;
2078                *rebuild = true;
2079            }
2080        }
2081    }
2082
2083    return 0;
2084}
2085
2086/*
2087 * Calculates an in-memory refcount table.
2088 */
2089static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2090                               BdrvCheckMode fix, bool *rebuild,
2091                               void **refcount_table, int64_t *nb_clusters)
2092{
2093    BDRVQcow2State *s = bs->opaque;
2094    int64_t i;
2095    QCowSnapshot *sn;
2096    int ret;
2097
2098    if (!*refcount_table) {
2099        int64_t old_size = 0;
2100        ret = realloc_refcount_array(s, refcount_table,
2101                                     &old_size, *nb_clusters);
2102        if (ret < 0) {
2103            res->check_errors++;
2104            return ret;
2105        }
2106    }
2107
2108    /* header */
2109    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2110                                   0, s->cluster_size);
2111    if (ret < 0) {
2112        return ret;
2113    }
2114
2115    /* current L1 table */
2116    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2117                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
2118                             fix, true);
2119    if (ret < 0) {
2120        return ret;
2121    }
2122
2123    /* snapshots */
2124    if (has_data_file(bs) && s->nb_snapshots) {
2125        fprintf(stderr, "ERROR %d snapshots in image with data file\n",
2126                s->nb_snapshots);
2127        res->corruptions++;
2128    }
2129
2130    for (i = 0; i < s->nb_snapshots; i++) {
2131        sn = s->snapshots + i;
2132        if (offset_into_cluster(s, sn->l1_table_offset)) {
2133            fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": "
2134                    "L1 table is not cluster aligned; snapshot table entry "
2135                    "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset);
2136            res->corruptions++;
2137            continue;
2138        }
2139        if (sn->l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) {
2140            fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": "
2141                    "L1 table is too large; snapshot table entry corrupted\n",
2142                    sn->id_str, sn->name, sn->l1_size);
2143            res->corruptions++;
2144            continue;
2145        }
2146        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2147                                 sn->l1_table_offset, sn->l1_size, 0, fix,
2148                                 false);
2149        if (ret < 0) {
2150            return ret;
2151        }
2152    }
2153    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2154                                   s->snapshots_offset, s->snapshots_size);
2155    if (ret < 0) {
2156        return ret;
2157    }
2158
2159    /* refcount data */
2160    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2161                                   s->refcount_table_offset,
2162                                   s->refcount_table_size * sizeof(uint64_t));
2163    if (ret < 0) {
2164        return ret;
2165    }
2166
2167    /* encryption */
2168    if (s->crypto_header.length) {
2169        ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2170                                       s->crypto_header.offset,
2171                                       s->crypto_header.length);
2172        if (ret < 0) {
2173            return ret;
2174        }
2175    }
2176
2177    /* bitmaps */
2178    ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
2179    if (ret < 0) {
2180        return ret;
2181    }
2182
2183    return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
2184}
2185
2186/*
2187 * Compares the actual reference count for each cluster in the image against the
2188 * refcount as reported by the refcount structures on-disk.
2189 */
2190static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2191                              BdrvCheckMode fix, bool *rebuild,
2192                              int64_t *highest_cluster,
2193                              void *refcount_table, int64_t nb_clusters)
2194{
2195    BDRVQcow2State *s = bs->opaque;
2196    int64_t i;
2197    uint64_t refcount1, refcount2;
2198    int ret;
2199
2200    for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2201        ret = qcow2_get_refcount(bs, i, &refcount1);
2202        if (ret < 0) {
2203            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2204                    i, strerror(-ret));
2205            res->check_errors++;
2206            continue;
2207        }
2208
2209        refcount2 = s->get_refcount(refcount_table, i);
2210
2211        if (refcount1 > 0 || refcount2 > 0) {
2212            *highest_cluster = i;
2213        }
2214
2215        if (refcount1 != refcount2) {
2216            /* Check if we're allowed to fix the mismatch */
2217            int *num_fixed = NULL;
2218            if (refcount1 == 0) {
2219                *rebuild = true;
2220            } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2221                num_fixed = &res->leaks_fixed;
2222            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2223                num_fixed = &res->corruptions_fixed;
2224            }
2225
2226            fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2227                    " reference=%" PRIu64 "\n",
2228                   num_fixed != NULL     ? "Repairing" :
2229                   refcount1 < refcount2 ? "ERROR" :
2230                                           "Leaked",
2231                   i, refcount1, refcount2);
2232
2233            if (num_fixed) {
2234                ret = update_refcount(bs, i << s->cluster_bits, 1,
2235                                      refcount_diff(refcount1, refcount2),
2236                                      refcount1 > refcount2,
2237                                      QCOW2_DISCARD_ALWAYS);
2238                if (ret >= 0) {
2239                    (*num_fixed)++;
2240                    continue;
2241                }
2242            }
2243
2244            /* And if we couldn't, print an error */
2245            if (refcount1 < refcount2) {
2246                res->corruptions++;
2247            } else {
2248                res->leaks++;
2249            }
2250        }
2251    }
2252}
2253
2254/*
2255 * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2256 * the on-disk refcount structures.
2257 *
2258 * On input, *first_free_cluster tells where to start looking, and need not
2259 * actually be a free cluster; the returned offset will not be before that
2260 * cluster.  On output, *first_free_cluster points to the first gap found, even
2261 * if that gap was too small to be used as the returned offset.
2262 *
2263 * Note that *first_free_cluster is a cluster index whereas the return value is
2264 * an offset.
2265 */
2266static int64_t alloc_clusters_imrt(BlockDriverState *bs,
2267                                   int cluster_count,
2268                                   void **refcount_table,
2269                                   int64_t *imrt_nb_clusters,
2270                                   int64_t *first_free_cluster)
2271{
2272    BDRVQcow2State *s = bs->opaque;
2273    int64_t cluster = *first_free_cluster, i;
2274    bool first_gap = true;
2275    int contiguous_free_clusters;
2276    int ret;
2277
2278    /* Starting at *first_free_cluster, find a range of at least cluster_count
2279     * continuously free clusters */
2280    for (contiguous_free_clusters = 0;
2281         cluster < *imrt_nb_clusters &&
2282         contiguous_free_clusters < cluster_count;
2283         cluster++)
2284    {
2285        if (!s->get_refcount(*refcount_table, cluster)) {
2286            contiguous_free_clusters++;
2287            if (first_gap) {
2288                /* If this is the first free cluster found, update
2289                 * *first_free_cluster accordingly */
2290                *first_free_cluster = cluster;
2291                first_gap = false;
2292            }
2293        } else if (contiguous_free_clusters) {
2294            contiguous_free_clusters = 0;
2295        }
2296    }
2297
2298    /* If contiguous_free_clusters is greater than zero, it contains the number
2299     * of continuously free clusters until the current cluster; the first free
2300     * cluster in the current "gap" is therefore
2301     * cluster - contiguous_free_clusters */
2302
2303    /* If no such range could be found, grow the in-memory refcount table
2304     * accordingly to append free clusters at the end of the image */
2305    if (contiguous_free_clusters < cluster_count) {
2306        /* contiguous_free_clusters clusters are already empty at the image end;
2307         * we need cluster_count clusters; therefore, we have to allocate
2308         * cluster_count - contiguous_free_clusters new clusters at the end of
2309         * the image (which is the current value of cluster; note that cluster
2310         * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
2311         * the image end) */
2312        ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
2313                                     cluster + cluster_count
2314                                     - contiguous_free_clusters);
2315        if (ret < 0) {
2316            return ret;
2317        }
2318    }
2319
2320    /* Go back to the first free cluster */
2321    cluster -= contiguous_free_clusters;
2322    for (i = 0; i < cluster_count; i++) {
2323        s->set_refcount(*refcount_table, cluster + i, 1);
2324    }
2325
2326    return cluster << s->cluster_bits;
2327}
2328
2329/*
2330 * Creates a new refcount structure based solely on the in-memory information
2331 * given through *refcount_table. All necessary allocations will be reflected
2332 * in that array.
2333 *
2334 * On success, the old refcount structure is leaked (it will be covered by the
2335 * new refcount structure).
2336 */
2337static int rebuild_refcount_structure(BlockDriverState *bs,
2338                                      BdrvCheckResult *res,
2339                                      void **refcount_table,
2340                                      int64_t *nb_clusters)
2341{
2342    BDRVQcow2State *s = bs->opaque;
2343    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
2344    int64_t refblock_offset, refblock_start, refblock_index;
2345    uint32_t reftable_size = 0;
2346    uint64_t *on_disk_reftable = NULL;
2347    void *on_disk_refblock;
2348    int ret = 0;
2349    struct {
2350        uint64_t reftable_offset;
2351        uint32_t reftable_clusters;
2352    } QEMU_PACKED reftable_offset_and_clusters;
2353
2354    qcow2_cache_empty(bs, s->refcount_block_cache);
2355
2356write_refblocks:
2357    for (; cluster < *nb_clusters; cluster++) {
2358        if (!s->get_refcount(*refcount_table, cluster)) {
2359            continue;
2360        }
2361
2362        refblock_index = cluster >> s->refcount_block_bits;
2363        refblock_start = refblock_index << s->refcount_block_bits;
2364
2365        /* Don't allocate a cluster in a refblock already written to disk */
2366        if (first_free_cluster < refblock_start) {
2367            first_free_cluster = refblock_start;
2368        }
2369        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2370                                              nb_clusters, &first_free_cluster);
2371        if (refblock_offset < 0) {
2372            fprintf(stderr, "ERROR allocating refblock: %s\n",
2373                    strerror(-refblock_offset));
2374            res->check_errors++;
2375            ret = refblock_offset;
2376            goto fail;
2377        }
2378
2379        if (reftable_size <= refblock_index) {
2380            uint32_t old_reftable_size = reftable_size;
2381            uint64_t *new_on_disk_reftable;
2382
2383            reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
2384                                     s->cluster_size) / sizeof(uint64_t);
2385            new_on_disk_reftable = g_try_realloc(on_disk_reftable,
2386                                                 reftable_size *
2387                                                 sizeof(uint64_t));
2388            if (!new_on_disk_reftable) {
2389                res->check_errors++;
2390                ret = -ENOMEM;
2391                goto fail;
2392            }
2393            on_disk_reftable = new_on_disk_reftable;
2394
2395            memset(on_disk_reftable + old_reftable_size, 0,
2396                   (reftable_size - old_reftable_size) * sizeof(uint64_t));
2397
2398            /* The offset we have for the reftable is now no longer valid;
2399             * this will leak that range, but we can easily fix that by running
2400             * a leak-fixing check after this rebuild operation */
2401            reftable_offset = -1;
2402        } else {
2403            assert(on_disk_reftable);
2404        }
2405        on_disk_reftable[refblock_index] = refblock_offset;
2406
2407        /* If this is apparently the last refblock (for now), try to squeeze the
2408         * reftable in */
2409        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
2410            reftable_offset < 0)
2411        {
2412            uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
2413                                                          sizeof(uint64_t));
2414            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2415                                                  refcount_table, nb_clusters,
2416                                                  &first_free_cluster);
2417            if (reftable_offset < 0) {
2418                fprintf(stderr, "ERROR allocating reftable: %s\n",
2419                        strerror(-reftable_offset));
2420                res->check_errors++;
2421                ret = reftable_offset;
2422                goto fail;
2423            }
2424        }
2425
2426        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2427                                            s->cluster_size, false);
2428        if (ret < 0) {
2429            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2430            goto fail;
2431        }
2432
2433        /* The size of *refcount_table is always cluster-aligned, therefore the
2434         * write operation will not overflow */
2435        on_disk_refblock = (void *)((char *) *refcount_table +
2436                                    refblock_index * s->cluster_size);
2437
2438        ret = bdrv_pwrite(bs->file, refblock_offset, on_disk_refblock,
2439                          s->cluster_size);
2440        if (ret < 0) {
2441            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2442            goto fail;
2443        }
2444
2445        /* Go to the end of this refblock */
2446        cluster = refblock_start + s->refcount_block_size - 1;
2447    }
2448
2449    if (reftable_offset < 0) {
2450        uint64_t post_refblock_start, reftable_clusters;
2451
2452        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
2453        reftable_clusters = size_to_clusters(s,
2454                                             reftable_size * sizeof(uint64_t));
2455        /* Not pretty but simple */
2456        if (first_free_cluster < post_refblock_start) {
2457            first_free_cluster = post_refblock_start;
2458        }
2459        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2460                                              refcount_table, nb_clusters,
2461                                              &first_free_cluster);
2462        if (reftable_offset < 0) {
2463            fprintf(stderr, "ERROR allocating reftable: %s\n",
2464                    strerror(-reftable_offset));
2465            res->check_errors++;
2466            ret = reftable_offset;
2467            goto fail;
2468        }
2469
2470        goto write_refblocks;
2471    }
2472
2473    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2474        cpu_to_be64s(&on_disk_reftable[refblock_index]);
2475    }
2476
2477    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
2478                                        reftable_size * sizeof(uint64_t),
2479                                        false);
2480    if (ret < 0) {
2481        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2482        goto fail;
2483    }
2484
2485    assert(reftable_size < INT_MAX / sizeof(uint64_t));
2486    ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
2487                      reftable_size * sizeof(uint64_t));
2488    if (ret < 0) {
2489        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2490        goto fail;
2491    }
2492
2493    /* Enter new reftable into the image header */
2494    reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
2495    reftable_offset_and_clusters.reftable_clusters =
2496        cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t)));
2497    ret = bdrv_pwrite_sync(bs->file,
2498                           offsetof(QCowHeader, refcount_table_offset),
2499                           &reftable_offset_and_clusters,
2500                           sizeof(reftable_offset_and_clusters));
2501    if (ret < 0) {
2502        fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
2503        goto fail;
2504    }
2505
2506    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2507        be64_to_cpus(&on_disk_reftable[refblock_index]);
2508    }
2509    s->refcount_table = on_disk_reftable;
2510    s->refcount_table_offset = reftable_offset;
2511    s->refcount_table_size = reftable_size;
2512    update_max_refcount_table_index(s);
2513
2514    return 0;
2515
2516fail:
2517    g_free(on_disk_reftable);
2518    return ret;
2519}
2520
2521/*
2522 * Checks an image for refcount consistency.
2523 *
2524 * Returns 0 if no errors are found, the number of errors in case the image is
2525 * detected as corrupted, and -errno when an internal error occurred.
2526 */
2527int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2528                          BdrvCheckMode fix)
2529{
2530    BDRVQcow2State *s = bs->opaque;
2531    BdrvCheckResult pre_compare_res;
2532    int64_t size, highest_cluster, nb_clusters;
2533    void *refcount_table = NULL;
2534    bool rebuild = false;
2535    int ret;
2536
2537    size = bdrv_getlength(bs->file->bs);
2538    if (size < 0) {
2539        res->check_errors++;
2540        return size;
2541    }
2542
2543    nb_clusters = size_to_clusters(s, size);
2544    if (nb_clusters > INT_MAX) {
2545        res->check_errors++;
2546        return -EFBIG;
2547    }
2548
2549    res->bfi.total_clusters =
2550        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2551
2552    ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2553                              &nb_clusters);
2554    if (ret < 0) {
2555        goto fail;
2556    }
2557
2558    /* In case we don't need to rebuild the refcount structure (but want to fix
2559     * something), this function is immediately called again, in which case the
2560     * result should be ignored */
2561    pre_compare_res = *res;
2562    compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2563                      nb_clusters);
2564
2565    if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2566        BdrvCheckResult old_res = *res;
2567        int fresh_leaks = 0;
2568
2569        fprintf(stderr, "Rebuilding refcount structure\n");
2570        ret = rebuild_refcount_structure(bs, res, &refcount_table,
2571                                         &nb_clusters);
2572        if (ret < 0) {
2573            goto fail;
2574        }
2575
2576        res->corruptions = 0;
2577        res->leaks = 0;
2578
2579        /* Because the old reftable has been exchanged for a new one the
2580         * references have to be recalculated */
2581        rebuild = false;
2582        memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2583        ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2584                                  &nb_clusters);
2585        if (ret < 0) {
2586            goto fail;
2587        }
2588
2589        if (fix & BDRV_FIX_LEAKS) {
2590            /* The old refcount structures are now leaked, fix it; the result
2591             * can be ignored, aside from leaks which were introduced by
2592             * rebuild_refcount_structure() that could not be fixed */
2593            BdrvCheckResult saved_res = *res;
2594            *res = (BdrvCheckResult){ 0 };
2595
2596            compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2597                              &highest_cluster, refcount_table, nb_clusters);
2598            if (rebuild) {
2599                fprintf(stderr, "ERROR rebuilt refcount structure is still "
2600                        "broken\n");
2601            }
2602
2603            /* Any leaks accounted for here were introduced by
2604             * rebuild_refcount_structure() because that function has created a
2605             * new refcount structure from scratch */
2606            fresh_leaks = res->leaks;
2607            *res = saved_res;
2608        }
2609
2610        if (res->corruptions < old_res.corruptions) {
2611            res->corruptions_fixed += old_res.corruptions - res->corruptions;
2612        }
2613        if (res->leaks < old_res.leaks) {
2614            res->leaks_fixed += old_res.leaks - res->leaks;
2615        }
2616        res->leaks += fresh_leaks;
2617    } else if (fix) {
2618        if (rebuild) {
2619            fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2620            res->check_errors++;
2621            ret = -EIO;
2622            goto fail;
2623        }
2624
2625        if (res->leaks || res->corruptions) {
2626            *res = pre_compare_res;
2627            compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2628                              refcount_table, nb_clusters);
2629        }
2630    }
2631
2632    /* check OFLAG_COPIED */
2633    ret = check_oflag_copied(bs, res, fix);
2634    if (ret < 0) {
2635        goto fail;
2636    }
2637
2638    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2639    ret = 0;
2640
2641fail:
2642    g_free(refcount_table);
2643
2644    return ret;
2645}
2646
2647#define overlaps_with(ofs, sz) \
2648    ranges_overlap(offset, size, ofs, sz)
2649
2650/*
2651 * Checks if the given offset into the image file is actually free to use by
2652 * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2653 * i.e. a sanity check without relying on the refcount tables.
2654 *
2655 * The ign parameter specifies what checks not to perform (being a bitmask of
2656 * QCow2MetadataOverlap values), i.e., what sections to ignore.
2657 *
2658 * Returns:
2659 * - 0 if writing to this offset will not affect the mentioned metadata
2660 * - a positive QCow2MetadataOverlap value indicating one overlapping section
2661 * - a negative value (-errno) indicating an error while performing a check,
2662 *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
2663 */
2664int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2665                                 int64_t size)
2666{
2667    BDRVQcow2State *s = bs->opaque;
2668    int chk = s->overlap_check & ~ign;
2669    int i, j;
2670
2671    if (!size) {
2672        return 0;
2673    }
2674
2675    if (chk & QCOW2_OL_MAIN_HEADER) {
2676        if (offset < s->cluster_size) {
2677            return QCOW2_OL_MAIN_HEADER;
2678        }
2679    }
2680
2681    /* align range to test to cluster boundaries */
2682    size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
2683    offset = start_of_cluster(s, offset);
2684
2685    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2686        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
2687            return QCOW2_OL_ACTIVE_L1;
2688        }
2689    }
2690
2691    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2692        if (overlaps_with(s->refcount_table_offset,
2693            s->refcount_table_size * sizeof(uint64_t))) {
2694            return QCOW2_OL_REFCOUNT_TABLE;
2695        }
2696    }
2697
2698    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2699        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2700            return QCOW2_OL_SNAPSHOT_TABLE;
2701        }
2702    }
2703
2704    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2705        for (i = 0; i < s->nb_snapshots; i++) {
2706            if (s->snapshots[i].l1_size &&
2707                overlaps_with(s->snapshots[i].l1_table_offset,
2708                s->snapshots[i].l1_size * sizeof(uint64_t))) {
2709                return QCOW2_OL_INACTIVE_L1;
2710            }
2711        }
2712    }
2713
2714    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2715        for (i = 0; i < s->l1_size; i++) {
2716            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2717                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2718                s->cluster_size)) {
2719                return QCOW2_OL_ACTIVE_L2;
2720            }
2721        }
2722    }
2723
2724    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2725        unsigned last_entry = s->max_refcount_table_index;
2726        assert(last_entry < s->refcount_table_size);
2727        assert(last_entry + 1 == s->refcount_table_size ||
2728               (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
2729        for (i = 0; i <= last_entry; i++) {
2730            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2731                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2732                s->cluster_size)) {
2733                return QCOW2_OL_REFCOUNT_BLOCK;
2734            }
2735        }
2736    }
2737
2738    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2739        for (i = 0; i < s->nb_snapshots; i++) {
2740            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2741            uint32_t l1_sz  = s->snapshots[i].l1_size;
2742            uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
2743            uint64_t *l1;
2744            int ret;
2745
2746            ret = qcow2_validate_table(bs, l1_ofs, l1_sz, sizeof(uint64_t),
2747                                       QCOW_MAX_L1_SIZE, "", NULL);
2748            if (ret < 0) {
2749                return ret;
2750            }
2751
2752            l1 = g_try_malloc(l1_sz2);
2753
2754            if (l1_sz2 && l1 == NULL) {
2755                return -ENOMEM;
2756            }
2757
2758            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
2759            if (ret < 0) {
2760                g_free(l1);
2761                return ret;
2762            }
2763
2764            for (j = 0; j < l1_sz; j++) {
2765                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2766                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2767                    g_free(l1);
2768                    return QCOW2_OL_INACTIVE_L2;
2769                }
2770            }
2771
2772            g_free(l1);
2773        }
2774    }
2775
2776    if ((chk & QCOW2_OL_BITMAP_DIRECTORY) &&
2777        (s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS))
2778    {
2779        if (overlaps_with(s->bitmap_directory_offset,
2780                          s->bitmap_directory_size))
2781        {
2782            return QCOW2_OL_BITMAP_DIRECTORY;
2783        }
2784    }
2785
2786    return 0;
2787}
2788
2789static const char *metadata_ol_names[] = {
2790    [QCOW2_OL_MAIN_HEADER_BITNR]        = "qcow2_header",
2791    [QCOW2_OL_ACTIVE_L1_BITNR]          = "active L1 table",
2792    [QCOW2_OL_ACTIVE_L2_BITNR]          = "active L2 table",
2793    [QCOW2_OL_REFCOUNT_TABLE_BITNR]     = "refcount table",
2794    [QCOW2_OL_REFCOUNT_BLOCK_BITNR]     = "refcount block",
2795    [QCOW2_OL_SNAPSHOT_TABLE_BITNR]     = "snapshot table",
2796    [QCOW2_OL_INACTIVE_L1_BITNR]        = "inactive L1 table",
2797    [QCOW2_OL_INACTIVE_L2_BITNR]        = "inactive L2 table",
2798    [QCOW2_OL_BITMAP_DIRECTORY_BITNR]   = "bitmap directory",
2799};
2800QEMU_BUILD_BUG_ON(QCOW2_OL_MAX_BITNR != ARRAY_SIZE(metadata_ol_names));
2801
2802/*
2803 * First performs a check for metadata overlaps (through
2804 * qcow2_check_metadata_overlap); if that fails with a negative value (error
2805 * while performing a check), that value is returned. If an impending overlap
2806 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2807 * and -EIO returned.
2808 *
2809 * Returns 0 if there were neither overlaps nor errors while checking for
2810 * overlaps; or a negative value (-errno) on error.
2811 */
2812int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2813                                  int64_t size, bool data_file)
2814{
2815    int ret;
2816
2817    if (data_file && has_data_file(bs)) {
2818        return 0;
2819    }
2820
2821    ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2822    if (ret < 0) {
2823        return ret;
2824    } else if (ret > 0) {
2825        int metadata_ol_bitnr = ctz32(ret);
2826        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2827
2828        qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2829                                "write on metadata (overlaps with %s)",
2830                                metadata_ol_names[metadata_ol_bitnr]);
2831        return -EIO;
2832    }
2833
2834    return 0;
2835}
2836
2837/* A pointer to a function of this type is given to walk_over_reftable(). That
2838 * function will create refblocks and pass them to a RefblockFinishOp once they
2839 * are completed (@refblock). @refblock_empty is set if the refblock is
2840 * completely empty.
2841 *
2842 * Along with the refblock, a corresponding reftable entry is passed, in the
2843 * reftable @reftable (which may be reallocated) at @reftable_index.
2844 *
2845 * @allocated should be set to true if a new cluster has been allocated.
2846 */
2847typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
2848                               uint64_t reftable_index, uint64_t *reftable_size,
2849                               void *refblock, bool refblock_empty,
2850                               bool *allocated, Error **errp);
2851
2852/**
2853 * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2854 * it is not empty) and inserts its offset into the new reftable. The size of
2855 * this new reftable is increased as required.
2856 */
2857static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2858                          uint64_t reftable_index, uint64_t *reftable_size,
2859                          void *refblock, bool refblock_empty, bool *allocated,
2860                          Error **errp)
2861{
2862    BDRVQcow2State *s = bs->opaque;
2863    int64_t offset;
2864
2865    if (!refblock_empty && reftable_index >= *reftable_size) {
2866        uint64_t *new_reftable;
2867        uint64_t new_reftable_size;
2868
2869        new_reftable_size = ROUND_UP(reftable_index + 1,
2870                                     s->cluster_size / sizeof(uint64_t));
2871        if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
2872            error_setg(errp,
2873                       "This operation would make the refcount table grow "
2874                       "beyond the maximum size supported by QEMU, aborting");
2875            return -ENOTSUP;
2876        }
2877
2878        new_reftable = g_try_realloc(*reftable, new_reftable_size *
2879                                                sizeof(uint64_t));
2880        if (!new_reftable) {
2881            error_setg(errp, "Failed to increase reftable buffer size");
2882            return -ENOMEM;
2883        }
2884
2885        memset(new_reftable + *reftable_size, 0,
2886               (new_reftable_size - *reftable_size) * sizeof(uint64_t));
2887
2888        *reftable      = new_reftable;
2889        *reftable_size = new_reftable_size;
2890    }
2891
2892    if (!refblock_empty && !(*reftable)[reftable_index]) {
2893        offset = qcow2_alloc_clusters(bs, s->cluster_size);
2894        if (offset < 0) {
2895            error_setg_errno(errp, -offset, "Failed to allocate refblock");
2896            return offset;
2897        }
2898        (*reftable)[reftable_index] = offset;
2899        *allocated = true;
2900    }
2901
2902    return 0;
2903}
2904
2905/**
2906 * This "operation" for walk_over_reftable() writes the refblock to disk at the
2907 * offset specified by the new reftable's entry. It does not modify the new
2908 * reftable or change any refcounts.
2909 */
2910static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2911                          uint64_t reftable_index, uint64_t *reftable_size,
2912                          void *refblock, bool refblock_empty, bool *allocated,
2913                          Error **errp)
2914{
2915    BDRVQcow2State *s = bs->opaque;
2916    int64_t offset;
2917    int ret;
2918
2919    if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2920        offset = (*reftable)[reftable_index];
2921
2922        ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size,
2923                                            false);
2924        if (ret < 0) {
2925            error_setg_errno(errp, -ret, "Overlap check failed");
2926            return ret;
2927        }
2928
2929        ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
2930        if (ret < 0) {
2931            error_setg_errno(errp, -ret, "Failed to write refblock");
2932            return ret;
2933        }
2934    } else {
2935        assert(refblock_empty);
2936    }
2937
2938    return 0;
2939}
2940
2941/**
2942 * This function walks over the existing reftable and every referenced refblock;
2943 * if @new_set_refcount is non-NULL, it is called for every refcount entry to
2944 * create an equal new entry in the passed @new_refblock. Once that
2945 * @new_refblock is completely filled, @operation will be called.
2946 *
2947 * @status_cb and @cb_opaque are used for the amend operation's status callback.
2948 * @index is the index of the walk_over_reftable() calls and @total is the total
2949 * number of walk_over_reftable() calls per amend operation. Both are used for
2950 * calculating the parameters for the status callback.
2951 *
2952 * @allocated is set to true if a new cluster has been allocated.
2953 */
2954static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
2955                              uint64_t *new_reftable_index,
2956                              uint64_t *new_reftable_size,
2957                              void *new_refblock, int new_refblock_size,
2958                              int new_refcount_bits,
2959                              RefblockFinishOp *operation, bool *allocated,
2960                              Qcow2SetRefcountFunc *new_set_refcount,
2961                              BlockDriverAmendStatusCB *status_cb,
2962                              void *cb_opaque, int index, int total,
2963                              Error **errp)
2964{
2965    BDRVQcow2State *s = bs->opaque;
2966    uint64_t reftable_index;
2967    bool new_refblock_empty = true;
2968    int refblock_index;
2969    int new_refblock_index = 0;
2970    int ret;
2971
2972    for (reftable_index = 0; reftable_index < s->refcount_table_size;
2973         reftable_index++)
2974    {
2975        uint64_t refblock_offset = s->refcount_table[reftable_index]
2976                                 & REFT_OFFSET_MASK;
2977
2978        status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
2979                  (uint64_t)total * s->refcount_table_size, cb_opaque);
2980
2981        if (refblock_offset) {
2982            void *refblock;
2983
2984            if (offset_into_cluster(s, refblock_offset)) {
2985                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
2986                                        PRIx64 " unaligned (reftable index: %#"
2987                                        PRIx64 ")", refblock_offset,
2988                                        reftable_index);
2989                error_setg(errp,
2990                           "Image is corrupt (unaligned refblock offset)");
2991                return -EIO;
2992            }
2993
2994            ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
2995                                  &refblock);
2996            if (ret < 0) {
2997                error_setg_errno(errp, -ret, "Failed to retrieve refblock");
2998                return ret;
2999            }
3000
3001            for (refblock_index = 0; refblock_index < s->refcount_block_size;
3002                 refblock_index++)
3003            {
3004                uint64_t refcount;
3005
3006                if (new_refblock_index >= new_refblock_size) {
3007                    /* new_refblock is now complete */
3008                    ret = operation(bs, new_reftable, *new_reftable_index,
3009                                    new_reftable_size, new_refblock,
3010                                    new_refblock_empty, allocated, errp);
3011                    if (ret < 0) {
3012                        qcow2_cache_put(s->refcount_block_cache, &refblock);
3013                        return ret;
3014                    }
3015
3016                    (*new_reftable_index)++;
3017                    new_refblock_index = 0;
3018                    new_refblock_empty = true;
3019                }
3020
3021                refcount = s->get_refcount(refblock, refblock_index);
3022                if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
3023                    uint64_t offset;
3024
3025                    qcow2_cache_put(s->refcount_block_cache, &refblock);
3026
3027                    offset = ((reftable_index << s->refcount_block_bits)
3028                              + refblock_index) << s->cluster_bits;
3029
3030                    error_setg(errp, "Cannot decrease refcount entry width to "
3031                               "%i bits: Cluster at offset %#" PRIx64 " has a "
3032                               "refcount of %" PRIu64, new_refcount_bits,
3033                               offset, refcount);
3034                    return -EINVAL;
3035                }
3036
3037                if (new_set_refcount) {
3038                    new_set_refcount(new_refblock, new_refblock_index++,
3039                                     refcount);
3040                } else {
3041                    new_refblock_index++;
3042                }
3043                new_refblock_empty = new_refblock_empty && refcount == 0;
3044            }
3045
3046            qcow2_cache_put(s->refcount_block_cache, &refblock);
3047        } else {
3048            /* No refblock means every refcount is 0 */
3049            for (refblock_index = 0; refblock_index < s->refcount_block_size;
3050                 refblock_index++)
3051            {
3052                if (new_refblock_index >= new_refblock_size) {
3053                    /* new_refblock is now complete */
3054                    ret = operation(bs, new_reftable, *new_reftable_index,
3055                                    new_reftable_size, new_refblock,
3056                                    new_refblock_empty, allocated, errp);
3057                    if (ret < 0) {
3058                        return ret;
3059                    }
3060
3061                    (*new_reftable_index)++;
3062                    new_refblock_index = 0;
3063                    new_refblock_empty = true;
3064                }
3065
3066                if (new_set_refcount) {
3067                    new_set_refcount(new_refblock, new_refblock_index++, 0);
3068                } else {
3069                    new_refblock_index++;
3070                }
3071            }
3072        }
3073    }
3074
3075    if (new_refblock_index > 0) {
3076        /* Complete the potentially existing partially filled final refblock */
3077        if (new_set_refcount) {
3078            for (; new_refblock_index < new_refblock_size;
3079                 new_refblock_index++)
3080            {
3081                new_set_refcount(new_refblock, new_refblock_index, 0);
3082            }
3083        }
3084
3085        ret = operation(bs, new_reftable, *new_reftable_index,
3086                        new_reftable_size, new_refblock, new_refblock_empty,
3087                        allocated, errp);
3088        if (ret < 0) {
3089            return ret;
3090        }
3091
3092        (*new_reftable_index)++;
3093    }
3094
3095    status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
3096              (uint64_t)total * s->refcount_table_size, cb_opaque);
3097
3098    return 0;
3099}
3100
3101int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
3102                                BlockDriverAmendStatusCB *status_cb,
3103                                void *cb_opaque, Error **errp)
3104{
3105    BDRVQcow2State *s = bs->opaque;
3106    Qcow2GetRefcountFunc *new_get_refcount;
3107    Qcow2SetRefcountFunc *new_set_refcount;
3108    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
3109    uint64_t *new_reftable = NULL, new_reftable_size = 0;
3110    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
3111    uint64_t new_reftable_index = 0;
3112    uint64_t i;
3113    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
3114    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
3115    int old_refcount_order;
3116    int walk_index = 0;
3117    int ret;
3118    bool new_allocation;
3119
3120    assert(s->qcow_version >= 3);
3121    assert(refcount_order >= 0 && refcount_order <= 6);
3122
3123    /* see qcow2_open() */
3124    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
3125
3126    new_get_refcount = get_refcount_funcs[refcount_order];
3127    new_set_refcount = set_refcount_funcs[refcount_order];
3128
3129
3130    do {
3131        int total_walks;
3132
3133        new_allocation = false;
3134
3135        /* At least we have to do this walk and the one which writes the
3136         * refblocks; also, at least we have to do this loop here at least
3137         * twice (normally), first to do the allocations, and second to
3138         * determine that everything is correctly allocated, this then makes
3139         * three walks in total */
3140        total_walks = MAX(walk_index + 2, 3);
3141
3142        /* First, allocate the structures so they are present in the refcount
3143         * structures */
3144        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3145                                 &new_reftable_size, NULL, new_refblock_size,
3146                                 new_refcount_bits, &alloc_refblock,
3147                                 &new_allocation, NULL, status_cb, cb_opaque,
3148                                 walk_index++, total_walks, errp);
3149        if (ret < 0) {
3150            goto done;
3151        }
3152
3153        new_reftable_index = 0;
3154
3155        if (new_allocation) {
3156            if (new_reftable_offset) {
3157                qcow2_free_clusters(bs, new_reftable_offset,
3158                                    allocated_reftable_size * sizeof(uint64_t),
3159                                    QCOW2_DISCARD_NEVER);
3160            }
3161
3162            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
3163                                                           sizeof(uint64_t));
3164            if (new_reftable_offset < 0) {
3165                error_setg_errno(errp, -new_reftable_offset,
3166                                 "Failed to allocate the new reftable");
3167                ret = new_reftable_offset;
3168                goto done;
3169            }
3170            allocated_reftable_size = new_reftable_size;
3171        }
3172    } while (new_allocation);
3173
3174    /* Second, write the new refblocks */
3175    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3176                             &new_reftable_size, new_refblock,
3177                             new_refblock_size, new_refcount_bits,
3178                             &flush_refblock, &new_allocation, new_set_refcount,
3179                             status_cb, cb_opaque, walk_index, walk_index + 1,
3180                             errp);
3181    if (ret < 0) {
3182        goto done;
3183    }
3184    assert(!new_allocation);
3185
3186
3187    /* Write the new reftable */
3188    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
3189                                        new_reftable_size * sizeof(uint64_t),
3190                                        false);
3191    if (ret < 0) {
3192        error_setg_errno(errp, -ret, "Overlap check failed");
3193        goto done;
3194    }
3195
3196    for (i = 0; i < new_reftable_size; i++) {
3197        cpu_to_be64s(&new_reftable[i]);
3198    }
3199
3200    ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
3201                      new_reftable_size * sizeof(uint64_t));
3202
3203    for (i = 0; i < new_reftable_size; i++) {
3204        be64_to_cpus(&new_reftable[i]);
3205    }
3206
3207    if (ret < 0) {
3208        error_setg_errno(errp, -ret, "Failed to write the new reftable");
3209        goto done;
3210    }
3211
3212
3213    /* Empty the refcount cache */
3214    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
3215    if (ret < 0) {
3216        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
3217        goto done;
3218    }
3219
3220    /* Update the image header to point to the new reftable; this only updates
3221     * the fields which are relevant to qcow2_update_header(); other fields
3222     * such as s->refcount_table or s->refcount_bits stay stale for now
3223     * (because we have to restore everything if qcow2_update_header() fails) */
3224    old_refcount_order  = s->refcount_order;
3225    old_reftable_size   = s->refcount_table_size;
3226    old_reftable_offset = s->refcount_table_offset;
3227
3228    s->refcount_order        = refcount_order;
3229    s->refcount_table_size   = new_reftable_size;
3230    s->refcount_table_offset = new_reftable_offset;
3231
3232    ret = qcow2_update_header(bs);
3233    if (ret < 0) {
3234        s->refcount_order        = old_refcount_order;
3235        s->refcount_table_size   = old_reftable_size;
3236        s->refcount_table_offset = old_reftable_offset;
3237        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
3238        goto done;
3239    }
3240
3241    /* Now update the rest of the in-memory information */
3242    old_reftable = s->refcount_table;
3243    s->refcount_table = new_reftable;
3244    update_max_refcount_table_index(s);
3245
3246    s->refcount_bits = 1 << refcount_order;
3247    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
3248    s->refcount_max += s->refcount_max - 1;
3249
3250    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
3251    s->refcount_block_size = 1 << s->refcount_block_bits;
3252
3253    s->get_refcount = new_get_refcount;
3254    s->set_refcount = new_set_refcount;
3255
3256    /* For cleaning up all old refblocks and the old reftable below the "done"
3257     * label */
3258    new_reftable        = old_reftable;
3259    new_reftable_size   = old_reftable_size;
3260    new_reftable_offset = old_reftable_offset;
3261
3262done:
3263    if (new_reftable) {
3264        /* On success, new_reftable actually points to the old reftable (and
3265         * new_reftable_size is the old reftable's size); but that is just
3266         * fine */
3267        for (i = 0; i < new_reftable_size; i++) {
3268            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
3269            if (offset) {
3270                qcow2_free_clusters(bs, offset, s->cluster_size,
3271                                    QCOW2_DISCARD_OTHER);
3272            }
3273        }
3274        g_free(new_reftable);
3275
3276        if (new_reftable_offset > 0) {
3277            qcow2_free_clusters(bs, new_reftable_offset,
3278                                new_reftable_size * sizeof(uint64_t),
3279                                QCOW2_DISCARD_OTHER);
3280        }
3281    }
3282
3283    qemu_vfree(new_refblock);
3284    return ret;
3285}
3286
3287static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
3288{
3289    BDRVQcow2State *s = bs->opaque;
3290    uint32_t index = offset_to_reftable_index(s, offset);
3291    int64_t covering_refblock_offset = 0;
3292
3293    if (index < s->refcount_table_size) {
3294        covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3295    }
3296    if (!covering_refblock_offset) {
3297        qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3298                                "not covered by the refcount structures",
3299                                offset);
3300        return -EIO;
3301    }
3302
3303    return covering_refblock_offset;
3304}
3305
3306static int qcow2_discard_refcount_block(BlockDriverState *bs,
3307                                        uint64_t discard_block_offs)
3308{
3309    BDRVQcow2State *s = bs->opaque;
3310    int64_t refblock_offs;
3311    uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
3312    uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
3313    void *refblock;
3314    int ret;
3315
3316    refblock_offs = get_refblock_offset(bs, discard_block_offs);
3317    if (refblock_offs < 0) {
3318        return refblock_offs;
3319    }
3320
3321    assert(discard_block_offs != 0);
3322
3323    ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3324                          &refblock);
3325    if (ret < 0) {
3326        return ret;
3327    }
3328
3329    if (s->get_refcount(refblock, block_index) != 1) {
3330        qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
3331                                " refblock offset %#" PRIx64
3332                                ", reftable index %u"
3333                                ", block offset %#" PRIx64
3334                                ", refcount %#" PRIx64,
3335                                refblock_offs,
3336                                offset_to_reftable_index(s, discard_block_offs),
3337                                discard_block_offs,
3338                                s->get_refcount(refblock, block_index));
3339        qcow2_cache_put(s->refcount_block_cache, &refblock);
3340        return -EINVAL;
3341    }
3342    s->set_refcount(refblock, block_index, 0);
3343
3344    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
3345
3346    qcow2_cache_put(s->refcount_block_cache, &refblock);
3347
3348    if (cluster_index < s->free_cluster_index) {
3349        s->free_cluster_index = cluster_index;
3350    }
3351
3352    refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
3353                                           discard_block_offs);
3354    if (refblock) {
3355        /* discard refblock from the cache if refblock is cached */
3356        qcow2_cache_discard(s->refcount_block_cache, refblock);
3357    }
3358    update_refcount_discard(bs, discard_block_offs, s->cluster_size);
3359
3360    return 0;
3361}
3362
3363int qcow2_shrink_reftable(BlockDriverState *bs)
3364{
3365    BDRVQcow2State *s = bs->opaque;
3366    uint64_t *reftable_tmp =
3367        g_malloc(s->refcount_table_size * sizeof(uint64_t));
3368    int i, ret;
3369
3370    for (i = 0; i < s->refcount_table_size; i++) {
3371        int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
3372        void *refblock;
3373        bool unused_block;
3374
3375        if (refblock_offs == 0) {
3376            reftable_tmp[i] = 0;
3377            continue;
3378        }
3379        ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3380                              &refblock);
3381        if (ret < 0) {
3382            goto out;
3383        }
3384
3385        /* the refblock has own reference */
3386        if (i == offset_to_reftable_index(s, refblock_offs)) {
3387            uint64_t block_index = (refblock_offs >> s->cluster_bits) &
3388                                   (s->refcount_block_size - 1);
3389            uint64_t refcount = s->get_refcount(refblock, block_index);
3390
3391            s->set_refcount(refblock, block_index, 0);
3392
3393            unused_block = buffer_is_zero(refblock, s->cluster_size);
3394
3395            s->set_refcount(refblock, block_index, refcount);
3396        } else {
3397            unused_block = buffer_is_zero(refblock, s->cluster_size);
3398        }
3399        qcow2_cache_put(s->refcount_block_cache, &refblock);
3400
3401        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
3402    }
3403
3404    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
3405                           s->refcount_table_size * sizeof(uint64_t));
3406    /*
3407     * If the write in the reftable failed the image may contain a partially
3408     * overwritten reftable. In this case it would be better to clear the
3409     * reftable in memory to avoid possible image corruption.
3410     */
3411    for (i = 0; i < s->refcount_table_size; i++) {
3412        if (s->refcount_table[i] && !reftable_tmp[i]) {
3413            if (ret == 0) {
3414                ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
3415                                                       REFT_OFFSET_MASK);
3416            }
3417            s->refcount_table[i] = 0;
3418        }
3419    }
3420
3421    if (!s->cache_discards) {
3422        qcow2_process_discards(bs, ret);
3423    }
3424
3425out:
3426    g_free(reftable_tmp);
3427    return ret;
3428}
3429
3430int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3431{
3432    BDRVQcow2State *s = bs->opaque;
3433    int64_t i;
3434
3435    for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3436        uint64_t refcount;
3437        int ret = qcow2_get_refcount(bs, i, &refcount);
3438        if (ret < 0) {
3439            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3440                    i, strerror(-ret));
3441            return ret;
3442        }
3443        if (refcount > 0) {
3444            return i;
3445        }
3446    }
3447    qcow2_signal_corruption(bs, true, -1, -1,
3448                            "There are no references in the refcount table.");
3449    return -EIO;
3450}
3451
3452int qcow2_detect_metadata_preallocation(BlockDriverState *bs)
3453{
3454    BDRVQcow2State *s = bs->opaque;
3455    int64_t i, end_cluster, cluster_count = 0, threshold;
3456    int64_t file_length, real_allocation, real_clusters;
3457
3458    qemu_co_mutex_assert_locked(&s->lock);
3459
3460    file_length = bdrv_getlength(bs->file->bs);
3461    if (file_length < 0) {
3462        return file_length;
3463    }
3464
3465    real_allocation = bdrv_get_allocated_file_size(bs->file->bs);
3466    if (real_allocation < 0) {
3467        return real_allocation;
3468    }
3469
3470    real_clusters = real_allocation / s->cluster_size;
3471    threshold = MAX(real_clusters * 10 / 9, real_clusters + 2);
3472
3473    end_cluster = size_to_clusters(s, file_length);
3474    for (i = 0; i < end_cluster && cluster_count < threshold; i++) {
3475        uint64_t refcount;
3476        int ret = qcow2_get_refcount(bs, i, &refcount);
3477        if (ret < 0) {
3478            return ret;
3479        }
3480        cluster_count += !!refcount;
3481    }
3482
3483    return cluster_count >= threshold;
3484}
3485