qemu/block/qcow2-refcount.c
<<
>>
Prefs
   1/*
   2 * Block driver for the QCOW version 2 format
   3 *
   4 * Copyright (c) 2004-2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu-common.h"
  26#include "block/block_int.h"
  27#include "block/qcow2.h"
  28#include "qemu/range.h"
  29#include "qapi/qmp/types.h"
  30
  31static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
  32static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
  33                            int64_t offset, int64_t length,
  34                            int addend, enum qcow2_discard_type type);
  35
  36
  37/*********************************************************/
  38/* refcount handling */
  39
  40int qcow2_refcount_init(BlockDriverState *bs)
  41{
  42    BDRVQcowState *s = bs->opaque;
  43    int ret, refcount_table_size2, i;
  44
  45    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
  46    s->refcount_table = g_malloc(refcount_table_size2);
  47    if (s->refcount_table_size > 0) {
  48        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
  49        ret = bdrv_pread(bs->file, s->refcount_table_offset,
  50                         s->refcount_table, refcount_table_size2);
  51        if (ret != refcount_table_size2)
  52            goto fail;
  53        for(i = 0; i < s->refcount_table_size; i++)
  54            be64_to_cpus(&s->refcount_table[i]);
  55    }
  56    return 0;
  57 fail:
  58    return -ENOMEM;
  59}
  60
  61void qcow2_refcount_close(BlockDriverState *bs)
  62{
  63    BDRVQcowState *s = bs->opaque;
  64    g_free(s->refcount_table);
  65}
  66
  67
  68static int load_refcount_block(BlockDriverState *bs,
  69                               int64_t refcount_block_offset,
  70                               void **refcount_block)
  71{
  72    BDRVQcowState *s = bs->opaque;
  73    int ret;
  74
  75    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
  76    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
  77        refcount_block);
  78
  79    return ret;
  80}
  81
  82/*
  83 * Returns the refcount of the cluster given by its index. Any non-negative
  84 * return value is the refcount of the cluster, negative values are -errno
  85 * and indicate an error.
  86 */
  87static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
  88{
  89    BDRVQcowState *s = bs->opaque;
  90    int refcount_table_index, block_index;
  91    int64_t refcount_block_offset;
  92    int ret;
  93    uint16_t *refcount_block;
  94    uint16_t refcount;
  95
  96    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
  97    if (refcount_table_index >= s->refcount_table_size)
  98        return 0;
  99    refcount_block_offset = s->refcount_table[refcount_table_index];
 100    if (!refcount_block_offset)
 101        return 0;
 102
 103    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
 104        (void**) &refcount_block);
 105    if (ret < 0) {
 106        return ret;
 107    }
 108
 109    block_index = cluster_index &
 110        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
 111    refcount = be16_to_cpu(refcount_block[block_index]);
 112
 113    ret = qcow2_cache_put(bs, s->refcount_block_cache,
 114        (void**) &refcount_block);
 115    if (ret < 0) {
 116        return ret;
 117    }
 118
 119    return refcount;
 120}
 121
 122/*
 123 * Rounds the refcount table size up to avoid growing the table for each single
 124 * refcount block that is allocated.
 125 */
 126static unsigned int next_refcount_table_size(BDRVQcowState *s,
 127    unsigned int min_size)
 128{
 129    unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
 130    unsigned int refcount_table_clusters =
 131        MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
 132
 133    while (min_clusters > refcount_table_clusters) {
 134        refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
 135    }
 136
 137    return refcount_table_clusters << (s->cluster_bits - 3);
 138}
 139
 140
 141/* Checks if two offsets are described by the same refcount block */
 142static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
 143    uint64_t offset_b)
 144{
 145    uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
 146    uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
 147
 148    return (block_a == block_b);
 149}
 150
 151/*
 152 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 153 * (including growing the refcount table if needed).
 154 *
 155 * Returns 0 on success or -errno in error case
 156 */
 157static int alloc_refcount_block(BlockDriverState *bs,
 158    int64_t cluster_index, uint16_t **refcount_block)
 159{
 160    BDRVQcowState *s = bs->opaque;
 161    unsigned int refcount_table_index;
 162    int ret;
 163
 164    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
 165
 166    /* Find the refcount block for the given cluster */
 167    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
 168
 169    if (refcount_table_index < s->refcount_table_size) {
 170
 171        uint64_t refcount_block_offset =
 172            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
 173
 174        /* If it's already there, we're done */
 175        if (refcount_block_offset) {
 176             return load_refcount_block(bs, refcount_block_offset,
 177                 (void**) refcount_block);
 178        }
 179    }
 180
 181    /*
 182     * If we came here, we need to allocate something. Something is at least
 183     * a cluster for the new refcount block. It may also include a new refcount
 184     * table if the old refcount table is too small.
 185     *
 186     * Note that allocating clusters here needs some special care:
 187     *
 188     * - We can't use the normal qcow2_alloc_clusters(), it would try to
 189     *   increase the refcount and very likely we would end up with an endless
 190     *   recursion. Instead we must place the refcount blocks in a way that
 191     *   they can describe them themselves.
 192     *
 193     * - We need to consider that at this point we are inside update_refcounts
 194     *   and doing the initial refcount increase. This means that some clusters
 195     *   have already been allocated by the caller, but their refcount isn't
 196     *   accurate yet. free_cluster_index tells us where this allocation ends
 197     *   as long as we don't overwrite it by freeing clusters.
 198     *
 199     * - alloc_clusters_noref and qcow2_free_clusters may load a different
 200     *   refcount block into the cache
 201     */
 202
 203    *refcount_block = NULL;
 204
 205    /* We write to the refcount table, so we might depend on L2 tables */
 206    ret = qcow2_cache_flush(bs, s->l2_table_cache);
 207    if (ret < 0) {
 208        return ret;
 209    }
 210
 211    /* Allocate the refcount block itself and mark it as used */
 212    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
 213    if (new_block < 0) {
 214        return new_block;
 215    }
 216
 217#ifdef DEBUG_ALLOC2
 218    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
 219        " at %" PRIx64 "\n",
 220        refcount_table_index, cluster_index << s->cluster_bits, new_block);
 221#endif
 222
 223    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
 224        /* Zero the new refcount block before updating it */
 225        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
 226            (void**) refcount_block);
 227        if (ret < 0) {
 228            goto fail_block;
 229        }
 230
 231        memset(*refcount_block, 0, s->cluster_size);
 232
 233        /* The block describes itself, need to update the cache */
 234        int block_index = (new_block >> s->cluster_bits) &
 235            ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
 236        (*refcount_block)[block_index] = cpu_to_be16(1);
 237    } else {
 238        /* Described somewhere else. This can recurse at most twice before we
 239         * arrive at a block that describes itself. */
 240        ret = update_refcount(bs, new_block, s->cluster_size, 1,
 241                              QCOW2_DISCARD_NEVER);
 242        if (ret < 0) {
 243            goto fail_block;
 244        }
 245
 246        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 247        if (ret < 0) {
 248            goto fail_block;
 249        }
 250
 251        /* Initialize the new refcount block only after updating its refcount,
 252         * update_refcount uses the refcount cache itself */
 253        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
 254            (void**) refcount_block);
 255        if (ret < 0) {
 256            goto fail_block;
 257        }
 258
 259        memset(*refcount_block, 0, s->cluster_size);
 260    }
 261
 262    /* Now the new refcount block needs to be written to disk */
 263    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
 264    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
 265    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 266    if (ret < 0) {
 267        goto fail_block;
 268    }
 269
 270    /* If the refcount table is big enough, just hook the block up there */
 271    if (refcount_table_index < s->refcount_table_size) {
 272        uint64_t data64 = cpu_to_be64(new_block);
 273        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
 274        ret = bdrv_pwrite_sync(bs->file,
 275            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
 276            &data64, sizeof(data64));
 277        if (ret < 0) {
 278            goto fail_block;
 279        }
 280
 281        s->refcount_table[refcount_table_index] = new_block;
 282        return 0;
 283    }
 284
 285    ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
 286    if (ret < 0) {
 287        goto fail_block;
 288    }
 289
 290    /*
 291     * If we come here, we need to grow the refcount table. Again, a new
 292     * refcount table needs some space and we can't simply allocate to avoid
 293     * endless recursion.
 294     *
 295     * Therefore let's grab new refcount blocks at the end of the image, which
 296     * will describe themselves and the new refcount table. This way we can
 297     * reference them only in the new table and do the switch to the new
 298     * refcount table at once without producing an inconsistent state in
 299     * between.
 300     */
 301    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
 302
 303    /* Calculate the number of refcount blocks needed so far */
 304    uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
 305    uint64_t blocks_used = (s->free_cluster_index +
 306        refcount_block_clusters - 1) / refcount_block_clusters;
 307
 308    /* And now we need at least one block more for the new metadata */
 309    uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
 310    uint64_t last_table_size;
 311    uint64_t blocks_clusters;
 312    do {
 313        uint64_t table_clusters =
 314            size_to_clusters(s, table_size * sizeof(uint64_t));
 315        blocks_clusters = 1 +
 316            ((table_clusters + refcount_block_clusters - 1)
 317            / refcount_block_clusters);
 318        uint64_t meta_clusters = table_clusters + blocks_clusters;
 319
 320        last_table_size = table_size;
 321        table_size = next_refcount_table_size(s, blocks_used +
 322            ((meta_clusters + refcount_block_clusters - 1)
 323            / refcount_block_clusters));
 324
 325    } while (last_table_size != table_size);
 326
 327#ifdef DEBUG_ALLOC2
 328    fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
 329        s->refcount_table_size, table_size);
 330#endif
 331
 332    /* Create the new refcount table and blocks */
 333    uint64_t meta_offset = (blocks_used * refcount_block_clusters) *
 334        s->cluster_size;
 335    uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
 336    uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size);
 337    uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t));
 338
 339    assert(meta_offset >= (s->free_cluster_index * s->cluster_size));
 340
 341    /* Fill the new refcount table */
 342    memcpy(new_table, s->refcount_table,
 343        s->refcount_table_size * sizeof(uint64_t));
 344    new_table[refcount_table_index] = new_block;
 345
 346    int i;
 347    for (i = 0; i < blocks_clusters; i++) {
 348        new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
 349    }
 350
 351    /* Fill the refcount blocks */
 352    uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
 353    int block = 0;
 354    for (i = 0; i < table_clusters + blocks_clusters; i++) {
 355        new_blocks[block++] = cpu_to_be16(1);
 356    }
 357
 358    /* Write refcount blocks to disk */
 359    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
 360    ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
 361        blocks_clusters * s->cluster_size);
 362    g_free(new_blocks);
 363    if (ret < 0) {
 364        goto fail_table;
 365    }
 366
 367    /* Write refcount table to disk */
 368    for(i = 0; i < table_size; i++) {
 369        cpu_to_be64s(&new_table[i]);
 370    }
 371
 372    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
 373    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
 374        table_size * sizeof(uint64_t));
 375    if (ret < 0) {
 376        goto fail_table;
 377    }
 378
 379    for(i = 0; i < table_size; i++) {
 380        be64_to_cpus(&new_table[i]);
 381    }
 382
 383    /* Hook up the new refcount table in the qcow2 header */
 384    uint8_t data[12];
 385    cpu_to_be64w((uint64_t*)data, table_offset);
 386    cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
 387    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
 388    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset),
 389        data, sizeof(data));
 390    if (ret < 0) {
 391        goto fail_table;
 392    }
 393
 394    /* And switch it in memory */
 395    uint64_t old_table_offset = s->refcount_table_offset;
 396    uint64_t old_table_size = s->refcount_table_size;
 397
 398    g_free(s->refcount_table);
 399    s->refcount_table = new_table;
 400    s->refcount_table_size = table_size;
 401    s->refcount_table_offset = table_offset;
 402
 403    /* Free old table. Remember, we must not change free_cluster_index */
 404    uint64_t old_free_cluster_index = s->free_cluster_index;
 405    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
 406                        QCOW2_DISCARD_OTHER);
 407    s->free_cluster_index = old_free_cluster_index;
 408
 409    ret = load_refcount_block(bs, new_block, (void**) refcount_block);
 410    if (ret < 0) {
 411        return ret;
 412    }
 413
 414    return 0;
 415
 416fail_table:
 417    g_free(new_table);
 418fail_block:
 419    if (*refcount_block != NULL) {
 420        qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
 421    }
 422    return ret;
 423}
 424
 425void qcow2_process_discards(BlockDriverState *bs, int ret)
 426{
 427    BDRVQcowState *s = bs->opaque;
 428    Qcow2DiscardRegion *d, *next;
 429
 430    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
 431        QTAILQ_REMOVE(&s->discards, d, next);
 432
 433        /* Discard is optional, ignore the return value */
 434        if (ret >= 0) {
 435            bdrv_discard(bs->file,
 436                         d->offset >> BDRV_SECTOR_BITS,
 437                         d->bytes >> BDRV_SECTOR_BITS);
 438        }
 439
 440        g_free(d);
 441    }
 442}
 443
 444static void update_refcount_discard(BlockDriverState *bs,
 445                                    uint64_t offset, uint64_t length)
 446{
 447    BDRVQcowState *s = bs->opaque;
 448    Qcow2DiscardRegion *d, *p, *next;
 449
 450    QTAILQ_FOREACH(d, &s->discards, next) {
 451        uint64_t new_start = MIN(offset, d->offset);
 452        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
 453
 454        if (new_end - new_start <= length + d->bytes) {
 455            /* There can't be any overlap, areas ending up here have no
 456             * references any more and therefore shouldn't get freed another
 457             * time. */
 458            assert(d->bytes + length == new_end - new_start);
 459            d->offset = new_start;
 460            d->bytes = new_end - new_start;
 461            goto found;
 462        }
 463    }
 464
 465    d = g_malloc(sizeof(*d));
 466    *d = (Qcow2DiscardRegion) {
 467        .bs     = bs,
 468        .offset = offset,
 469        .bytes  = length,
 470    };
 471    QTAILQ_INSERT_TAIL(&s->discards, d, next);
 472
 473found:
 474    /* Merge discard requests if they are adjacent now */
 475    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
 476        if (p == d
 477            || p->offset > d->offset + d->bytes
 478            || d->offset > p->offset + p->bytes)
 479        {
 480            continue;
 481        }
 482
 483        /* Still no overlap possible */
 484        assert(p->offset == d->offset + d->bytes
 485            || d->offset == p->offset + p->bytes);
 486
 487        QTAILQ_REMOVE(&s->discards, p, next);
 488        d->offset = MIN(d->offset, p->offset);
 489        d->bytes += p->bytes;
 490    }
 491}
 492
 493/* XXX: cache several refcount block clusters ? */
 494static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
 495    int64_t offset, int64_t length, int addend, enum qcow2_discard_type type)
 496{
 497    BDRVQcowState *s = bs->opaque;
 498    int64_t start, last, cluster_offset;
 499    uint16_t *refcount_block = NULL;
 500    int64_t old_table_index = -1;
 501    int ret;
 502
 503#ifdef DEBUG_ALLOC2
 504    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
 505           offset, length, addend);
 506#endif
 507    if (length < 0) {
 508        return -EINVAL;
 509    } else if (length == 0) {
 510        return 0;
 511    }
 512
 513    if (addend < 0) {
 514        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
 515            s->l2_table_cache);
 516    }
 517
 518    start = offset & ~(s->cluster_size - 1);
 519    last = (offset + length - 1) & ~(s->cluster_size - 1);
 520    for(cluster_offset = start; cluster_offset <= last;
 521        cluster_offset += s->cluster_size)
 522    {
 523        int block_index, refcount;
 524        int64_t cluster_index = cluster_offset >> s->cluster_bits;
 525        int64_t table_index =
 526            cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
 527
 528        /* Load the refcount block and allocate it if needed */
 529        if (table_index != old_table_index) {
 530            if (refcount_block) {
 531                ret = qcow2_cache_put(bs, s->refcount_block_cache,
 532                    (void**) &refcount_block);
 533                if (ret < 0) {
 534                    goto fail;
 535                }
 536            }
 537
 538            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
 539            if (ret < 0) {
 540                goto fail;
 541            }
 542        }
 543        old_table_index = table_index;
 544
 545        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
 546
 547        /* we can update the count and save it */
 548        block_index = cluster_index &
 549            ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
 550
 551        refcount = be16_to_cpu(refcount_block[block_index]);
 552        refcount += addend;
 553        if (refcount < 0 || refcount > 0xffff) {
 554            ret = -EINVAL;
 555            goto fail;
 556        }
 557        if (refcount == 0 && cluster_index < s->free_cluster_index) {
 558            s->free_cluster_index = cluster_index;
 559        }
 560        refcount_block[block_index] = cpu_to_be16(refcount);
 561
 562        if (refcount == 0 && s->discard_passthrough[type]) {
 563            update_refcount_discard(bs, cluster_offset, s->cluster_size);
 564        }
 565    }
 566
 567    ret = 0;
 568fail:
 569    if (!s->cache_discards) {
 570        qcow2_process_discards(bs, ret);
 571    }
 572
 573    /* Write last changed block to disk */
 574    if (refcount_block) {
 575        int wret;
 576        wret = qcow2_cache_put(bs, s->refcount_block_cache,
 577            (void**) &refcount_block);
 578        if (wret < 0) {
 579            return ret < 0 ? ret : wret;
 580        }
 581    }
 582
 583    /*
 584     * Try do undo any updates if an error is returned (This may succeed in
 585     * some cases like ENOSPC for allocating a new refcount block)
 586     */
 587    if (ret < 0) {
 588        int dummy;
 589        dummy = update_refcount(bs, offset, cluster_offset - offset, -addend,
 590                                QCOW2_DISCARD_NEVER);
 591        (void)dummy;
 592    }
 593
 594    return ret;
 595}
 596
 597/*
 598 * Increases or decreases the refcount of a given cluster by one.
 599 * addend must be 1 or -1.
 600 *
 601 * If the return value is non-negative, it is the new refcount of the cluster.
 602 * If it is negative, it is -errno and indicates an error.
 603 */
 604int qcow2_update_cluster_refcount(BlockDriverState *bs,
 605                                  int64_t cluster_index,
 606                                  int addend,
 607                                  enum qcow2_discard_type type)
 608{
 609    BDRVQcowState *s = bs->opaque;
 610    int ret;
 611
 612    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
 613                          type);
 614    if (ret < 0) {
 615        return ret;
 616    }
 617
 618    return get_refcount(bs, cluster_index);
 619}
 620
 621
 622
 623/*********************************************************/
 624/* cluster allocation functions */
 625
 626
 627
 628/* return < 0 if error */
 629static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
 630{
 631    BDRVQcowState *s = bs->opaque;
 632    int i, nb_clusters, refcount;
 633
 634    nb_clusters = size_to_clusters(s, size);
 635retry:
 636    for(i = 0; i < nb_clusters; i++) {
 637        int64_t next_cluster_index = s->free_cluster_index++;
 638        refcount = get_refcount(bs, next_cluster_index);
 639
 640        if (refcount < 0) {
 641            return refcount;
 642        } else if (refcount != 0) {
 643            goto retry;
 644        }
 645    }
 646#ifdef DEBUG_ALLOC2
 647    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
 648            size,
 649            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
 650#endif
 651    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
 652}
 653
 654int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
 655{
 656    int64_t offset;
 657    int ret;
 658
 659    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
 660    offset = alloc_clusters_noref(bs, size);
 661    if (offset < 0) {
 662        return offset;
 663    }
 664
 665    ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
 666    if (ret < 0) {
 667        return ret;
 668    }
 669
 670    return offset;
 671}
 672
 673int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
 674    int nb_clusters)
 675{
 676    BDRVQcowState *s = bs->opaque;
 677    uint64_t cluster_index;
 678    uint64_t old_free_cluster_index;
 679    int i, refcount, ret;
 680
 681    /* Check how many clusters there are free */
 682    cluster_index = offset >> s->cluster_bits;
 683    for(i = 0; i < nb_clusters; i++) {
 684        refcount = get_refcount(bs, cluster_index++);
 685
 686        if (refcount < 0) {
 687            return refcount;
 688        } else if (refcount != 0) {
 689            break;
 690        }
 691    }
 692
 693    /* And then allocate them */
 694    old_free_cluster_index = s->free_cluster_index;
 695    s->free_cluster_index = cluster_index + i;
 696
 697    ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
 698                          QCOW2_DISCARD_NEVER);
 699    if (ret < 0) {
 700        return ret;
 701    }
 702
 703    s->free_cluster_index = old_free_cluster_index;
 704
 705    return i;
 706}
 707
 708/* only used to allocate compressed sectors. We try to allocate
 709   contiguous sectors. size must be <= cluster_size */
 710int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
 711{
 712    BDRVQcowState *s = bs->opaque;
 713    int64_t offset, cluster_offset;
 714    int free_in_cluster;
 715
 716    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
 717    assert(size > 0 && size <= s->cluster_size);
 718    if (s->free_byte_offset == 0) {
 719        offset = qcow2_alloc_clusters(bs, s->cluster_size);
 720        if (offset < 0) {
 721            return offset;
 722        }
 723        s->free_byte_offset = offset;
 724    }
 725 redo:
 726    free_in_cluster = s->cluster_size -
 727        (s->free_byte_offset & (s->cluster_size - 1));
 728    if (size <= free_in_cluster) {
 729        /* enough space in current cluster */
 730        offset = s->free_byte_offset;
 731        s->free_byte_offset += size;
 732        free_in_cluster -= size;
 733        if (free_in_cluster == 0)
 734            s->free_byte_offset = 0;
 735        if ((offset & (s->cluster_size - 1)) != 0)
 736            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
 737                                          QCOW2_DISCARD_NEVER);
 738    } else {
 739        offset = qcow2_alloc_clusters(bs, s->cluster_size);
 740        if (offset < 0) {
 741            return offset;
 742        }
 743        cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
 744        if ((cluster_offset + s->cluster_size) == offset) {
 745            /* we are lucky: contiguous data */
 746            offset = s->free_byte_offset;
 747            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
 748                                          QCOW2_DISCARD_NEVER);
 749            s->free_byte_offset += size;
 750        } else {
 751            s->free_byte_offset = offset;
 752            goto redo;
 753        }
 754    }
 755
 756    /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
 757     * or explicitly by qcow2_update_cluster_refcount().  Refcount blocks must
 758     * be flushed before the caller's L2 table updates.
 759     */
 760    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
 761    return offset;
 762}
 763
 764void qcow2_free_clusters(BlockDriverState *bs,
 765                          int64_t offset, int64_t size,
 766                          enum qcow2_discard_type type)
 767{
 768    int ret;
 769
 770    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
 771    ret = update_refcount(bs, offset, size, -1, type);
 772    if (ret < 0) {
 773        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
 774        /* TODO Remember the clusters to free them later and avoid leaking */
 775    }
 776}
 777
 778/*
 779 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
 780 * normal cluster, compressed cluster, etc.)
 781 */
 782void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
 783                             int nb_clusters, enum qcow2_discard_type type)
 784{
 785    BDRVQcowState *s = bs->opaque;
 786
 787    switch (qcow2_get_cluster_type(l2_entry)) {
 788    case QCOW2_CLUSTER_COMPRESSED:
 789        {
 790            int nb_csectors;
 791            nb_csectors = ((l2_entry >> s->csize_shift) &
 792                           s->csize_mask) + 1;
 793            qcow2_free_clusters(bs,
 794                (l2_entry & s->cluster_offset_mask) & ~511,
 795                nb_csectors * 512, type);
 796        }
 797        break;
 798    case QCOW2_CLUSTER_NORMAL:
 799    case QCOW2_CLUSTER_ZERO:
 800        if (l2_entry & L2E_OFFSET_MASK) {
 801            qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
 802                                nb_clusters << s->cluster_bits, type);
 803        }
 804        break;
 805    case QCOW2_CLUSTER_UNALLOCATED:
 806        break;
 807    default:
 808        abort();
 809    }
 810}
 811
 812
 813
 814/*********************************************************/
 815/* snapshots and image creation */
 816
 817
 818
 819/* update the refcounts of snapshots and the copied flag */
 820int qcow2_update_snapshot_refcount(BlockDriverState *bs,
 821    int64_t l1_table_offset, int l1_size, int addend)
 822{
 823    BDRVQcowState *s = bs->opaque;
 824    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
 825    int64_t old_offset, old_l2_offset;
 826    int i, j, l1_modified = 0, nb_csectors, refcount;
 827    int ret;
 828
 829    l2_table = NULL;
 830    l1_table = NULL;
 831    l1_size2 = l1_size * sizeof(uint64_t);
 832
 833    s->cache_discards = true;
 834
 835    /* WARNING: qcow2_snapshot_goto relies on this function not using the
 836     * l1_table_offset when it is the current s->l1_table_offset! Be careful
 837     * when changing this! */
 838    if (l1_table_offset != s->l1_table_offset) {
 839        l1_table = g_malloc0(align_offset(l1_size2, 512));
 840        l1_allocated = 1;
 841
 842        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
 843        if (ret < 0) {
 844            goto fail;
 845        }
 846
 847        for(i = 0;i < l1_size; i++)
 848            be64_to_cpus(&l1_table[i]);
 849    } else {
 850        assert(l1_size == s->l1_size);
 851        l1_table = s->l1_table;
 852        l1_allocated = 0;
 853    }
 854
 855    for(i = 0; i < l1_size; i++) {
 856        l2_offset = l1_table[i];
 857        if (l2_offset) {
 858            old_l2_offset = l2_offset;
 859            l2_offset &= L1E_OFFSET_MASK;
 860
 861            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
 862                (void**) &l2_table);
 863            if (ret < 0) {
 864                goto fail;
 865            }
 866
 867            for(j = 0; j < s->l2_size; j++) {
 868                uint64_t cluster_index;
 869
 870                offset = be64_to_cpu(l2_table[j]);
 871                old_offset = offset;
 872                offset &= ~QCOW_OFLAG_COPIED;
 873
 874                switch (qcow2_get_cluster_type(offset)) {
 875                    case QCOW2_CLUSTER_COMPRESSED:
 876                        nb_csectors = ((offset >> s->csize_shift) &
 877                                       s->csize_mask) + 1;
 878                        if (addend != 0) {
 879                            ret = update_refcount(bs,
 880                                (offset & s->cluster_offset_mask) & ~511,
 881                                nb_csectors * 512, addend,
 882                                QCOW2_DISCARD_SNAPSHOT);
 883                            if (ret < 0) {
 884                                goto fail;
 885                            }
 886                        }
 887                        /* compressed clusters are never modified */
 888                        refcount = 2;
 889                        break;
 890
 891                    case QCOW2_CLUSTER_NORMAL:
 892                    case QCOW2_CLUSTER_ZERO:
 893                        cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
 894                        if (!cluster_index) {
 895                            /* unallocated */
 896                            refcount = 0;
 897                            break;
 898                        }
 899                        if (addend != 0) {
 900                            refcount = qcow2_update_cluster_refcount(bs,
 901                                    cluster_index, addend,
 902                                    QCOW2_DISCARD_SNAPSHOT);
 903                        } else {
 904                            refcount = get_refcount(bs, cluster_index);
 905                        }
 906
 907                        if (refcount < 0) {
 908                            ret = refcount;
 909                            goto fail;
 910                        }
 911                        break;
 912
 913                    case QCOW2_CLUSTER_UNALLOCATED:
 914                        refcount = 0;
 915                        break;
 916
 917                    default:
 918                        abort();
 919                }
 920
 921                if (refcount == 1) {
 922                    offset |= QCOW_OFLAG_COPIED;
 923                }
 924                if (offset != old_offset) {
 925                    if (addend > 0) {
 926                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
 927                            s->refcount_block_cache);
 928                    }
 929                    l2_table[j] = cpu_to_be64(offset);
 930                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
 931                }
 932            }
 933
 934            ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 935            if (ret < 0) {
 936                goto fail;
 937            }
 938
 939
 940            if (addend != 0) {
 941                refcount = qcow2_update_cluster_refcount(bs, l2_offset >>
 942                        s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT);
 943            } else {
 944                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
 945            }
 946            if (refcount < 0) {
 947                ret = refcount;
 948                goto fail;
 949            } else if (refcount == 1) {
 950                l2_offset |= QCOW_OFLAG_COPIED;
 951            }
 952            if (l2_offset != old_l2_offset) {
 953                l1_table[i] = l2_offset;
 954                l1_modified = 1;
 955            }
 956        }
 957    }
 958
 959    ret = bdrv_flush(bs);
 960fail:
 961    if (l2_table) {
 962        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 963    }
 964
 965    s->cache_discards = false;
 966    qcow2_process_discards(bs, ret);
 967
 968    /* Update L1 only if it isn't deleted anyway (addend = -1) */
 969    if (ret == 0 && addend >= 0 && l1_modified) {
 970        for (i = 0; i < l1_size; i++) {
 971            cpu_to_be64s(&l1_table[i]);
 972        }
 973
 974        ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2);
 975
 976        for (i = 0; i < l1_size; i++) {
 977            be64_to_cpus(&l1_table[i]);
 978        }
 979    }
 980    if (l1_allocated)
 981        g_free(l1_table);
 982    return ret;
 983}
 984
 985
 986
 987
 988/*********************************************************/
 989/* refcount checking functions */
 990
 991
 992
 993/*
 994 * Increases the refcount for a range of clusters in a given refcount table.
 995 * This is used to construct a temporary refcount table out of L1 and L2 tables
 996 * which can be compared the the refcount table saved in the image.
 997 *
 998 * Modifies the number of errors in res.
 999 */
1000static void inc_refcounts(BlockDriverState *bs,
1001                          BdrvCheckResult *res,
1002                          uint16_t *refcount_table,
1003                          int refcount_table_size,
1004                          int64_t offset, int64_t size)
1005{
1006    BDRVQcowState *s = bs->opaque;
1007    int64_t start, last, cluster_offset;
1008    int k;
1009
1010    if (size <= 0)
1011        return;
1012
1013    start = offset & ~(s->cluster_size - 1);
1014    last = (offset + size - 1) & ~(s->cluster_size - 1);
1015    for(cluster_offset = start; cluster_offset <= last;
1016        cluster_offset += s->cluster_size) {
1017        k = cluster_offset >> s->cluster_bits;
1018        if (k < 0) {
1019            fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
1020                cluster_offset);
1021            res->corruptions++;
1022        } else if (k >= refcount_table_size) {
1023            fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
1024                "the end of the image file, can't properly check refcounts.\n",
1025                cluster_offset);
1026            res->check_errors++;
1027        } else {
1028            if (++refcount_table[k] == 0) {
1029                fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1030                    "\n", cluster_offset);
1031                res->corruptions++;
1032            }
1033        }
1034    }
1035}
1036
1037/* Flags for check_refcounts_l1() and check_refcounts_l2() */
1038enum {
1039    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1040};
1041
1042/*
1043 * Increases the refcount in the given refcount table for the all clusters
1044 * referenced in the L2 table. While doing so, performs some checks on L2
1045 * entries.
1046 *
1047 * Returns the number of errors found by the checks or -errno if an internal
1048 * error occurred.
1049 */
1050static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1051    uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
1052    int flags)
1053{
1054    BDRVQcowState *s = bs->opaque;
1055    uint64_t *l2_table, l2_entry;
1056    uint64_t next_contiguous_offset = 0;
1057    int i, l2_size, nb_csectors;
1058
1059    /* Read L2 table from disk */
1060    l2_size = s->l2_size * sizeof(uint64_t);
1061    l2_table = g_malloc(l2_size);
1062
1063    if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
1064        goto fail;
1065
1066    /* Do the actual checks */
1067    for(i = 0; i < s->l2_size; i++) {
1068        l2_entry = be64_to_cpu(l2_table[i]);
1069
1070        switch (qcow2_get_cluster_type(l2_entry)) {
1071        case QCOW2_CLUSTER_COMPRESSED:
1072            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1073            if (l2_entry & QCOW_OFLAG_COPIED) {
1074                fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1075                    "copied flag must never be set for compressed "
1076                    "clusters\n", l2_entry >> s->cluster_bits);
1077                l2_entry &= ~QCOW_OFLAG_COPIED;
1078                res->corruptions++;
1079            }
1080
1081            /* Mark cluster as used */
1082            nb_csectors = ((l2_entry >> s->csize_shift) &
1083                           s->csize_mask) + 1;
1084            l2_entry &= s->cluster_offset_mask;
1085            inc_refcounts(bs, res, refcount_table, refcount_table_size,
1086                l2_entry & ~511, nb_csectors * 512);
1087
1088            if (flags & CHECK_FRAG_INFO) {
1089                res->bfi.allocated_clusters++;
1090                res->bfi.compressed_clusters++;
1091
1092                /* Compressed clusters are fragmented by nature.  Since they
1093                 * take up sub-sector space but we only have sector granularity
1094                 * I/O we need to re-read the same sectors even for adjacent
1095                 * compressed clusters.
1096                 */
1097                res->bfi.fragmented_clusters++;
1098            }
1099            break;
1100
1101        case QCOW2_CLUSTER_ZERO:
1102            if ((l2_entry & L2E_OFFSET_MASK) == 0) {
1103                break;
1104            }
1105            /* fall through */
1106
1107        case QCOW2_CLUSTER_NORMAL:
1108        {
1109            uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1110
1111            if (flags & CHECK_FRAG_INFO) {
1112                res->bfi.allocated_clusters++;
1113                if (next_contiguous_offset &&
1114                    offset != next_contiguous_offset) {
1115                    res->bfi.fragmented_clusters++;
1116                }
1117                next_contiguous_offset = offset + s->cluster_size;
1118            }
1119
1120            /* Mark cluster as used */
1121            inc_refcounts(bs, res, refcount_table,refcount_table_size,
1122                offset, s->cluster_size);
1123
1124            /* Correct offsets are cluster aligned */
1125            if (offset & (s->cluster_size - 1)) {
1126                fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
1127                    "properly aligned; L2 entry corrupted.\n", offset);
1128                res->corruptions++;
1129            }
1130            break;
1131        }
1132
1133        case QCOW2_CLUSTER_UNALLOCATED:
1134            break;
1135
1136        default:
1137            abort();
1138        }
1139    }
1140
1141    g_free(l2_table);
1142    return 0;
1143
1144fail:
1145    fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1146    g_free(l2_table);
1147    return -EIO;
1148}
1149
1150/*
1151 * Increases the refcount for the L1 table, its L2 tables and all referenced
1152 * clusters in the given refcount table. While doing so, performs some checks
1153 * on L1 and L2 entries.
1154 *
1155 * Returns the number of errors found by the checks or -errno if an internal
1156 * error occurred.
1157 */
1158static int check_refcounts_l1(BlockDriverState *bs,
1159                              BdrvCheckResult *res,
1160                              uint16_t *refcount_table,
1161                              int refcount_table_size,
1162                              int64_t l1_table_offset, int l1_size,
1163                              int flags)
1164{
1165    BDRVQcowState *s = bs->opaque;
1166    uint64_t *l1_table, l2_offset, l1_size2;
1167    int i, ret;
1168
1169    l1_size2 = l1_size * sizeof(uint64_t);
1170
1171    /* Mark L1 table as used */
1172    inc_refcounts(bs, res, refcount_table, refcount_table_size,
1173        l1_table_offset, l1_size2);
1174
1175    /* Read L1 table entries from disk */
1176    if (l1_size2 == 0) {
1177        l1_table = NULL;
1178    } else {
1179        l1_table = g_malloc(l1_size2);
1180        if (bdrv_pread(bs->file, l1_table_offset,
1181                       l1_table, l1_size2) != l1_size2)
1182            goto fail;
1183        for(i = 0;i < l1_size; i++)
1184            be64_to_cpus(&l1_table[i]);
1185    }
1186
1187    /* Do the actual checks */
1188    for(i = 0; i < l1_size; i++) {
1189        l2_offset = l1_table[i];
1190        if (l2_offset) {
1191            /* Mark L2 table as used */
1192            l2_offset &= L1E_OFFSET_MASK;
1193            inc_refcounts(bs, res, refcount_table, refcount_table_size,
1194                l2_offset, s->cluster_size);
1195
1196            /* L2 tables are cluster aligned */
1197            if (l2_offset & (s->cluster_size - 1)) {
1198                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1199                    "cluster aligned; L1 entry corrupted\n", l2_offset);
1200                res->corruptions++;
1201            }
1202
1203            /* Process and check L2 entries */
1204            ret = check_refcounts_l2(bs, res, refcount_table,
1205                                     refcount_table_size, l2_offset, flags);
1206            if (ret < 0) {
1207                goto fail;
1208            }
1209        }
1210    }
1211    g_free(l1_table);
1212    return 0;
1213
1214fail:
1215    fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1216    res->check_errors++;
1217    g_free(l1_table);
1218    return -EIO;
1219}
1220
1221/*
1222 * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1223 *
1224 * This function does not print an error message nor does it increment
1225 * check_errors if get_refcount fails (this is because such an error will have
1226 * been already detected and sufficiently signaled by the calling function
1227 * (qcow2_check_refcounts) by the time this function is called).
1228 */
1229static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1230                              BdrvCheckMode fix)
1231{
1232    BDRVQcowState *s = bs->opaque;
1233    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1234    int ret;
1235    int refcount;
1236    int i, j;
1237
1238    for (i = 0; i < s->l1_size; i++) {
1239        uint64_t l1_entry = s->l1_table[i];
1240        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1241        bool l2_dirty = false;
1242
1243        if (!l2_offset) {
1244            continue;
1245        }
1246
1247        refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
1248        if (refcount < 0) {
1249            /* don't print message nor increment check_errors */
1250            continue;
1251        }
1252        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1253            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1254                    "l1_entry=%" PRIx64 " refcount=%d\n",
1255                    fix & BDRV_FIX_ERRORS ? "Repairing" :
1256                                            "ERROR",
1257                    i, l1_entry, refcount);
1258            if (fix & BDRV_FIX_ERRORS) {
1259                s->l1_table[i] = refcount == 1
1260                               ? l1_entry |  QCOW_OFLAG_COPIED
1261                               : l1_entry & ~QCOW_OFLAG_COPIED;
1262                ret = qcow2_write_l1_entry(bs, i);
1263                if (ret < 0) {
1264                    res->check_errors++;
1265                    goto fail;
1266                }
1267                res->corruptions_fixed++;
1268            } else {
1269                res->corruptions++;
1270            }
1271        }
1272
1273        ret = bdrv_pread(bs->file, l2_offset, l2_table,
1274                         s->l2_size * sizeof(uint64_t));
1275        if (ret < 0) {
1276            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1277                    strerror(-ret));
1278            res->check_errors++;
1279            goto fail;
1280        }
1281
1282        for (j = 0; j < s->l2_size; j++) {
1283            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1284            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1285            int cluster_type = qcow2_get_cluster_type(l2_entry);
1286
1287            if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
1288                ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
1289                refcount = get_refcount(bs, data_offset >> s->cluster_bits);
1290                if (refcount < 0) {
1291                    /* don't print message nor increment check_errors */
1292                    continue;
1293                }
1294                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1295                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1296                            "l2_entry=%" PRIx64 " refcount=%d\n",
1297                            fix & BDRV_FIX_ERRORS ? "Repairing" :
1298                                                    "ERROR",
1299                            l2_entry, refcount);
1300                    if (fix & BDRV_FIX_ERRORS) {
1301                        l2_table[j] = cpu_to_be64(refcount == 1
1302                                    ? l2_entry |  QCOW_OFLAG_COPIED
1303                                    : l2_entry & ~QCOW_OFLAG_COPIED);
1304                        l2_dirty = true;
1305                        res->corruptions_fixed++;
1306                    } else {
1307                        res->corruptions++;
1308                    }
1309                }
1310            }
1311        }
1312
1313        if (l2_dirty) {
1314            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1315                                                l2_offset, s->cluster_size);
1316            if (ret < 0) {
1317                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1318                        "overlap check failed: %s\n", strerror(-ret));
1319                res->check_errors++;
1320                goto fail;
1321            }
1322
1323            ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size);
1324            if (ret < 0) {
1325                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1326                        strerror(-ret));
1327                res->check_errors++;
1328                goto fail;
1329            }
1330        }
1331    }
1332
1333    ret = 0;
1334
1335fail:
1336    qemu_vfree(l2_table);
1337    return ret;
1338}
1339
1340/*
1341 * Writes one sector of the refcount table to the disk
1342 */
1343#define RT_ENTRIES_PER_SECTOR (512 / sizeof(uint64_t))
1344static int write_reftable_entry(BlockDriverState *bs, int rt_index)
1345{
1346    BDRVQcowState *s = bs->opaque;
1347    uint64_t buf[RT_ENTRIES_PER_SECTOR];
1348    int rt_start_index;
1349    int i, ret;
1350
1351    rt_start_index = rt_index & ~(RT_ENTRIES_PER_SECTOR - 1);
1352    for (i = 0; i < RT_ENTRIES_PER_SECTOR; i++) {
1353        buf[i] = cpu_to_be64(s->refcount_table[rt_start_index + i]);
1354    }
1355
1356    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_TABLE,
1357            s->refcount_table_offset + rt_start_index * sizeof(uint64_t),
1358            sizeof(buf));
1359    if (ret < 0) {
1360        return ret;
1361    }
1362
1363    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
1364    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
1365            rt_start_index * sizeof(uint64_t), buf, sizeof(buf));
1366    if (ret < 0) {
1367        return ret;
1368    }
1369
1370    return 0;
1371}
1372
1373/*
1374 * Allocates a new cluster for the given refcount block (represented by its
1375 * offset in the image file) and copies the current content there. This function
1376 * does _not_ decrement the reference count for the currently occupied cluster.
1377 *
1378 * This function prints an informative message to stderr on error (and returns
1379 * -errno); on success, 0 is returned.
1380 */
1381static int64_t realloc_refcount_block(BlockDriverState *bs, int reftable_index,
1382                                      uint64_t offset)
1383{
1384    BDRVQcowState *s = bs->opaque;
1385    int64_t new_offset = 0;
1386    void *refcount_block = NULL;
1387    int ret;
1388
1389    /* allocate new refcount block */
1390    new_offset = qcow2_alloc_clusters(bs, s->cluster_size);
1391    if (new_offset < 0) {
1392        fprintf(stderr, "Could not allocate new cluster: %s\n",
1393                strerror(-new_offset));
1394        ret = new_offset;
1395        goto fail;
1396    }
1397
1398    /* fetch current refcount block content */
1399    ret = qcow2_cache_get(bs, s->refcount_block_cache, offset, &refcount_block);
1400    if (ret < 0) {
1401        fprintf(stderr, "Could not fetch refcount block: %s\n", strerror(-ret));
1402        goto fail;
1403    }
1404
1405    /* new block has not yet been entered into refcount table, therefore it is
1406     * no refcount block yet (regarding this check) */
1407    ret = qcow2_pre_write_overlap_check(bs, 0, new_offset, s->cluster_size);
1408    if (ret < 0) {
1409        fprintf(stderr, "Could not write refcount block; metadata overlap "
1410                "check failed: %s\n", strerror(-ret));
1411        /* the image will be marked corrupt, so don't even attempt on freeing
1412         * the cluster */
1413        new_offset = 0;
1414        goto fail;
1415    }
1416
1417    /* write to new block */
1418    ret = bdrv_write(bs->file, new_offset / BDRV_SECTOR_SIZE, refcount_block,
1419            s->cluster_sectors);
1420    if (ret < 0) {
1421        fprintf(stderr, "Could not write refcount block: %s\n", strerror(-ret));
1422        goto fail;
1423    }
1424
1425    /* update refcount table */
1426    assert(!(new_offset & (s->cluster_size - 1)));
1427    s->refcount_table[reftable_index] = new_offset;
1428    ret = write_reftable_entry(bs, reftable_index);
1429    if (ret < 0) {
1430        fprintf(stderr, "Could not update refcount table: %s\n",
1431                strerror(-ret));
1432        goto fail;
1433    }
1434
1435fail:
1436    if (new_offset && (ret < 0)) {
1437        qcow2_free_clusters(bs, new_offset, s->cluster_size,
1438                QCOW2_DISCARD_ALWAYS);
1439    }
1440    if (refcount_block) {
1441        if (ret < 0) {
1442            qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
1443        } else {
1444            ret = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
1445        }
1446    }
1447    if (ret < 0) {
1448        return ret;
1449    }
1450    return new_offset;
1451}
1452
1453/*
1454 * Checks an image for refcount consistency.
1455 *
1456 * Returns 0 if no errors are found, the number of errors in case the image is
1457 * detected as corrupted, and -errno when an internal error occurred.
1458 */
1459int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1460                          BdrvCheckMode fix)
1461{
1462    BDRVQcowState *s = bs->opaque;
1463    int64_t size, i, highest_cluster;
1464    int nb_clusters, refcount1, refcount2;
1465    QCowSnapshot *sn;
1466    uint16_t *refcount_table;
1467    int ret;
1468
1469    size = bdrv_getlength(bs->file);
1470    nb_clusters = size_to_clusters(s, size);
1471    refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t));
1472
1473    res->bfi.total_clusters =
1474        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
1475
1476    /* header */
1477    inc_refcounts(bs, res, refcount_table, nb_clusters,
1478        0, s->cluster_size);
1479
1480    /* current L1 table */
1481    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1482                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1483    if (ret < 0) {
1484        goto fail;
1485    }
1486
1487    /* snapshots */
1488    for(i = 0; i < s->nb_snapshots; i++) {
1489        sn = s->snapshots + i;
1490        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1491            sn->l1_table_offset, sn->l1_size, 0);
1492        if (ret < 0) {
1493            goto fail;
1494        }
1495    }
1496    inc_refcounts(bs, res, refcount_table, nb_clusters,
1497        s->snapshots_offset, s->snapshots_size);
1498
1499    /* refcount data */
1500    inc_refcounts(bs, res, refcount_table, nb_clusters,
1501        s->refcount_table_offset,
1502        s->refcount_table_size * sizeof(uint64_t));
1503
1504    for(i = 0; i < s->refcount_table_size; i++) {
1505        uint64_t offset, cluster;
1506        offset = s->refcount_table[i];
1507        cluster = offset >> s->cluster_bits;
1508
1509        /* Refcount blocks are cluster aligned */
1510        if (offset & (s->cluster_size - 1)) {
1511            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1512                "cluster aligned; refcount table entry corrupted\n", i);
1513            res->corruptions++;
1514            continue;
1515        }
1516
1517        if (cluster >= nb_clusters) {
1518            fprintf(stderr, "ERROR refcount block %" PRId64
1519                    " is outside image\n", i);
1520            res->corruptions++;
1521            continue;
1522        }
1523
1524        if (offset != 0) {
1525            inc_refcounts(bs, res, refcount_table, nb_clusters,
1526                offset, s->cluster_size);
1527            if (refcount_table[cluster] != 1) {
1528                fprintf(stderr, "%s refcount block %" PRId64
1529                    " refcount=%d\n",
1530                    fix & BDRV_FIX_ERRORS ? "Repairing" :
1531                                            "ERROR",
1532                    i, refcount_table[cluster]);
1533
1534                if (fix & BDRV_FIX_ERRORS) {
1535                    int64_t new_offset;
1536
1537                    new_offset = realloc_refcount_block(bs, i, offset);
1538                    if (new_offset < 0) {
1539                        res->corruptions++;
1540                        continue;
1541                    }
1542
1543                    /* update refcounts */
1544                    if ((new_offset >> s->cluster_bits) >= nb_clusters) {
1545                        /* increase refcount_table size if necessary */
1546                        int old_nb_clusters = nb_clusters;
1547                        nb_clusters = (new_offset >> s->cluster_bits) + 1;
1548                        refcount_table = g_realloc(refcount_table,
1549                                nb_clusters * sizeof(uint16_t));
1550                        memset(&refcount_table[old_nb_clusters], 0, (nb_clusters
1551                                - old_nb_clusters) * sizeof(uint16_t));
1552                    }
1553                    refcount_table[cluster]--;
1554                    inc_refcounts(bs, res, refcount_table, nb_clusters,
1555                            new_offset, s->cluster_size);
1556
1557                    res->corruptions_fixed++;
1558                } else {
1559                    res->corruptions++;
1560                }
1561            }
1562        }
1563    }
1564
1565    /* compare ref counts */
1566    for (i = 0, highest_cluster = 0; i < nb_clusters; i++) {
1567        refcount1 = get_refcount(bs, i);
1568        if (refcount1 < 0) {
1569            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
1570                i, strerror(-refcount1));
1571            res->check_errors++;
1572            continue;
1573        }
1574
1575        refcount2 = refcount_table[i];
1576
1577        if (refcount1 > 0 || refcount2 > 0) {
1578            highest_cluster = i;
1579        }
1580
1581        if (refcount1 != refcount2) {
1582
1583            /* Check if we're allowed to fix the mismatch */
1584            int *num_fixed = NULL;
1585            if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
1586                num_fixed = &res->leaks_fixed;
1587            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
1588                num_fixed = &res->corruptions_fixed;
1589            }
1590
1591            fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n",
1592                   num_fixed != NULL     ? "Repairing" :
1593                   refcount1 < refcount2 ? "ERROR" :
1594                                           "Leaked",
1595                   i, refcount1, refcount2);
1596
1597            if (num_fixed) {
1598                ret = update_refcount(bs, i << s->cluster_bits, 1,
1599                                      refcount2 - refcount1,
1600                                      QCOW2_DISCARD_ALWAYS);
1601                if (ret >= 0) {
1602                    (*num_fixed)++;
1603                    continue;
1604                }
1605            }
1606
1607            /* And if we couldn't, print an error */
1608            if (refcount1 < refcount2) {
1609                res->corruptions++;
1610            } else {
1611                res->leaks++;
1612            }
1613        }
1614    }
1615
1616    /* check OFLAG_COPIED */
1617    ret = check_oflag_copied(bs, res, fix);
1618    if (ret < 0) {
1619        goto fail;
1620    }
1621
1622    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
1623    ret = 0;
1624
1625fail:
1626    g_free(refcount_table);
1627
1628    return ret;
1629}
1630
1631#define overlaps_with(ofs, sz) \
1632    ranges_overlap(offset, size, ofs, sz)
1633
1634/*
1635 * Checks if the given offset into the image file is actually free to use by
1636 * looking for overlaps with important metadata sections (L1/L2 tables etc.),
1637 * i.e. a sanity check without relying on the refcount tables.
1638 *
1639 * The ign parameter specifies what checks not to perform (being a bitmask of
1640 * QCow2MetadataOverlap values), i.e., what sections to ignore.
1641 *
1642 * Returns:
1643 * - 0 if writing to this offset will not affect the mentioned metadata
1644 * - a positive QCow2MetadataOverlap value indicating one overlapping section
1645 * - a negative value (-errno) indicating an error while performing a check,
1646 *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
1647 */
1648int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
1649                                 int64_t size)
1650{
1651    BDRVQcowState *s = bs->opaque;
1652    int chk = s->overlap_check & ~ign;
1653    int i, j;
1654
1655    if (!size) {
1656        return 0;
1657    }
1658
1659    if (chk & QCOW2_OL_MAIN_HEADER) {
1660        if (offset < s->cluster_size) {
1661            return QCOW2_OL_MAIN_HEADER;
1662        }
1663    }
1664
1665    /* align range to test to cluster boundaries */
1666    size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
1667    offset = start_of_cluster(s, offset);
1668
1669    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
1670        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
1671            return QCOW2_OL_ACTIVE_L1;
1672        }
1673    }
1674
1675    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
1676        if (overlaps_with(s->refcount_table_offset,
1677            s->refcount_table_size * sizeof(uint64_t))) {
1678            return QCOW2_OL_REFCOUNT_TABLE;
1679        }
1680    }
1681
1682    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
1683        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
1684            return QCOW2_OL_SNAPSHOT_TABLE;
1685        }
1686    }
1687
1688    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
1689        for (i = 0; i < s->nb_snapshots; i++) {
1690            if (s->snapshots[i].l1_size &&
1691                overlaps_with(s->snapshots[i].l1_table_offset,
1692                s->snapshots[i].l1_size * sizeof(uint64_t))) {
1693                return QCOW2_OL_INACTIVE_L1;
1694            }
1695        }
1696    }
1697
1698    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
1699        for (i = 0; i < s->l1_size; i++) {
1700            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
1701                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
1702                s->cluster_size)) {
1703                return QCOW2_OL_ACTIVE_L2;
1704            }
1705        }
1706    }
1707
1708    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
1709        for (i = 0; i < s->refcount_table_size; i++) {
1710            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
1711                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
1712                s->cluster_size)) {
1713                return QCOW2_OL_REFCOUNT_BLOCK;
1714            }
1715        }
1716    }
1717
1718    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
1719        for (i = 0; i < s->nb_snapshots; i++) {
1720            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
1721            uint32_t l1_sz  = s->snapshots[i].l1_size;
1722            uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
1723            uint64_t *l1 = g_malloc(l1_sz2);
1724            int ret;
1725
1726            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
1727            if (ret < 0) {
1728                g_free(l1);
1729                return ret;
1730            }
1731
1732            for (j = 0; j < l1_sz; j++) {
1733                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
1734                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
1735                    g_free(l1);
1736                    return QCOW2_OL_INACTIVE_L2;
1737                }
1738            }
1739
1740            g_free(l1);
1741        }
1742    }
1743
1744    return 0;
1745}
1746
1747static const char *metadata_ol_names[] = {
1748    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
1749    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
1750    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
1751    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
1752    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
1753    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
1754    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
1755    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
1756};
1757
1758/*
1759 * First performs a check for metadata overlaps (through
1760 * qcow2_check_metadata_overlap); if that fails with a negative value (error
1761 * while performing a check), that value is returned. If an impending overlap
1762 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
1763 * and -EIO returned.
1764 *
1765 * Returns 0 if there were neither overlaps nor errors while checking for
1766 * overlaps; or a negative value (-errno) on error.
1767 */
1768int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
1769                                  int64_t size)
1770{
1771    int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
1772
1773    if (ret < 0) {
1774        return ret;
1775    } else if (ret > 0) {
1776        int metadata_ol_bitnr = ffs(ret) - 1;
1777        char *message;
1778        QObject *data;
1779
1780        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
1781
1782        fprintf(stderr, "qcow2: Preventing invalid write on metadata (overlaps "
1783                "with %s); image marked as corrupt.\n",
1784                metadata_ol_names[metadata_ol_bitnr]);
1785        message = g_strdup_printf("Prevented %s overwrite",
1786                metadata_ol_names[metadata_ol_bitnr]);
1787        data = qobject_from_jsonf("{ 'device': %s, 'msg': %s, 'offset': %"
1788                PRId64 ", 'size': %" PRId64 " }", bs->device_name, message,
1789                offset, size);
1790        monitor_protocol_event(QEVENT_BLOCK_IMAGE_CORRUPTED, data);
1791        g_free(message);
1792        qobject_decref(data);
1793
1794        qcow2_mark_corrupt(bs);
1795        bs->drv = NULL; /* make BDS unusable */
1796        return -EIO;
1797    }
1798
1799    return 0;
1800}
1801