qemu/block/qcow2-cluster.c
<<
>>
Prefs
   1/*
   2 * Block driver for the QCOW version 2 format
   3 *
   4 * Copyright (c) 2004-2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include <zlib.h>
  26
  27#include "qemu-common.h"
  28#include "block/block_int.h"
  29#include "block/qcow2.h"
  30#include "trace.h"
  31
  32int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
  33                        bool exact_size)
  34{
  35    BDRVQcow2State *s = bs->opaque;
  36    int new_l1_size2, ret, i;
  37    uint64_t *new_l1_table;
  38    int64_t old_l1_table_offset, old_l1_size;
  39    int64_t new_l1_table_offset, new_l1_size;
  40    uint8_t data[12];
  41
  42    if (min_size <= s->l1_size)
  43        return 0;
  44
  45    /* Do a sanity check on min_size before trying to calculate new_l1_size
  46     * (this prevents overflows during the while loop for the calculation of
  47     * new_l1_size) */
  48    if (min_size > INT_MAX / sizeof(uint64_t)) {
  49        return -EFBIG;
  50    }
  51
  52    if (exact_size) {
  53        new_l1_size = min_size;
  54    } else {
  55        /* Bump size up to reduce the number of times we have to grow */
  56        new_l1_size = s->l1_size;
  57        if (new_l1_size == 0) {
  58            new_l1_size = 1;
  59        }
  60        while (min_size > new_l1_size) {
  61            new_l1_size = (new_l1_size * 3 + 1) / 2;
  62        }
  63    }
  64
  65    if (new_l1_size > INT_MAX / sizeof(uint64_t)) {
  66        return -EFBIG;
  67    }
  68
  69#ifdef DEBUG_ALLOC2
  70    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
  71            s->l1_size, new_l1_size);
  72#endif
  73
  74    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
  75    new_l1_table = qemu_try_blockalign(bs->file->bs,
  76                                       align_offset(new_l1_size2, 512));
  77    if (new_l1_table == NULL) {
  78        return -ENOMEM;
  79    }
  80    memset(new_l1_table, 0, align_offset(new_l1_size2, 512));
  81
  82    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
  83
  84    /* write new table (align to cluster) */
  85    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
  86    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
  87    if (new_l1_table_offset < 0) {
  88        qemu_vfree(new_l1_table);
  89        return new_l1_table_offset;
  90    }
  91
  92    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
  93    if (ret < 0) {
  94        goto fail;
  95    }
  96
  97    /* the L1 position has not yet been updated, so these clusters must
  98     * indeed be completely free */
  99    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
 100                                        new_l1_size2);
 101    if (ret < 0) {
 102        goto fail;
 103    }
 104
 105    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
 106    for(i = 0; i < s->l1_size; i++)
 107        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
 108    ret = bdrv_pwrite_sync(bs->file->bs, new_l1_table_offset,
 109                           new_l1_table, new_l1_size2);
 110    if (ret < 0)
 111        goto fail;
 112    for(i = 0; i < s->l1_size; i++)
 113        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
 114
 115    /* set new table */
 116    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
 117    cpu_to_be32w((uint32_t*)data, new_l1_size);
 118    stq_be_p(data + 4, new_l1_table_offset);
 119    ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, l1_size),
 120                           data, sizeof(data));
 121    if (ret < 0) {
 122        goto fail;
 123    }
 124    qemu_vfree(s->l1_table);
 125    old_l1_table_offset = s->l1_table_offset;
 126    s->l1_table_offset = new_l1_table_offset;
 127    s->l1_table = new_l1_table;
 128    old_l1_size = s->l1_size;
 129    s->l1_size = new_l1_size;
 130    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t),
 131                        QCOW2_DISCARD_OTHER);
 132    return 0;
 133 fail:
 134    qemu_vfree(new_l1_table);
 135    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
 136                        QCOW2_DISCARD_OTHER);
 137    return ret;
 138}
 139
 140/*
 141 * l2_load
 142 *
 143 * Loads a L2 table into memory. If the table is in the cache, the cache
 144 * is used; otherwise the L2 table is loaded from the image file.
 145 *
 146 * Returns a pointer to the L2 table on success, or NULL if the read from
 147 * the image file failed.
 148 */
 149
 150static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
 151    uint64_t **l2_table)
 152{
 153    BDRVQcow2State *s = bs->opaque;
 154    int ret;
 155
 156    ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table);
 157
 158    return ret;
 159}
 160
 161/*
 162 * Writes one sector of the L1 table to the disk (can't update single entries
 163 * and we really don't want bdrv_pread to perform a read-modify-write)
 164 */
 165#define L1_ENTRIES_PER_SECTOR (512 / 8)
 166int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
 167{
 168    BDRVQcow2State *s = bs->opaque;
 169    uint64_t buf[L1_ENTRIES_PER_SECTOR] = { 0 };
 170    int l1_start_index;
 171    int i, ret;
 172
 173    l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
 174    for (i = 0; i < L1_ENTRIES_PER_SECTOR && l1_start_index + i < s->l1_size;
 175         i++)
 176    {
 177        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
 178    }
 179
 180    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1,
 181            s->l1_table_offset + 8 * l1_start_index, sizeof(buf));
 182    if (ret < 0) {
 183        return ret;
 184    }
 185
 186    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
 187    ret = bdrv_pwrite_sync(bs->file->bs,
 188                           s->l1_table_offset + 8 * l1_start_index,
 189                           buf, sizeof(buf));
 190    if (ret < 0) {
 191        return ret;
 192    }
 193
 194    return 0;
 195}
 196
 197/*
 198 * l2_allocate
 199 *
 200 * Allocate a new l2 entry in the file. If l1_index points to an already
 201 * used entry in the L2 table (i.e. we are doing a copy on write for the L2
 202 * table) copy the contents of the old L2 table into the newly allocated one.
 203 * Otherwise the new table is initialized with zeros.
 204 *
 205 */
 206
 207static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
 208{
 209    BDRVQcow2State *s = bs->opaque;
 210    uint64_t old_l2_offset;
 211    uint64_t *l2_table = NULL;
 212    int64_t l2_offset;
 213    int ret;
 214
 215    old_l2_offset = s->l1_table[l1_index];
 216
 217    trace_qcow2_l2_allocate(bs, l1_index);
 218
 219    /* allocate a new l2 entry */
 220
 221    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
 222    if (l2_offset < 0) {
 223        ret = l2_offset;
 224        goto fail;
 225    }
 226
 227    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 228    if (ret < 0) {
 229        goto fail;
 230    }
 231
 232    /* allocate a new entry in the l2 cache */
 233
 234    trace_qcow2_l2_allocate_get_empty(bs, l1_index);
 235    ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
 236    if (ret < 0) {
 237        goto fail;
 238    }
 239
 240    l2_table = *table;
 241
 242    if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
 243        /* if there was no old l2 table, clear the new table */
 244        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
 245    } else {
 246        uint64_t* old_table;
 247
 248        /* if there was an old l2 table, read it from the disk */
 249        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
 250        ret = qcow2_cache_get(bs, s->l2_table_cache,
 251            old_l2_offset & L1E_OFFSET_MASK,
 252            (void**) &old_table);
 253        if (ret < 0) {
 254            goto fail;
 255        }
 256
 257        memcpy(l2_table, old_table, s->cluster_size);
 258
 259        qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table);
 260    }
 261
 262    /* write the l2 table to the file */
 263    BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
 264
 265    trace_qcow2_l2_allocate_write_l2(bs, l1_index);
 266    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
 267    ret = qcow2_cache_flush(bs, s->l2_table_cache);
 268    if (ret < 0) {
 269        goto fail;
 270    }
 271
 272    /* update the L1 entry */
 273    trace_qcow2_l2_allocate_write_l1(bs, l1_index);
 274    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
 275    ret = qcow2_write_l1_entry(bs, l1_index);
 276    if (ret < 0) {
 277        goto fail;
 278    }
 279
 280    *table = l2_table;
 281    trace_qcow2_l2_allocate_done(bs, l1_index, 0);
 282    return 0;
 283
 284fail:
 285    trace_qcow2_l2_allocate_done(bs, l1_index, ret);
 286    if (l2_table != NULL) {
 287        qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
 288    }
 289    s->l1_table[l1_index] = old_l2_offset;
 290    if (l2_offset > 0) {
 291        qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
 292                            QCOW2_DISCARD_ALWAYS);
 293    }
 294    return ret;
 295}
 296
 297/*
 298 * Checks how many clusters in a given L2 table are contiguous in the image
 299 * file. As soon as one of the flags in the bitmask stop_flags changes compared
 300 * to the first cluster, the search is stopped and the cluster is not counted
 301 * as contiguous. (This allows it, for example, to stop at the first compressed
 302 * cluster which may require a different handling)
 303 */
 304static int count_contiguous_clusters(int nb_clusters, int cluster_size,
 305        uint64_t *l2_table, uint64_t stop_flags)
 306{
 307    int i;
 308    uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED;
 309    uint64_t first_entry = be64_to_cpu(l2_table[0]);
 310    uint64_t offset = first_entry & mask;
 311
 312    if (!offset)
 313        return 0;
 314
 315    assert(qcow2_get_cluster_type(first_entry) == QCOW2_CLUSTER_NORMAL);
 316
 317    for (i = 0; i < nb_clusters; i++) {
 318        uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
 319        if (offset + (uint64_t) i * cluster_size != l2_entry) {
 320            break;
 321        }
 322    }
 323
 324        return i;
 325}
 326
 327static int count_contiguous_clusters_by_type(int nb_clusters,
 328                                             uint64_t *l2_table,
 329                                             int wanted_type)
 330{
 331    int i;
 332
 333    for (i = 0; i < nb_clusters; i++) {
 334        int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i]));
 335
 336        if (type != wanted_type) {
 337            break;
 338        }
 339    }
 340
 341    return i;
 342}
 343
 344/* The crypt function is compatible with the linux cryptoloop
 345   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
 346   supported */
 347int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
 348                          uint8_t *out_buf, const uint8_t *in_buf,
 349                          int nb_sectors, bool enc,
 350                          Error **errp)
 351{
 352    union {
 353        uint64_t ll[2];
 354        uint8_t b[16];
 355    } ivec;
 356    int i;
 357    int ret;
 358
 359    for(i = 0; i < nb_sectors; i++) {
 360        ivec.ll[0] = cpu_to_le64(sector_num);
 361        ivec.ll[1] = 0;
 362        if (qcrypto_cipher_setiv(s->cipher,
 363                                 ivec.b, G_N_ELEMENTS(ivec.b),
 364                                 errp) < 0) {
 365            return -1;
 366        }
 367        if (enc) {
 368            ret = qcrypto_cipher_encrypt(s->cipher,
 369                                         in_buf,
 370                                         out_buf,
 371                                         512,
 372                                         errp);
 373        } else {
 374            ret = qcrypto_cipher_decrypt(s->cipher,
 375                                         in_buf,
 376                                         out_buf,
 377                                         512,
 378                                         errp);
 379        }
 380        if (ret < 0) {
 381            return -1;
 382        }
 383        sector_num++;
 384        in_buf += 512;
 385        out_buf += 512;
 386    }
 387    return 0;
 388}
 389
 390static int coroutine_fn copy_sectors(BlockDriverState *bs,
 391                                     uint64_t start_sect,
 392                                     uint64_t cluster_offset,
 393                                     int n_start, int n_end)
 394{
 395    BDRVQcow2State *s = bs->opaque;
 396    QEMUIOVector qiov;
 397    struct iovec iov;
 398    int n, ret;
 399
 400    n = n_end - n_start;
 401    if (n <= 0) {
 402        return 0;
 403    }
 404
 405    iov.iov_len = n * BDRV_SECTOR_SIZE;
 406    iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
 407    if (iov.iov_base == NULL) {
 408        return -ENOMEM;
 409    }
 410
 411    qemu_iovec_init_external(&qiov, &iov, 1);
 412
 413    BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
 414
 415    if (!bs->drv) {
 416        ret = -ENOMEDIUM;
 417        goto out;
 418    }
 419
 420    /* Call .bdrv_co_readv() directly instead of using the public block-layer
 421     * interface.  This avoids double I/O throttling and request tracking,
 422     * which can lead to deadlock when block layer copy-on-read is enabled.
 423     */
 424    ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov);
 425    if (ret < 0) {
 426        goto out;
 427    }
 428
 429    if (bs->encrypted) {
 430        Error *err = NULL;
 431        assert(s->cipher);
 432        if (qcow2_encrypt_sectors(s, start_sect + n_start,
 433                                  iov.iov_base, iov.iov_base, n,
 434                                  true, &err) < 0) {
 435            ret = -EIO;
 436            error_free(err);
 437            goto out;
 438        }
 439    }
 440
 441    ret = qcow2_pre_write_overlap_check(bs, 0,
 442            cluster_offset + n_start * BDRV_SECTOR_SIZE, n * BDRV_SECTOR_SIZE);
 443    if (ret < 0) {
 444        goto out;
 445    }
 446
 447    BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
 448    ret = bdrv_co_writev(bs->file->bs, (cluster_offset >> 9) + n_start, n,
 449                         &qiov);
 450    if (ret < 0) {
 451        goto out;
 452    }
 453
 454    ret = 0;
 455out:
 456    qemu_vfree(iov.iov_base);
 457    return ret;
 458}
 459
 460
 461/*
 462 * get_cluster_offset
 463 *
 464 * For a given offset of the disk image, find the cluster offset in
 465 * qcow2 file. The offset is stored in *cluster_offset.
 466 *
 467 * on entry, *num is the number of contiguous sectors we'd like to
 468 * access following offset.
 469 *
 470 * on exit, *num is the number of contiguous sectors we can read.
 471 *
 472 * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
 473 * cases.
 474 */
 475int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
 476    int *num, uint64_t *cluster_offset)
 477{
 478    BDRVQcow2State *s = bs->opaque;
 479    unsigned int l2_index;
 480    uint64_t l1_index, l2_offset, *l2_table;
 481    int l1_bits, c;
 482    unsigned int index_in_cluster, nb_clusters;
 483    uint64_t nb_available, nb_needed;
 484    int ret;
 485
 486    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
 487    nb_needed = *num + index_in_cluster;
 488
 489    l1_bits = s->l2_bits + s->cluster_bits;
 490
 491    /* compute how many bytes there are between the offset and
 492     * the end of the l1 entry
 493     */
 494
 495    nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1));
 496
 497    /* compute the number of available sectors */
 498
 499    nb_available = (nb_available >> 9) + index_in_cluster;
 500
 501    if (nb_needed > nb_available) {
 502        nb_needed = nb_available;
 503    }
 504    assert(nb_needed <= INT_MAX);
 505
 506    *cluster_offset = 0;
 507
 508    /* seek to the l2 offset in the l1 table */
 509
 510    l1_index = offset >> l1_bits;
 511    if (l1_index >= s->l1_size) {
 512        ret = QCOW2_CLUSTER_UNALLOCATED;
 513        goto out;
 514    }
 515
 516    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
 517    if (!l2_offset) {
 518        ret = QCOW2_CLUSTER_UNALLOCATED;
 519        goto out;
 520    }
 521
 522    if (offset_into_cluster(s, l2_offset)) {
 523        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
 524                                " unaligned (L1 index: %#" PRIx64 ")",
 525                                l2_offset, l1_index);
 526        return -EIO;
 527    }
 528
 529    /* load the l2 table in memory */
 530
 531    ret = l2_load(bs, l2_offset, &l2_table);
 532    if (ret < 0) {
 533        return ret;
 534    }
 535
 536    /* find the cluster offset for the given disk offset */
 537
 538    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
 539    *cluster_offset = be64_to_cpu(l2_table[l2_index]);
 540
 541    /* nb_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */
 542    nb_clusters = size_to_clusters(s, nb_needed << 9);
 543
 544    ret = qcow2_get_cluster_type(*cluster_offset);
 545    switch (ret) {
 546    case QCOW2_CLUSTER_COMPRESSED:
 547        /* Compressed clusters can only be processed one by one */
 548        c = 1;
 549        *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
 550        break;
 551    case QCOW2_CLUSTER_ZERO:
 552        if (s->qcow_version < 3) {
 553            qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
 554                                    " in pre-v3 image (L2 offset: %#" PRIx64
 555                                    ", L2 index: %#x)", l2_offset, l2_index);
 556            ret = -EIO;
 557            goto fail;
 558        }
 559        c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
 560                                              QCOW2_CLUSTER_ZERO);
 561        *cluster_offset = 0;
 562        break;
 563    case QCOW2_CLUSTER_UNALLOCATED:
 564        /* how many empty clusters ? */
 565        c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
 566                                              QCOW2_CLUSTER_UNALLOCATED);
 567        *cluster_offset = 0;
 568        break;
 569    case QCOW2_CLUSTER_NORMAL:
 570        /* how many allocated clusters ? */
 571        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
 572                &l2_table[l2_index], QCOW_OFLAG_ZERO);
 573        *cluster_offset &= L2E_OFFSET_MASK;
 574        if (offset_into_cluster(s, *cluster_offset)) {
 575            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset %#"
 576                                    PRIx64 " unaligned (L2 offset: %#" PRIx64
 577                                    ", L2 index: %#x)", *cluster_offset,
 578                                    l2_offset, l2_index);
 579            ret = -EIO;
 580            goto fail;
 581        }
 582        break;
 583    default:
 584        abort();
 585    }
 586
 587    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 588
 589    nb_available = (c * s->cluster_sectors);
 590
 591out:
 592    if (nb_available > nb_needed)
 593        nb_available = nb_needed;
 594
 595    *num = nb_available - index_in_cluster;
 596
 597    return ret;
 598
 599fail:
 600    qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
 601    return ret;
 602}
 603
 604/*
 605 * get_cluster_table
 606 *
 607 * for a given disk offset, load (and allocate if needed)
 608 * the l2 table.
 609 *
 610 * the l2 table offset in the qcow2 file and the cluster index
 611 * in the l2 table are given to the caller.
 612 *
 613 * Returns 0 on success, -errno in failure case
 614 */
 615static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
 616                             uint64_t **new_l2_table,
 617                             int *new_l2_index)
 618{
 619    BDRVQcow2State *s = bs->opaque;
 620    unsigned int l2_index;
 621    uint64_t l1_index, l2_offset;
 622    uint64_t *l2_table = NULL;
 623    int ret;
 624
 625    /* seek to the l2 offset in the l1 table */
 626
 627    l1_index = offset >> (s->l2_bits + s->cluster_bits);
 628    if (l1_index >= s->l1_size) {
 629        ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
 630        if (ret < 0) {
 631            return ret;
 632        }
 633    }
 634
 635    assert(l1_index < s->l1_size);
 636    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
 637    if (offset_into_cluster(s, l2_offset)) {
 638        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
 639                                " unaligned (L1 index: %#" PRIx64 ")",
 640                                l2_offset, l1_index);
 641        return -EIO;
 642    }
 643
 644    /* seek the l2 table of the given l2 offset */
 645
 646    if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
 647        /* load the l2 table in memory */
 648        ret = l2_load(bs, l2_offset, &l2_table);
 649        if (ret < 0) {
 650            return ret;
 651        }
 652    } else {
 653        /* First allocate a new L2 table (and do COW if needed) */
 654        ret = l2_allocate(bs, l1_index, &l2_table);
 655        if (ret < 0) {
 656            return ret;
 657        }
 658
 659        /* Then decrease the refcount of the old table */
 660        if (l2_offset) {
 661            qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
 662                                QCOW2_DISCARD_OTHER);
 663        }
 664    }
 665
 666    /* find the cluster offset for the given disk offset */
 667
 668    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
 669
 670    *new_l2_table = l2_table;
 671    *new_l2_index = l2_index;
 672
 673    return 0;
 674}
 675
 676/*
 677 * alloc_compressed_cluster_offset
 678 *
 679 * For a given offset of the disk image, return cluster offset in
 680 * qcow2 file.
 681 *
 682 * If the offset is not found, allocate a new compressed cluster.
 683 *
 684 * Return the cluster offset if successful,
 685 * Return 0, otherwise.
 686 *
 687 */
 688
 689uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
 690                                               uint64_t offset,
 691                                               int compressed_size)
 692{
 693    BDRVQcow2State *s = bs->opaque;
 694    int l2_index, ret;
 695    uint64_t *l2_table;
 696    int64_t cluster_offset;
 697    int nb_csectors;
 698
 699    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
 700    if (ret < 0) {
 701        return 0;
 702    }
 703
 704    /* Compression can't overwrite anything. Fail if the cluster was already
 705     * allocated. */
 706    cluster_offset = be64_to_cpu(l2_table[l2_index]);
 707    if (cluster_offset & L2E_OFFSET_MASK) {
 708        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 709        return 0;
 710    }
 711
 712    cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
 713    if (cluster_offset < 0) {
 714        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 715        return 0;
 716    }
 717
 718    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
 719                  (cluster_offset >> 9);
 720
 721    cluster_offset |= QCOW_OFLAG_COMPRESSED |
 722                      ((uint64_t)nb_csectors << s->csize_shift);
 723
 724    /* update L2 table */
 725
 726    /* compressed clusters never have the copied flag */
 727
 728    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
 729    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
 730    l2_table[l2_index] = cpu_to_be64(cluster_offset);
 731    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
 732
 733    return cluster_offset;
 734}
 735
 736static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
 737{
 738    BDRVQcow2State *s = bs->opaque;
 739    int ret;
 740
 741    if (r->nb_sectors == 0) {
 742        return 0;
 743    }
 744
 745    qemu_co_mutex_unlock(&s->lock);
 746    ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
 747                       r->offset / BDRV_SECTOR_SIZE,
 748                       r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
 749    qemu_co_mutex_lock(&s->lock);
 750
 751    if (ret < 0) {
 752        return ret;
 753    }
 754
 755    /*
 756     * Before we update the L2 table to actually point to the new cluster, we
 757     * need to be sure that the refcounts have been increased and COW was
 758     * handled.
 759     */
 760    qcow2_cache_depends_on_flush(s->l2_table_cache);
 761
 762    return 0;
 763}
 764
 765int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
 766{
 767    BDRVQcow2State *s = bs->opaque;
 768    int i, j = 0, l2_index, ret;
 769    uint64_t *old_cluster, *l2_table;
 770    uint64_t cluster_offset = m->alloc_offset;
 771
 772    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
 773    assert(m->nb_clusters > 0);
 774
 775    old_cluster = g_try_new(uint64_t, m->nb_clusters);
 776    if (old_cluster == NULL) {
 777        ret = -ENOMEM;
 778        goto err;
 779    }
 780
 781    /* copy content of unmodified sectors */
 782    ret = perform_cow(bs, m, &m->cow_start);
 783    if (ret < 0) {
 784        goto err;
 785    }
 786
 787    ret = perform_cow(bs, m, &m->cow_end);
 788    if (ret < 0) {
 789        goto err;
 790    }
 791
 792    /* Update L2 table. */
 793    if (s->use_lazy_refcounts) {
 794        qcow2_mark_dirty(bs);
 795    }
 796    if (qcow2_need_accurate_refcounts(s)) {
 797        qcow2_cache_set_dependency(bs, s->l2_table_cache,
 798                                   s->refcount_block_cache);
 799    }
 800
 801    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
 802    if (ret < 0) {
 803        goto err;
 804    }
 805    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
 806
 807    assert(l2_index + m->nb_clusters <= s->l2_size);
 808    for (i = 0; i < m->nb_clusters; i++) {
 809        /* if two concurrent writes happen to the same unallocated cluster
 810         * each write allocates separate cluster and writes data concurrently.
 811         * The first one to complete updates l2 table with pointer to its
 812         * cluster the second one has to do RMW (which is done above by
 813         * copy_sectors()), update l2 table with its cluster pointer and free
 814         * old cluster. This is what this loop does */
 815        if(l2_table[l2_index + i] != 0)
 816            old_cluster[j++] = l2_table[l2_index + i];
 817
 818        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
 819                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
 820     }
 821
 822
 823    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
 824
 825    /*
 826     * If this was a COW, we need to decrease the refcount of the old cluster.
 827     *
 828     * Don't discard clusters that reach a refcount of 0 (e.g. compressed
 829     * clusters), the next write will reuse them anyway.
 830     */
 831    if (j != 0) {
 832        for (i = 0; i < j; i++) {
 833            qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
 834                                    QCOW2_DISCARD_NEVER);
 835        }
 836    }
 837
 838    ret = 0;
 839err:
 840    g_free(old_cluster);
 841    return ret;
 842 }
 843
 844/*
 845 * Returns the number of contiguous clusters that can be used for an allocating
 846 * write, but require COW to be performed (this includes yet unallocated space,
 847 * which must copy from the backing file)
 848 */
 849static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters,
 850    uint64_t *l2_table, int l2_index)
 851{
 852    int i;
 853
 854    for (i = 0; i < nb_clusters; i++) {
 855        uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
 856        int cluster_type = qcow2_get_cluster_type(l2_entry);
 857
 858        switch(cluster_type) {
 859        case QCOW2_CLUSTER_NORMAL:
 860            if (l2_entry & QCOW_OFLAG_COPIED) {
 861                goto out;
 862            }
 863            break;
 864        case QCOW2_CLUSTER_UNALLOCATED:
 865        case QCOW2_CLUSTER_COMPRESSED:
 866        case QCOW2_CLUSTER_ZERO:
 867            break;
 868        default:
 869            abort();
 870        }
 871    }
 872
 873out:
 874    assert(i <= nb_clusters);
 875    return i;
 876}
 877
 878/*
 879 * Check if there already is an AIO write request in flight which allocates
 880 * the same cluster. In this case we need to wait until the previous
 881 * request has completed and updated the L2 table accordingly.
 882 *
 883 * Returns:
 884 *   0       if there was no dependency. *cur_bytes indicates the number of
 885 *           bytes from guest_offset that can be read before the next
 886 *           dependency must be processed (or the request is complete)
 887 *
 888 *   -EAGAIN if we had to wait for another request, previously gathered
 889 *           information on cluster allocation may be invalid now. The caller
 890 *           must start over anyway, so consider *cur_bytes undefined.
 891 */
 892static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
 893    uint64_t *cur_bytes, QCowL2Meta **m)
 894{
 895    BDRVQcow2State *s = bs->opaque;
 896    QCowL2Meta *old_alloc;
 897    uint64_t bytes = *cur_bytes;
 898
 899    QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
 900
 901        uint64_t start = guest_offset;
 902        uint64_t end = start + bytes;
 903        uint64_t old_start = l2meta_cow_start(old_alloc);
 904        uint64_t old_end = l2meta_cow_end(old_alloc);
 905
 906        if (end <= old_start || start >= old_end) {
 907            /* No intersection */
 908        } else {
 909            if (start < old_start) {
 910                /* Stop at the start of a running allocation */
 911                bytes = old_start - start;
 912            } else {
 913                bytes = 0;
 914            }
 915
 916            /* Stop if already an l2meta exists. After yielding, it wouldn't
 917             * be valid any more, so we'd have to clean up the old L2Metas
 918             * and deal with requests depending on them before starting to
 919             * gather new ones. Not worth the trouble. */
 920            if (bytes == 0 && *m) {
 921                *cur_bytes = 0;
 922                return 0;
 923            }
 924
 925            if (bytes == 0) {
 926                /* Wait for the dependency to complete. We need to recheck
 927                 * the free/allocated clusters when we continue. */
 928                qemu_co_mutex_unlock(&s->lock);
 929                qemu_co_queue_wait(&old_alloc->dependent_requests);
 930                qemu_co_mutex_lock(&s->lock);
 931                return -EAGAIN;
 932            }
 933        }
 934    }
 935
 936    /* Make sure that existing clusters and new allocations are only used up to
 937     * the next dependency if we shortened the request above */
 938    *cur_bytes = bytes;
 939
 940    return 0;
 941}
 942
 943/*
 944 * Checks how many already allocated clusters that don't require a copy on
 945 * write there are at the given guest_offset (up to *bytes). If
 946 * *host_offset is not zero, only physically contiguous clusters beginning at
 947 * this host offset are counted.
 948 *
 949 * Note that guest_offset may not be cluster aligned. In this case, the
 950 * returned *host_offset points to exact byte referenced by guest_offset and
 951 * therefore isn't cluster aligned as well.
 952 *
 953 * Returns:
 954 *   0:     if no allocated clusters are available at the given offset.
 955 *          *bytes is normally unchanged. It is set to 0 if the cluster
 956 *          is allocated and doesn't need COW, but doesn't have the right
 957 *          physical offset.
 958 *
 959 *   1:     if allocated clusters that don't require a COW are available at
 960 *          the requested offset. *bytes may have decreased and describes
 961 *          the length of the area that can be written to.
 962 *
 963 *  -errno: in error cases
 964 */
 965static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
 966    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
 967{
 968    BDRVQcow2State *s = bs->opaque;
 969    int l2_index;
 970    uint64_t cluster_offset;
 971    uint64_t *l2_table;
 972    uint64_t nb_clusters;
 973    unsigned int keep_clusters;
 974    int ret;
 975
 976    trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
 977                              *bytes);
 978
 979    assert(*host_offset == 0 ||    offset_into_cluster(s, guest_offset)
 980                                == offset_into_cluster(s, *host_offset));
 981
 982    /*
 983     * Calculate the number of clusters to look for. We stop at L2 table
 984     * boundaries to keep things simple.
 985     */
 986    nb_clusters =
 987        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
 988
 989    l2_index = offset_to_l2_index(s, guest_offset);
 990    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
 991    assert(nb_clusters <= INT_MAX);
 992
 993    /* Find L2 entry for the first involved cluster */
 994    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
 995    if (ret < 0) {
 996        return ret;
 997    }
 998
 999    cluster_offset = be64_to_cpu(l2_table[l2_index]);
1000
1001    /* Check how many clusters are already allocated and don't need COW */
1002    if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
1003        && (cluster_offset & QCOW_OFLAG_COPIED))
1004    {
1005        /* If a specific host_offset is required, check it */
1006        bool offset_matches =
1007            (cluster_offset & L2E_OFFSET_MASK) == *host_offset;
1008
1009        if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) {
1010            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
1011                                    "%#llx unaligned (guest offset: %#" PRIx64
1012                                    ")", cluster_offset & L2E_OFFSET_MASK,
1013                                    guest_offset);
1014            ret = -EIO;
1015            goto out;
1016        }
1017
1018        if (*host_offset != 0 && !offset_matches) {
1019            *bytes = 0;
1020            ret = 0;
1021            goto out;
1022        }
1023
1024        /* We keep all QCOW_OFLAG_COPIED clusters */
1025        keep_clusters =
1026            count_contiguous_clusters(nb_clusters, s->cluster_size,
1027                                      &l2_table[l2_index],
1028                                      QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
1029        assert(keep_clusters <= nb_clusters);
1030
1031        *bytes = MIN(*bytes,
1032                 keep_clusters * s->cluster_size
1033                 - offset_into_cluster(s, guest_offset));
1034
1035        ret = 1;
1036    } else {
1037        ret = 0;
1038    }
1039
1040    /* Cleanup */
1041out:
1042    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1043
1044    /* Only return a host offset if we actually made progress. Otherwise we
1045     * would make requirements for handle_alloc() that it can't fulfill */
1046    if (ret > 0) {
1047        *host_offset = (cluster_offset & L2E_OFFSET_MASK)
1048                     + offset_into_cluster(s, guest_offset);
1049    }
1050
1051    return ret;
1052}
1053
1054/*
1055 * Allocates new clusters for the given guest_offset.
1056 *
1057 * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
1058 * contain the number of clusters that have been allocated and are contiguous
1059 * in the image file.
1060 *
1061 * If *host_offset is non-zero, it specifies the offset in the image file at
1062 * which the new clusters must start. *nb_clusters can be 0 on return in this
1063 * case if the cluster at host_offset is already in use. If *host_offset is
1064 * zero, the clusters can be allocated anywhere in the image file.
1065 *
1066 * *host_offset is updated to contain the offset into the image file at which
1067 * the first allocated cluster starts.
1068 *
1069 * Return 0 on success and -errno in error cases. -EAGAIN means that the
1070 * function has been waiting for another request and the allocation must be
1071 * restarted, but the whole request should not be failed.
1072 */
1073static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
1074                                   uint64_t *host_offset, uint64_t *nb_clusters)
1075{
1076    BDRVQcow2State *s = bs->opaque;
1077
1078    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
1079                                         *host_offset, *nb_clusters);
1080
1081    /* Allocate new clusters */
1082    trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
1083    if (*host_offset == 0) {
1084        int64_t cluster_offset =
1085            qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
1086        if (cluster_offset < 0) {
1087            return cluster_offset;
1088        }
1089        *host_offset = cluster_offset;
1090        return 0;
1091    } else {
1092        int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
1093        if (ret < 0) {
1094            return ret;
1095        }
1096        *nb_clusters = ret;
1097        return 0;
1098    }
1099}
1100
1101/*
1102 * Allocates new clusters for an area that either is yet unallocated or needs a
1103 * copy on write. If *host_offset is non-zero, clusters are only allocated if
1104 * the new allocation can match the specified host offset.
1105 *
1106 * Note that guest_offset may not be cluster aligned. In this case, the
1107 * returned *host_offset points to exact byte referenced by guest_offset and
1108 * therefore isn't cluster aligned as well.
1109 *
1110 * Returns:
1111 *   0:     if no clusters could be allocated. *bytes is set to 0,
1112 *          *host_offset is left unchanged.
1113 *
1114 *   1:     if new clusters were allocated. *bytes may be decreased if the
1115 *          new allocation doesn't cover all of the requested area.
1116 *          *host_offset is updated to contain the host offset of the first
1117 *          newly allocated cluster.
1118 *
1119 *  -errno: in error cases
1120 */
1121static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
1122    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
1123{
1124    BDRVQcow2State *s = bs->opaque;
1125    int l2_index;
1126    uint64_t *l2_table;
1127    uint64_t entry;
1128    uint64_t nb_clusters;
1129    int ret;
1130
1131    uint64_t alloc_cluster_offset;
1132
1133    trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
1134                             *bytes);
1135    assert(*bytes > 0);
1136
1137    /*
1138     * Calculate the number of clusters to look for. We stop at L2 table
1139     * boundaries to keep things simple.
1140     */
1141    nb_clusters =
1142        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
1143
1144    l2_index = offset_to_l2_index(s, guest_offset);
1145    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
1146    assert(nb_clusters <= INT_MAX);
1147
1148    /* Find L2 entry for the first involved cluster */
1149    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
1150    if (ret < 0) {
1151        return ret;
1152    }
1153
1154    entry = be64_to_cpu(l2_table[l2_index]);
1155
1156    /* For the moment, overwrite compressed clusters one by one */
1157    if (entry & QCOW_OFLAG_COMPRESSED) {
1158        nb_clusters = 1;
1159    } else {
1160        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
1161    }
1162
1163    /* This function is only called when there were no non-COW clusters, so if
1164     * we can't find any unallocated or COW clusters either, something is
1165     * wrong with our code. */
1166    assert(nb_clusters > 0);
1167
1168    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1169
1170    /* Allocate, if necessary at a given offset in the image file */
1171    alloc_cluster_offset = start_of_cluster(s, *host_offset);
1172    ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
1173                                  &nb_clusters);
1174    if (ret < 0) {
1175        goto fail;
1176    }
1177
1178    /* Can't extend contiguous allocation */
1179    if (nb_clusters == 0) {
1180        *bytes = 0;
1181        return 0;
1182    }
1183
1184    /* !*host_offset would overwrite the image header and is reserved for "no
1185     * host offset preferred". If 0 was a valid host offset, it'd trigger the
1186     * following overlap check; do that now to avoid having an invalid value in
1187     * *host_offset. */
1188    if (!alloc_cluster_offset) {
1189        ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset,
1190                                            nb_clusters * s->cluster_size);
1191        assert(ret < 0);
1192        goto fail;
1193    }
1194
1195    /*
1196     * Save info needed for meta data update.
1197     *
1198     * requested_sectors: Number of sectors from the start of the first
1199     * newly allocated cluster to the end of the (possibly shortened
1200     * before) write request.
1201     *
1202     * avail_sectors: Number of sectors from the start of the first
1203     * newly allocated to the end of the last newly allocated cluster.
1204     *
1205     * nb_sectors: The number of sectors from the start of the first
1206     * newly allocated cluster to the end of the area that the write
1207     * request actually writes to (excluding COW at the end)
1208     */
1209    int requested_sectors =
1210        (*bytes + offset_into_cluster(s, guest_offset))
1211        >> BDRV_SECTOR_BITS;
1212    int avail_sectors = nb_clusters
1213                        << (s->cluster_bits - BDRV_SECTOR_BITS);
1214    int alloc_n_start = offset_into_cluster(s, guest_offset)
1215                        >> BDRV_SECTOR_BITS;
1216    int nb_sectors = MIN(requested_sectors, avail_sectors);
1217    QCowL2Meta *old_m = *m;
1218
1219    *m = g_malloc0(sizeof(**m));
1220
1221    **m = (QCowL2Meta) {
1222        .next           = old_m,
1223
1224        .alloc_offset   = alloc_cluster_offset,
1225        .offset         = start_of_cluster(s, guest_offset),
1226        .nb_clusters    = nb_clusters,
1227        .nb_available   = nb_sectors,
1228
1229        .cow_start = {
1230            .offset     = 0,
1231            .nb_sectors = alloc_n_start,
1232        },
1233        .cow_end = {
1234            .offset     = nb_sectors * BDRV_SECTOR_SIZE,
1235            .nb_sectors = avail_sectors - nb_sectors,
1236        },
1237    };
1238    qemu_co_queue_init(&(*m)->dependent_requests);
1239    QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
1240
1241    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
1242    *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
1243                         - offset_into_cluster(s, guest_offset));
1244    assert(*bytes != 0);
1245
1246    return 1;
1247
1248fail:
1249    if (*m && (*m)->nb_clusters > 0) {
1250        QLIST_REMOVE(*m, next_in_flight);
1251    }
1252    return ret;
1253}
1254
1255/*
1256 * alloc_cluster_offset
1257 *
1258 * For a given offset on the virtual disk, find the cluster offset in qcow2
1259 * file. If the offset is not found, allocate a new cluster.
1260 *
1261 * If the cluster was already allocated, m->nb_clusters is set to 0 and
1262 * other fields in m are meaningless.
1263 *
1264 * If the cluster is newly allocated, m->nb_clusters is set to the number of
1265 * contiguous clusters that have been allocated. In this case, the other
1266 * fields of m are valid and contain information about the first allocated
1267 * cluster.
1268 *
1269 * If the request conflicts with another write request in flight, the coroutine
1270 * is queued and will be reentered when the dependency has completed.
1271 *
1272 * Return 0 on success and -errno in error cases
1273 */
1274int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
1275    int *num, uint64_t *host_offset, QCowL2Meta **m)
1276{
1277    BDRVQcow2State *s = bs->opaque;
1278    uint64_t start, remaining;
1279    uint64_t cluster_offset;
1280    uint64_t cur_bytes;
1281    int ret;
1282
1283    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *num);
1284
1285    assert((offset & ~BDRV_SECTOR_MASK) == 0);
1286
1287again:
1288    start = offset;
1289    remaining = (uint64_t)*num << BDRV_SECTOR_BITS;
1290    cluster_offset = 0;
1291    *host_offset = 0;
1292    cur_bytes = 0;
1293    *m = NULL;
1294
1295    while (true) {
1296
1297        if (!*host_offset) {
1298            *host_offset = start_of_cluster(s, cluster_offset);
1299        }
1300
1301        assert(remaining >= cur_bytes);
1302
1303        start           += cur_bytes;
1304        remaining       -= cur_bytes;
1305        cluster_offset  += cur_bytes;
1306
1307        if (remaining == 0) {
1308            break;
1309        }
1310
1311        cur_bytes = remaining;
1312
1313        /*
1314         * Now start gathering as many contiguous clusters as possible:
1315         *
1316         * 1. Check for overlaps with in-flight allocations
1317         *
1318         *      a) Overlap not in the first cluster -> shorten this request and
1319         *         let the caller handle the rest in its next loop iteration.
1320         *
1321         *      b) Real overlaps of two requests. Yield and restart the search
1322         *         for contiguous clusters (the situation could have changed
1323         *         while we were sleeping)
1324         *
1325         *      c) TODO: Request starts in the same cluster as the in-flight
1326         *         allocation ends. Shorten the COW of the in-fight allocation,
1327         *         set cluster_offset to write to the same cluster and set up
1328         *         the right synchronisation between the in-flight request and
1329         *         the new one.
1330         */
1331        ret = handle_dependencies(bs, start, &cur_bytes, m);
1332        if (ret == -EAGAIN) {
1333            /* Currently handle_dependencies() doesn't yield if we already had
1334             * an allocation. If it did, we would have to clean up the L2Meta
1335             * structs before starting over. */
1336            assert(*m == NULL);
1337            goto again;
1338        } else if (ret < 0) {
1339            return ret;
1340        } else if (cur_bytes == 0) {
1341            break;
1342        } else {
1343            /* handle_dependencies() may have decreased cur_bytes (shortened
1344             * the allocations below) so that the next dependency is processed
1345             * correctly during the next loop iteration. */
1346        }
1347
1348        /*
1349         * 2. Count contiguous COPIED clusters.
1350         */
1351        ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
1352        if (ret < 0) {
1353            return ret;
1354        } else if (ret) {
1355            continue;
1356        } else if (cur_bytes == 0) {
1357            break;
1358        }
1359
1360        /*
1361         * 3. If the request still hasn't completed, allocate new clusters,
1362         *    considering any cluster_offset of steps 1c or 2.
1363         */
1364        ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
1365        if (ret < 0) {
1366            return ret;
1367        } else if (ret) {
1368            continue;
1369        } else {
1370            assert(cur_bytes == 0);
1371            break;
1372        }
1373    }
1374
1375    *num -= remaining >> BDRV_SECTOR_BITS;
1376    assert(*num > 0);
1377    assert(*host_offset != 0);
1378
1379    return 0;
1380}
1381
1382static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
1383                             const uint8_t *buf, int buf_size)
1384{
1385    z_stream strm1, *strm = &strm1;
1386    int ret, out_len;
1387
1388    memset(strm, 0, sizeof(*strm));
1389
1390    strm->next_in = (uint8_t *)buf;
1391    strm->avail_in = buf_size;
1392    strm->next_out = out_buf;
1393    strm->avail_out = out_buf_size;
1394
1395    ret = inflateInit2(strm, -12);
1396    if (ret != Z_OK)
1397        return -1;
1398    ret = inflate(strm, Z_FINISH);
1399    out_len = strm->next_out - out_buf;
1400    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
1401        out_len != out_buf_size) {
1402        inflateEnd(strm);
1403        return -1;
1404    }
1405    inflateEnd(strm);
1406    return 0;
1407}
1408
1409int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
1410{
1411    BDRVQcow2State *s = bs->opaque;
1412    int ret, csize, nb_csectors, sector_offset;
1413    uint64_t coffset;
1414
1415    coffset = cluster_offset & s->cluster_offset_mask;
1416    if (s->cluster_cache_offset != coffset) {
1417        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
1418        sector_offset = coffset & 511;
1419        csize = nb_csectors * 512 - sector_offset;
1420        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
1421        ret = bdrv_read(bs->file->bs, coffset >> 9, s->cluster_data,
1422                        nb_csectors);
1423        if (ret < 0) {
1424            return ret;
1425        }
1426        if (decompress_buffer(s->cluster_cache, s->cluster_size,
1427                              s->cluster_data + sector_offset, csize) < 0) {
1428            return -EIO;
1429        }
1430        s->cluster_cache_offset = coffset;
1431    }
1432    return 0;
1433}
1434
1435/*
1436 * This discards as many clusters of nb_clusters as possible at once (i.e.
1437 * all clusters in the same L2 table) and returns the number of discarded
1438 * clusters.
1439 */
1440static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
1441                             uint64_t nb_clusters, enum qcow2_discard_type type,
1442                             bool full_discard)
1443{
1444    BDRVQcow2State *s = bs->opaque;
1445    uint64_t *l2_table;
1446    int l2_index;
1447    int ret;
1448    int i;
1449
1450    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
1451    if (ret < 0) {
1452        return ret;
1453    }
1454
1455    /* Limit nb_clusters to one L2 table */
1456    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
1457    assert(nb_clusters <= INT_MAX);
1458
1459    for (i = 0; i < nb_clusters; i++) {
1460        uint64_t old_l2_entry;
1461
1462        old_l2_entry = be64_to_cpu(l2_table[l2_index + i]);
1463
1464        /*
1465         * If full_discard is false, make sure that a discarded area reads back
1466         * as zeroes for v3 images (we cannot do it for v2 without actually
1467         * writing a zero-filled buffer). We can skip the operation if the
1468         * cluster is already marked as zero, or if it's unallocated and we
1469         * don't have a backing file.
1470         *
1471         * TODO We might want to use bdrv_get_block_status(bs) here, but we're
1472         * holding s->lock, so that doesn't work today.
1473         *
1474         * If full_discard is true, the sector should not read back as zeroes,
1475         * but rather fall through to the backing file.
1476         */
1477        switch (qcow2_get_cluster_type(old_l2_entry)) {
1478            case QCOW2_CLUSTER_UNALLOCATED:
1479                if (full_discard || !bs->backing) {
1480                    continue;
1481                }
1482                break;
1483
1484            case QCOW2_CLUSTER_ZERO:
1485                if (!full_discard) {
1486                    continue;
1487                }
1488                break;
1489
1490            case QCOW2_CLUSTER_NORMAL:
1491            case QCOW2_CLUSTER_COMPRESSED:
1492                break;
1493
1494            default:
1495                abort();
1496        }
1497
1498        /* First remove L2 entries */
1499        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
1500        if (!full_discard && s->qcow_version >= 3) {
1501            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
1502        } else {
1503            l2_table[l2_index + i] = cpu_to_be64(0);
1504        }
1505
1506        /* Then decrease the refcount */
1507        qcow2_free_any_clusters(bs, old_l2_entry, 1, type);
1508    }
1509
1510    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1511
1512    return nb_clusters;
1513}
1514
1515int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
1516    int nb_sectors, enum qcow2_discard_type type, bool full_discard)
1517{
1518    BDRVQcow2State *s = bs->opaque;
1519    uint64_t end_offset;
1520    uint64_t nb_clusters;
1521    int ret;
1522
1523    end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);
1524
1525    /* Round start up and end down */
1526    offset = align_offset(offset, s->cluster_size);
1527    end_offset = start_of_cluster(s, end_offset);
1528
1529    if (offset > end_offset) {
1530        return 0;
1531    }
1532
1533    nb_clusters = size_to_clusters(s, end_offset - offset);
1534
1535    s->cache_discards = true;
1536
1537    /* Each L2 table is handled by its own loop iteration */
1538    while (nb_clusters > 0) {
1539        ret = discard_single_l2(bs, offset, nb_clusters, type, full_discard);
1540        if (ret < 0) {
1541            goto fail;
1542        }
1543
1544        nb_clusters -= ret;
1545        offset += (ret * s->cluster_size);
1546    }
1547
1548    ret = 0;
1549fail:
1550    s->cache_discards = false;
1551    qcow2_process_discards(bs, ret);
1552
1553    return ret;
1554}
1555
1556/*
1557 * This zeroes as many clusters of nb_clusters as possible at once (i.e.
1558 * all clusters in the same L2 table) and returns the number of zeroed
1559 * clusters.
1560 */
1561static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
1562                          uint64_t nb_clusters)
1563{
1564    BDRVQcow2State *s = bs->opaque;
1565    uint64_t *l2_table;
1566    int l2_index;
1567    int ret;
1568    int i;
1569
1570    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
1571    if (ret < 0) {
1572        return ret;
1573    }
1574
1575    /* Limit nb_clusters to one L2 table */
1576    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
1577    assert(nb_clusters <= INT_MAX);
1578
1579    for (i = 0; i < nb_clusters; i++) {
1580        uint64_t old_offset;
1581
1582        old_offset = be64_to_cpu(l2_table[l2_index + i]);
1583
1584        /* Update L2 entries */
1585        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
1586        if (old_offset & QCOW_OFLAG_COMPRESSED) {
1587            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
1588            qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
1589        } else {
1590            l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
1591        }
1592    }
1593
1594    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1595
1596    return nb_clusters;
1597}
1598
1599int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
1600{
1601    BDRVQcow2State *s = bs->opaque;
1602    uint64_t nb_clusters;
1603    int ret;
1604
1605    /* The zero flag is only supported by version 3 and newer */
1606    if (s->qcow_version < 3) {
1607        return -ENOTSUP;
1608    }
1609
1610    /* Each L2 table is handled by its own loop iteration */
1611    nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS);
1612
1613    s->cache_discards = true;
1614
1615    while (nb_clusters > 0) {
1616        ret = zero_single_l2(bs, offset, nb_clusters);
1617        if (ret < 0) {
1618            goto fail;
1619        }
1620
1621        nb_clusters -= ret;
1622        offset += (ret * s->cluster_size);
1623    }
1624
1625    ret = 0;
1626fail:
1627    s->cache_discards = false;
1628    qcow2_process_discards(bs, ret);
1629
1630    return ret;
1631}
1632
1633/*
1634 * Expands all zero clusters in a specific L1 table (or deallocates them, for
1635 * non-backed non-pre-allocated zero clusters).
1636 *
1637 * l1_entries and *visited_l1_entries are used to keep track of progress for
1638 * status_cb(). l1_entries contains the total number of L1 entries and
1639 * *visited_l1_entries counts all visited L1 entries.
1640 */
1641static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
1642                                      int l1_size, int64_t *visited_l1_entries,
1643                                      int64_t l1_entries,
1644                                      BlockDriverAmendStatusCB *status_cb)
1645{
1646    BDRVQcow2State *s = bs->opaque;
1647    bool is_active_l1 = (l1_table == s->l1_table);
1648    uint64_t *l2_table = NULL;
1649    int ret;
1650    int i, j;
1651
1652    if (!is_active_l1) {
1653        /* inactive L2 tables are not in the metadata cache, so a local buffer
1654         * is needed to read them into from disk */
1655        l2_table = qemu_try_blockalign(bs->file->bs, s->cluster_size);
1656        if (l2_table == NULL) {
1657            return -ENOMEM;
1658        }
1659    }
1660
1661    for (i = 0; i < l1_size; i++) {
1662        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
1663        bool l2_dirty = false;
1664        uint64_t l2_refcount;
1665
1666        if (!l2_offset) {
1667            /* unallocated */
1668            (*visited_l1_entries)++;
1669            if (status_cb) {
1670                status_cb(bs, *visited_l1_entries, l1_entries);
1671            }
1672            continue;
1673        }
1674
1675        if (offset_into_cluster(s, l2_offset)) {
1676            qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
1677                                    PRIx64 " unaligned (L1 index: %#x)",
1678                                    l2_offset, i);
1679            ret = -EIO;
1680            goto fail;
1681        }
1682
1683        if (is_active_l1) {
1684            /* get active L2 tables from cache */
1685            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
1686                    (void **)&l2_table);
1687        } else {
1688            /* load inactive L2 tables from disk */
1689            ret = bdrv_read(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE,
1690                            (void *)l2_table, s->cluster_sectors);
1691        }
1692        if (ret < 0) {
1693            goto fail;
1694        }
1695
1696        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1697                                 &l2_refcount);
1698        if (ret < 0) {
1699            goto fail;
1700        }
1701
1702        for (j = 0; j < s->l2_size; j++) {
1703            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1704            int64_t offset = l2_entry & L2E_OFFSET_MASK;
1705            int cluster_type = qcow2_get_cluster_type(l2_entry);
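                /* A zero cluster that still has a host offset is a preallocated
                 * zero cluster; its allocation is kept when it is expanded. */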
1706            bool preallocated = offset != 0;
1707
1708            if (cluster_type != QCOW2_CLUSTER_ZERO) {
1709                continue;
1710            }
1711
1712            if (!preallocated) {
1713                if (!bs->backing) {
1714                    /* not backed; therefore we can simply deallocate the
1715                     * cluster */
1716                    l2_table[j] = 0;
1717                    l2_dirty = true;
1718                    continue;
1719                }
1720
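                    /* With a backing file the cluster cannot simply be dropped
                     * (the backing data would show through), so allocate a real
                     * cluster and zero it explicitly below. */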
1721                offset = qcow2_alloc_clusters(bs, s->cluster_size);
1722                if (offset < 0) {
1723                    ret = offset;
1724                    goto fail;
1725                }
1726
1727                if (l2_refcount > 1) {
1728                    /* For shared L2 tables, set the refcount accordingly (it is
1729                     * already 1 and needs to be l2_refcount) */
1730                    ret = qcow2_update_cluster_refcount(bs,
1731                            offset >> s->cluster_bits,
1732                            refcount_diff(1, l2_refcount), false,
1733                            QCOW2_DISCARD_OTHER);
1734                    if (ret < 0) {
1735                        qcow2_free_clusters(bs, offset, s->cluster_size,
1736                                            QCOW2_DISCARD_OTHER);
1737                        goto fail;
1738                    }
1739                }
1740            }
1741
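                /* At this point offset refers to the data cluster that will back
                 * the expanded entry (either its preexisting allocation or a
                 * freshly allocated cluster). Check its alignment and metadata
                 * overlaps, and zero it on disk before the L2 entry is updated
                 * to point at it. */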
1742            if (offset_into_cluster(s, offset)) {
1743                qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
1744                                        "%#" PRIx64 " unaligned (L2 offset: %#"
1745                                        PRIx64 ", L2 index: %#x)", offset,
1746                                        l2_offset, j);
1747                if (!preallocated) {
1748                    qcow2_free_clusters(bs, offset, s->cluster_size,
1749                                        QCOW2_DISCARD_ALWAYS);
1750                }
1751                ret = -EIO;
1752                goto fail;
1753            }
1754
1755            ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
1756            if (ret < 0) {
1757                if (!preallocated) {
1758                    qcow2_free_clusters(bs, offset, s->cluster_size,
1759                                        QCOW2_DISCARD_ALWAYS);
1760                }
1761                goto fail;
1762            }
1763
1764            ret = bdrv_write_zeroes(bs->file->bs, offset / BDRV_SECTOR_SIZE,
1765                                    s->cluster_sectors, 0);
1766            if (ret < 0) {
1767                if (!preallocated) {
1768                    qcow2_free_clusters(bs, offset, s->cluster_size,
1769                                        QCOW2_DISCARD_ALWAYS);
1770                }
1771                goto fail;
1772            }
1773
1774            if (l2_refcount == 1) {
1775                l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
1776            } else {
1777                l2_table[j] = cpu_to_be64(offset);
1778            }
1779            l2_dirty = true;
1780        }
1781
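            /* Write the updated L2 table back: active tables go through the
             * metadata cache and are marked dirty (their write-back is deferred
             * until the image file has been flushed), inactive tables are
             * written straight to disk after an overlap check. */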
1782        if (is_active_l1) {
1783            if (l2_dirty) {
1784                qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
1785                qcow2_cache_depends_on_flush(s->l2_table_cache);
1786            }
1787            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1788        } else {
1789            if (l2_dirty) {
1790                ret = qcow2_pre_write_overlap_check(bs,
1791                        QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset,
1792                        s->cluster_size);
1793                if (ret < 0) {
1794                    goto fail;
1795                }
1796
1797                ret = bdrv_write(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE,
1798                                 (void *)l2_table, s->cluster_sectors);
1799                if (ret < 0) {
1800                    goto fail;
1801                }
1802            }
1803        }
1804
1805        (*visited_l1_entries)++;
1806        if (status_cb) {
1807            status_cb(bs, *visited_l1_entries, l1_entries);
1808        }
1809    }
1810
1811    ret = 0;
1812
1813fail:
1814    if (l2_table) {
1815        if (!is_active_l1) {
1816            qemu_vfree(l2_table);
1817        } else {
1818            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1819        }
1820    }
1821    return ret;
1822}
1823
1824/*
1825 * For backed images, expands all zero clusters on the image. For non-backed
1826 * images, deallocates all non-pre-allocated zero clusters (and claims the
1827 * allocation for pre-allocated ones). This is important for downgrading to a
1828 * qcow2 version which doesn't yet support metadata zero clusters.
1829 */
1830int qcow2_expand_zero_clusters(BlockDriverState *bs,
1831                               BlockDriverAmendStatusCB *status_cb)
1832{
1833    BDRVQcow2State *s = bs->opaque;
1834    uint64_t *l1_table = NULL;
1835    int64_t l1_entries = 0, visited_l1_entries = 0;
1836    int ret;
1837    int i, j;
1838
1839    if (status_cb) {
1840        l1_entries = s->l1_size;
1841        for (i = 0; i < s->nb_snapshots; i++) {
1842            l1_entries += s->snapshots[i].l1_size;
1843        }
1844    }
1845
1846    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
1847                                     &visited_l1_entries, l1_entries,
1848                                     status_cb);
1849    if (ret < 0) {
1850        goto fail;
1851    }
1852
1853    /* Inactive L1 tables may point to active L2 tables - therefore it is
1854     * necessary to flush the L2 table cache before trying to access the L2
1855     * tables pointed to by inactive L1 entries (else we might try to expand
1856     * zero clusters that have already been expanded); furthermore, it is also
1857     * necessary to empty the L2 table cache, since it may contain tables which
1858     * are now going to be modified directly on disk, bypassing the cache.
1859     * qcow2_cache_empty() does both for us. */
1860    ret = qcow2_cache_empty(bs, s->l2_table_cache);
1861    if (ret < 0) {
1862        goto fail;
1863    }
1864
1865    for (i = 0; i < s->nb_snapshots; i++) {
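            /* The snapshot L1 table is read with bdrv_read(), so round its size
             * up to whole sectors */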
1866        int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) +
1867                BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE;
1868
1869        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);
1870
1871        ret = bdrv_read(bs->file->bs,
1872                        s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE,
1873                        (void *)l1_table, l1_sectors);
1874        if (ret < 0) {
1875            goto fail;
1876        }
1877
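            /* L1 entries are stored in big-endian byte order on disk */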
1878        for (j = 0; j < s->snapshots[i].l1_size; j++) {
1879            be64_to_cpus(&l1_table[j]);
1880        }
1881
1882        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
1883                                         &visited_l1_entries, l1_entries,
1884                                         status_cb);
1885        if (ret < 0) {
1886            goto fail;
1887        }
1888    }
1889
1890    ret = 0;
1891
1892fail:
1893    g_free(l1_table);
1894    return ret;
1895}
1896