qemu/block/qcow2-cluster.c
   1/*
   2 * Block driver for the QCOW version 2 format
   3 *
   4 * Copyright (c) 2004-2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include <zlib.h>
  27
  28#include "qapi/error.h"
  29#include "qemu-common.h"
  30#include "block/block_int.h"
  31#include "block/qcow2.h"
  32#include "qemu/bswap.h"
  33#include "trace.h"
  34
  35int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
  36                        bool exact_size)
  37{
  38    BDRVQcow2State *s = bs->opaque;
  39    int new_l1_size2, ret, i;
  40    uint64_t *new_l1_table;
  41    int64_t old_l1_table_offset, old_l1_size;
  42    int64_t new_l1_table_offset, new_l1_size;
  43    uint8_t data[12];
  44
  45    if (min_size <= s->l1_size)
  46        return 0;
  47
  48    /* Do a sanity check on min_size before trying to calculate new_l1_size
  49     * (this prevents overflows during the while loop for the calculation of
  50     * new_l1_size) */
  51    if (min_size > INT_MAX / sizeof(uint64_t)) {
  52        return -EFBIG;
  53    }
  54
  55    if (exact_size) {
  56        new_l1_size = min_size;
  57    } else {
  58        /* Bump size up to reduce the number of times we have to grow */
  59        new_l1_size = s->l1_size;
  60        if (new_l1_size == 0) {
  61            new_l1_size = 1;
  62        }
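             /* Each pass grows the size by roughly 1.5x, e.g. starting from 1:
              * 1 -> 2 -> 3 -> 5 -> 8 -> 12 -> 18 -> 27 -> ... */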
  63        while (min_size > new_l1_size) {
  64            new_l1_size = (new_l1_size * 3 + 1) / 2;
  65        }
  66    }
  67
  68    if (new_l1_size > INT_MAX / sizeof(uint64_t)) {
  69        return -EFBIG;
  70    }
  71
  72#ifdef DEBUG_ALLOC2
  73    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
  74            s->l1_size, new_l1_size);
  75#endif
  76
  77    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
  78    new_l1_table = qemu_try_blockalign(bs->file->bs,
  79                                       align_offset(new_l1_size2, 512));
  80    if (new_l1_table == NULL) {
  81        return -ENOMEM;
  82    }
  83    memset(new_l1_table, 0, align_offset(new_l1_size2, 512));
  84
  85    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
  86
  87    /* write new table (align to cluster) */
  88    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
  89    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
  90    if (new_l1_table_offset < 0) {
  91        qemu_vfree(new_l1_table);
  92        return new_l1_table_offset;
  93    }
  94
  95    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
  96    if (ret < 0) {
  97        goto fail;
  98    }
  99
 100    /* the L1 position has not yet been updated, so these clusters must
 101     * indeed be completely free */
 102    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
 103                                        new_l1_size2);
 104    if (ret < 0) {
 105        goto fail;
 106    }
 107
 108    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
 109    for(i = 0; i < s->l1_size; i++)
 110        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
 111    ret = bdrv_pwrite_sync(bs->file->bs, new_l1_table_offset,
 112                           new_l1_table, new_l1_size2);
 113    if (ret < 0)
 114        goto fail;
 115    for(i = 0; i < s->l1_size; i++)
 116        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
 117
 118    /* set new table */
 119    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
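         /* data[] holds the new 32-bit l1_size followed by the new 64-bit
          * l1_table_offset, both big-endian; the two fields are adjacent in
          * the header, so a single write updates both */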
 120    cpu_to_be32w((uint32_t*)data, new_l1_size);
 121    stq_be_p(data + 4, new_l1_table_offset);
 122    ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, l1_size),
 123                           data, sizeof(data));
 124    if (ret < 0) {
 125        goto fail;
 126    }
 127    qemu_vfree(s->l1_table);
 128    old_l1_table_offset = s->l1_table_offset;
 129    s->l1_table_offset = new_l1_table_offset;
 130    s->l1_table = new_l1_table;
 131    old_l1_size = s->l1_size;
 132    s->l1_size = new_l1_size;
 133    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t),
 134                        QCOW2_DISCARD_OTHER);
 135    return 0;
 136 fail:
 137    qemu_vfree(new_l1_table);
 138    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
 139                        QCOW2_DISCARD_OTHER);
 140    return ret;
 141}
 142
 143/*
 144 * l2_load
 145 *
  146 * Loads an L2 table into memory. If the table is in the cache, the cache
  147 * is used; otherwise the L2 table is loaded from the image file.
  148 *
  149 * Returns 0 on success, -errno on failure. On success, *l2_table points
  150 * to the loaded L2 table.
 151 */
 152
 153static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
 154    uint64_t **l2_table)
 155{
 156    BDRVQcow2State *s = bs->opaque;
 157    int ret;
 158
 159    ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table);
 160
 161    return ret;
 162}
 163
 164/*
  165 * Writes one sector of the L1 table to the disk (we can't update single
  166 * entries, and we really don't want the write to turn into a read-modify-write)
 167 */
 168#define L1_ENTRIES_PER_SECTOR (512 / 8)
 169int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
 170{
 171    BDRVQcow2State *s = bs->opaque;
 172    uint64_t buf[L1_ENTRIES_PER_SECTOR] = { 0 };
 173    int l1_start_index;
 174    int i, ret;
 175
 176    l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
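         /* Round down to the first L1 entry of the 512-byte sector that
          * contains l1_index */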
 177    for (i = 0; i < L1_ENTRIES_PER_SECTOR && l1_start_index + i < s->l1_size;
 178         i++)
 179    {
 180        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
 181    }
 182
 183    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1,
 184            s->l1_table_offset + 8 * l1_start_index, sizeof(buf));
 185    if (ret < 0) {
 186        return ret;
 187    }
 188
 189    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
 190    ret = bdrv_pwrite_sync(bs->file->bs,
 191                           s->l1_table_offset + 8 * l1_start_index,
 192                           buf, sizeof(buf));
 193    if (ret < 0) {
 194        return ret;
 195    }
 196
 197    return 0;
 198}
 199
 200/*
 201 * l2_allocate
 202 *
  203 * Allocate a new L2 table in the file. If the L1 entry at l1_index already
  204 * points to an L2 table (i.e. we are doing copy-on-write for the L2 table),
  205 * copy the contents of the old L2 table into the newly allocated one.
 206 * Otherwise the new table is initialized with zeros.
 207 *
 208 */
 209
 210static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
 211{
 212    BDRVQcow2State *s = bs->opaque;
 213    uint64_t old_l2_offset;
 214    uint64_t *l2_table = NULL;
 215    int64_t l2_offset;
 216    int ret;
 217
 218    old_l2_offset = s->l1_table[l1_index];
 219
 220    trace_qcow2_l2_allocate(bs, l1_index);
 221
 222    /* allocate a new l2 entry */
 223
 224    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
 225    if (l2_offset < 0) {
 226        ret = l2_offset;
 227        goto fail;
 228    }
 229
 230    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
 231    if (ret < 0) {
 232        goto fail;
 233    }
 234
 235    /* allocate a new entry in the l2 cache */
 236
 237    trace_qcow2_l2_allocate_get_empty(bs, l1_index);
 238    ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
 239    if (ret < 0) {
 240        goto fail;
 241    }
 242
 243    l2_table = *table;
 244
 245    if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
 246        /* if there was no old l2 table, clear the new table */
 247        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
 248    } else {
 249        uint64_t* old_table;
 250
 251        /* if there was an old l2 table, read it from the disk */
 252        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
 253        ret = qcow2_cache_get(bs, s->l2_table_cache,
 254            old_l2_offset & L1E_OFFSET_MASK,
 255            (void**) &old_table);
 256        if (ret < 0) {
 257            goto fail;
 258        }
 259
 260        memcpy(l2_table, old_table, s->cluster_size);
 261
 262        qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table);
 263    }
 264
 265    /* write the l2 table to the file */
 266    BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
 267
 268    trace_qcow2_l2_allocate_write_l2(bs, l1_index);
 269    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
 270    ret = qcow2_cache_flush(bs, s->l2_table_cache);
 271    if (ret < 0) {
 272        goto fail;
 273    }
 274
 275    /* update the L1 entry */
 276    trace_qcow2_l2_allocate_write_l1(bs, l1_index);
 277    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
 278    ret = qcow2_write_l1_entry(bs, l1_index);
 279    if (ret < 0) {
 280        goto fail;
 281    }
 282
 283    *table = l2_table;
 284    trace_qcow2_l2_allocate_done(bs, l1_index, 0);
 285    return 0;
 286
 287fail:
 288    trace_qcow2_l2_allocate_done(bs, l1_index, ret);
 289    if (l2_table != NULL) {
 290        qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
 291    }
 292    s->l1_table[l1_index] = old_l2_offset;
 293    if (l2_offset > 0) {
 294        qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
 295                            QCOW2_DISCARD_ALWAYS);
 296    }
 297    return ret;
 298}
 299
 300/*
 301 * Checks how many clusters in a given L2 table are contiguous in the image
 302 * file. As soon as one of the flags in the bitmask stop_flags changes compared
 303 * to the first cluster, the search is stopped and the cluster is not counted
 304 * as contiguous. (This allows it, for example, to stop at the first compressed
  305 * cluster, which may require different handling.)
 306 */
 307static int count_contiguous_clusters(int nb_clusters, int cluster_size,
 308        uint64_t *l2_table, uint64_t stop_flags)
 309{
 310    int i;
 311    uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED;
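         /* The mask covers the host offset bits, the caller's stop_flags and
          * the compressed flag; any difference in these ends the run below */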
 312    uint64_t first_entry = be64_to_cpu(l2_table[0]);
 313    uint64_t offset = first_entry & mask;
 314
 315    if (!offset)
 316        return 0;
 317
 318    assert(qcow2_get_cluster_type(first_entry) == QCOW2_CLUSTER_NORMAL);
 319
 320    for (i = 0; i < nb_clusters; i++) {
 321        uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
 322        if (offset + (uint64_t) i * cluster_size != l2_entry) {
 323            break;
 324        }
 325    }
 326
  327    return i;
 328}
 329
 330static int count_contiguous_clusters_by_type(int nb_clusters,
 331                                             uint64_t *l2_table,
 332                                             int wanted_type)
 333{
 334    int i;
 335
 336    for (i = 0; i < nb_clusters; i++) {
 337        int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i]));
 338
 339        if (type != wanted_type) {
 340            break;
 341        }
 342    }
 343
 344    return i;
 345}
 346
  347/* The crypt function is compatible with the Linux cryptoloop
 348   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
 349   supported */
 350int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
 351                          uint8_t *out_buf, const uint8_t *in_buf,
 352                          int nb_sectors, bool enc,
 353                          Error **errp)
 354{
 355    union {
 356        uint64_t ll[2];
 357        uint8_t b[16];
 358    } ivec;
 359    int i;
 360    int ret;
 361
 362    for(i = 0; i < nb_sectors; i++) {
 363        ivec.ll[0] = cpu_to_le64(sector_num);
 364        ivec.ll[1] = 0;
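             /* The 16-byte IV is the little-endian sector number padded with
              * zeroes, as in the Linux cryptoloop scheme mentioned above */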
 365        if (qcrypto_cipher_setiv(s->cipher,
 366                                 ivec.b, G_N_ELEMENTS(ivec.b),
 367                                 errp) < 0) {
 368            return -1;
 369        }
 370        if (enc) {
 371            ret = qcrypto_cipher_encrypt(s->cipher,
 372                                         in_buf,
 373                                         out_buf,
 374                                         512,
 375                                         errp);
 376        } else {
 377            ret = qcrypto_cipher_decrypt(s->cipher,
 378                                         in_buf,
 379                                         out_buf,
 380                                         512,
 381                                         errp);
 382        }
 383        if (ret < 0) {
 384            return -1;
 385        }
 386        sector_num++;
 387        in_buf += 512;
 388        out_buf += 512;
 389    }
 390    return 0;
 391}
 392
 393static int coroutine_fn copy_sectors(BlockDriverState *bs,
 394                                     uint64_t start_sect,
 395                                     uint64_t cluster_offset,
 396                                     int n_start, int n_end)
 397{
 398    BDRVQcow2State *s = bs->opaque;
 399    QEMUIOVector qiov;
 400    struct iovec iov;
 401    int n, ret;
 402
 403    n = n_end - n_start;
 404    if (n <= 0) {
 405        return 0;
 406    }
 407
 408    iov.iov_len = n * BDRV_SECTOR_SIZE;
 409    iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
 410    if (iov.iov_base == NULL) {
 411        return -ENOMEM;
 412    }
 413
 414    qemu_iovec_init_external(&qiov, &iov, 1);
 415
 416    BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
 417
 418    if (!bs->drv) {
 419        ret = -ENOMEDIUM;
 420        goto out;
 421    }
 422
 423    /* Call .bdrv_co_readv() directly instead of using the public block-layer
 424     * interface.  This avoids double I/O throttling and request tracking,
 425     * which can lead to deadlock when block layer copy-on-read is enabled.
 426     */
 427    ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov);
 428    if (ret < 0) {
 429        goto out;
 430    }
 431
 432    if (bs->encrypted) {
 433        Error *err = NULL;
 434        assert(s->cipher);
 435        if (qcow2_encrypt_sectors(s, start_sect + n_start,
 436                                  iov.iov_base, iov.iov_base, n,
 437                                  true, &err) < 0) {
 438            ret = -EIO;
 439            error_free(err);
 440            goto out;
 441        }
 442    }
 443
 444    ret = qcow2_pre_write_overlap_check(bs, 0,
 445            cluster_offset + n_start * BDRV_SECTOR_SIZE, n * BDRV_SECTOR_SIZE);
 446    if (ret < 0) {
 447        goto out;
 448    }
 449
 450    BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
 451    ret = bdrv_co_writev(bs->file->bs, (cluster_offset >> 9) + n_start, n,
 452                         &qiov);
 453    if (ret < 0) {
 454        goto out;
 455    }
 456
 457    ret = 0;
 458out:
 459    qemu_vfree(iov.iov_base);
 460    return ret;
 461}
 462
 463
 464/*
 465 * get_cluster_offset
 466 *
 467 * For a given offset of the disk image, find the cluster offset in
  468 * the qcow2 file. The offset is stored in *cluster_offset.
 469 *
 470 * on entry, *num is the number of contiguous sectors we'd like to
 471 * access following offset.
 472 *
 473 * on exit, *num is the number of contiguous sectors we can read.
 474 *
 475 * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
 476 * cases.
 477 */
 478int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
 479    int *num, uint64_t *cluster_offset)
 480{
 481    BDRVQcow2State *s = bs->opaque;
 482    unsigned int l2_index;
 483    uint64_t l1_index, l2_offset, *l2_table;
 484    int l1_bits, c;
 485    unsigned int index_in_cluster, nb_clusters;
 486    uint64_t nb_available, nb_needed;
 487    int ret;
 488
 489    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
 490    nb_needed = *num + index_in_cluster;
 491
 492    l1_bits = s->l2_bits + s->cluster_bits;
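         /* e.g. with 64 KiB clusters: cluster_bits = 16 and l2_bits = 13
          * (8192 L2 entries), so one L1 entry covers 2^29 bytes = 512 MiB */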
 493
 494    /* compute how many bytes there are between the offset and
 495     * the end of the l1 entry
 496     */
 497
 498    nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1));
 499
 500    /* compute the number of available sectors */
 501
 502    nb_available = (nb_available >> 9) + index_in_cluster;
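         /* nb_available now counts sectors from the start of the first cluster
          * up to the end of the area covered by this L1 entry */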
 503
 504    if (nb_needed > nb_available) {
 505        nb_needed = nb_available;
 506    }
 507    assert(nb_needed <= INT_MAX);
 508
 509    *cluster_offset = 0;
 510
 511    /* seek to the l2 offset in the l1 table */
 512
 513    l1_index = offset >> l1_bits;
 514    if (l1_index >= s->l1_size) {
 515        ret = QCOW2_CLUSTER_UNALLOCATED;
 516        goto out;
 517    }
 518
 519    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
 520    if (!l2_offset) {
 521        ret = QCOW2_CLUSTER_UNALLOCATED;
 522        goto out;
 523    }
 524
 525    if (offset_into_cluster(s, l2_offset)) {
 526        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
 527                                " unaligned (L1 index: %#" PRIx64 ")",
 528                                l2_offset, l1_index);
 529        return -EIO;
 530    }
 531
 532    /* load the l2 table in memory */
 533
 534    ret = l2_load(bs, l2_offset, &l2_table);
 535    if (ret < 0) {
 536        return ret;
 537    }
 538
 539    /* find the cluster offset for the given disk offset */
 540
 541    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
 542    *cluster_offset = be64_to_cpu(l2_table[l2_index]);
 543
 544    /* nb_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */
 545    nb_clusters = size_to_clusters(s, nb_needed << 9);
 546
 547    ret = qcow2_get_cluster_type(*cluster_offset);
 548    switch (ret) {
 549    case QCOW2_CLUSTER_COMPRESSED:
 550        /* Compressed clusters can only be processed one by one */
 551        c = 1;
 552        *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
 553        break;
 554    case QCOW2_CLUSTER_ZERO:
 555        if (s->qcow_version < 3) {
 556            qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
 557                                    " in pre-v3 image (L2 offset: %#" PRIx64
 558                                    ", L2 index: %#x)", l2_offset, l2_index);
 559            ret = -EIO;
 560            goto fail;
 561        }
 562        c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
 563                                              QCOW2_CLUSTER_ZERO);
 564        *cluster_offset = 0;
 565        break;
 566    case QCOW2_CLUSTER_UNALLOCATED:
 567        /* how many empty clusters ? */
 568        c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
 569                                              QCOW2_CLUSTER_UNALLOCATED);
 570        *cluster_offset = 0;
 571        break;
 572    case QCOW2_CLUSTER_NORMAL:
 573        /* how many allocated clusters ? */
 574        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
 575                &l2_table[l2_index], QCOW_OFLAG_ZERO);
 576        *cluster_offset &= L2E_OFFSET_MASK;
 577        if (offset_into_cluster(s, *cluster_offset)) {
 578            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset %#"
 579                                    PRIx64 " unaligned (L2 offset: %#" PRIx64
 580                                    ", L2 index: %#x)", *cluster_offset,
 581                                    l2_offset, l2_index);
 582            ret = -EIO;
 583            goto fail;
 584        }
 585        break;
 586    default:
 587        abort();
 588    }
 589
 590    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 591
 592    nb_available = (c * s->cluster_sectors);
 593
 594out:
 595    if (nb_available > nb_needed)
 596        nb_available = nb_needed;
 597
 598    *num = nb_available - index_in_cluster;
 599
 600    return ret;
 601
 602fail:
 603    qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
 604    return ret;
 605}
 606
 607/*
 608 * get_cluster_table
 609 *
 610 * for a given disk offset, load (and allocate if needed)
 611 * the l2 table.
 612 *
 613 * the l2 table offset in the qcow2 file and the cluster index
 614 * in the l2 table are given to the caller.
 615 *
 616 * Returns 0 on success, -errno in failure case
 617 */
 618static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
 619                             uint64_t **new_l2_table,
 620                             int *new_l2_index)
 621{
 622    BDRVQcow2State *s = bs->opaque;
 623    unsigned int l2_index;
 624    uint64_t l1_index, l2_offset;
 625    uint64_t *l2_table = NULL;
 626    int ret;
 627
 628    /* seek to the l2 offset in the l1 table */
 629
 630    l1_index = offset >> (s->l2_bits + s->cluster_bits);
 631    if (l1_index >= s->l1_size) {
 632        ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
 633        if (ret < 0) {
 634            return ret;
 635        }
 636    }
 637
 638    assert(l1_index < s->l1_size);
 639    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
 640    if (offset_into_cluster(s, l2_offset)) {
 641        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
 642                                " unaligned (L1 index: %#" PRIx64 ")",
 643                                l2_offset, l1_index);
 644        return -EIO;
 645    }
 646
 647    /* seek the l2 table of the given l2 offset */
 648
 649    if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
 650        /* load the l2 table in memory */
 651        ret = l2_load(bs, l2_offset, &l2_table);
 652        if (ret < 0) {
 653            return ret;
 654        }
 655    } else {
 656        /* First allocate a new L2 table (and do COW if needed) */
 657        ret = l2_allocate(bs, l1_index, &l2_table);
 658        if (ret < 0) {
 659            return ret;
 660        }
 661
 662        /* Then decrease the refcount of the old table */
 663        if (l2_offset) {
 664            qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
 665                                QCOW2_DISCARD_OTHER);
 666        }
 667    }
 668
 669    /* find the cluster offset for the given disk offset */
 670
 671    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
 672
 673    *new_l2_table = l2_table;
 674    *new_l2_index = l2_index;
 675
 676    return 0;
 677}
 678
 679/*
 680 * alloc_compressed_cluster_offset
 681 *
  682 * For a given offset of the disk image, return the cluster offset in the
  683 * qcow2 file.
  684 *
  685 * If the offset is not found, allocate a new compressed cluster.
  686 *
  687 * Returns the cluster offset if successful,
  688 * and 0 otherwise.
 689 *
 690 */
 691
 692uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
 693                                               uint64_t offset,
 694                                               int compressed_size)
 695{
 696    BDRVQcow2State *s = bs->opaque;
 697    int l2_index, ret;
 698    uint64_t *l2_table;
 699    int64_t cluster_offset;
 700    int nb_csectors;
 701
 702    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
 703    if (ret < 0) {
 704        return 0;
 705    }
 706
 707    /* Compression can't overwrite anything. Fail if the cluster was already
 708     * allocated. */
 709    cluster_offset = be64_to_cpu(l2_table[l2_index]);
 710    if (cluster_offset & L2E_OFFSET_MASK) {
 711        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 712        return 0;
 713    }
 714
 715    cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
 716    if (cluster_offset < 0) {
 717        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 718        return 0;
 719    }
 720
 721    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
 722                  (cluster_offset >> 9);
 723
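         /* A compressed cluster descriptor stores the host byte offset in its
          * low bits and, above csize_shift, the number of additional 512-byte
          * sectors occupied by the compressed data */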
 724    cluster_offset |= QCOW_OFLAG_COMPRESSED |
 725                      ((uint64_t)nb_csectors << s->csize_shift);
 726
 727    /* update L2 table */
 728
 729    /* compressed clusters never have the copied flag */
 730
 731    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
 732    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
 733    l2_table[l2_index] = cpu_to_be64(cluster_offset);
 734    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
 735
 736    return cluster_offset;
 737}
 738
 739static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
 740{
 741    BDRVQcow2State *s = bs->opaque;
 742    int ret;
 743
 744    if (r->nb_sectors == 0) {
 745        return 0;
 746    }
 747
 748    qemu_co_mutex_unlock(&s->lock);
 749    ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
 750                       r->offset / BDRV_SECTOR_SIZE,
 751                       r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
 752    qemu_co_mutex_lock(&s->lock);
 753
 754    if (ret < 0) {
 755        return ret;
 756    }
 757
 758    /*
 759     * Before we update the L2 table to actually point to the new cluster, we
 760     * need to be sure that the refcounts have been increased and COW was
 761     * handled.
 762     */
 763    qcow2_cache_depends_on_flush(s->l2_table_cache);
 764
 765    return 0;
 766}
 767
 768int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
 769{
 770    BDRVQcow2State *s = bs->opaque;
 771    int i, j = 0, l2_index, ret;
 772    uint64_t *old_cluster, *l2_table;
 773    uint64_t cluster_offset = m->alloc_offset;
 774
 775    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
 776    assert(m->nb_clusters > 0);
 777
 778    old_cluster = g_try_new(uint64_t, m->nb_clusters);
 779    if (old_cluster == NULL) {
 780        ret = -ENOMEM;
 781        goto err;
 782    }
 783
 784    /* copy content of unmodified sectors */
 785    ret = perform_cow(bs, m, &m->cow_start);
 786    if (ret < 0) {
 787        goto err;
 788    }
 789
 790    ret = perform_cow(bs, m, &m->cow_end);
 791    if (ret < 0) {
 792        goto err;
 793    }
 794
 795    /* Update L2 table. */
 796    if (s->use_lazy_refcounts) {
 797        qcow2_mark_dirty(bs);
 798    }
 799    if (qcow2_need_accurate_refcounts(s)) {
 800        qcow2_cache_set_dependency(bs, s->l2_table_cache,
 801                                   s->refcount_block_cache);
 802    }
 803
 804    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
 805    if (ret < 0) {
 806        goto err;
 807    }
 808    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
 809
 810    assert(l2_index + m->nb_clusters <= s->l2_size);
 811    for (i = 0; i < m->nb_clusters; i++) {
  812        /* If two concurrent writes happen to the same unallocated cluster,
  813         * each write allocates a separate cluster and writes data concurrently.
  814         * The first one to complete updates the L2 table with a pointer to its
  815         * cluster; the second one has to do RMW (which is done above by
  816         * copy_sectors()), update the L2 table with its cluster pointer and free
  817         * the old cluster. This is what this loop does */
 818        if(l2_table[l2_index + i] != 0)
 819            old_cluster[j++] = l2_table[l2_index + i];
 820
 821        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
 822                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
  823    }
 824
 825
 826    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
 827
 828    /*
 829     * If this was a COW, we need to decrease the refcount of the old cluster.
 830     *
 831     * Don't discard clusters that reach a refcount of 0 (e.g. compressed
 832     * clusters), the next write will reuse them anyway.
 833     */
 834    if (j != 0) {
 835        for (i = 0; i < j; i++) {
 836            qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
 837                                    QCOW2_DISCARD_NEVER);
 838        }
 839    }
 840
 841    ret = 0;
 842err:
 843    g_free(old_cluster);
 844    return ret;
  845}
 846
 847/*
 848 * Returns the number of contiguous clusters that can be used for an allocating
 849 * write, but require COW to be performed (this includes yet unallocated space,
  850 * which must be copied from the backing file)
 851 */
 852static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters,
 853    uint64_t *l2_table, int l2_index)
 854{
 855    int i;
 856
 857    for (i = 0; i < nb_clusters; i++) {
 858        uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
 859        int cluster_type = qcow2_get_cluster_type(l2_entry);
 860
 861        switch(cluster_type) {
 862        case QCOW2_CLUSTER_NORMAL:
 863            if (l2_entry & QCOW_OFLAG_COPIED) {
 864                goto out;
 865            }
 866            break;
 867        case QCOW2_CLUSTER_UNALLOCATED:
 868        case QCOW2_CLUSTER_COMPRESSED:
 869        case QCOW2_CLUSTER_ZERO:
 870            break;
 871        default:
 872            abort();
 873        }
 874    }
 875
 876out:
 877    assert(i <= nb_clusters);
 878    return i;
 879}
 880
 881/*
 882 * Check if there already is an AIO write request in flight which allocates
 883 * the same cluster. In this case we need to wait until the previous
 884 * request has completed and updated the L2 table accordingly.
 885 *
 886 * Returns:
 887 *   0       if there was no dependency. *cur_bytes indicates the number of
 888 *           bytes from guest_offset that can be read before the next
 889 *           dependency must be processed (or the request is complete)
 890 *
 891 *   -EAGAIN if we had to wait for another request, previously gathered
 892 *           information on cluster allocation may be invalid now. The caller
 893 *           must start over anyway, so consider *cur_bytes undefined.
 894 */
 895static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
 896    uint64_t *cur_bytes, QCowL2Meta **m)
 897{
 898    BDRVQcow2State *s = bs->opaque;
 899    QCowL2Meta *old_alloc;
 900    uint64_t bytes = *cur_bytes;
 901
 902    QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
 903
 904        uint64_t start = guest_offset;
 905        uint64_t end = start + bytes;
 906        uint64_t old_start = l2meta_cow_start(old_alloc);
 907        uint64_t old_end = l2meta_cow_end(old_alloc);
 908
 909        if (end <= old_start || start >= old_end) {
 910            /* No intersection */
 911        } else {
 912            if (start < old_start) {
 913                /* Stop at the start of a running allocation */
 914                bytes = old_start - start;
 915            } else {
 916                bytes = 0;
 917            }
 918
  919            /* Stop if an l2meta already exists. After yielding, it wouldn't
 920             * be valid any more, so we'd have to clean up the old L2Metas
 921             * and deal with requests depending on them before starting to
 922             * gather new ones. Not worth the trouble. */
 923            if (bytes == 0 && *m) {
 924                *cur_bytes = 0;
 925                return 0;
 926            }
 927
 928            if (bytes == 0) {
 929                /* Wait for the dependency to complete. We need to recheck
 930                 * the free/allocated clusters when we continue. */
 931                qemu_co_mutex_unlock(&s->lock);
 932                qemu_co_queue_wait(&old_alloc->dependent_requests);
 933                qemu_co_mutex_lock(&s->lock);
 934                return -EAGAIN;
 935            }
 936        }
 937    }
 938
 939    /* Make sure that existing clusters and new allocations are only used up to
 940     * the next dependency if we shortened the request above */
 941    *cur_bytes = bytes;
 942
 943    return 0;
 944}
 945
 946/*
  947 * Checks how many clusters that are already allocated and don't require a
  948 * copy on write there are at the given guest_offset (up to *bytes). If
 949 * *host_offset is not zero, only physically contiguous clusters beginning at
 950 * this host offset are counted.
 951 *
 952 * Note that guest_offset may not be cluster aligned. In this case, the
  953 * returned *host_offset points to the exact byte referenced by guest_offset
  954 * and therefore isn't cluster aligned either.
 955 *
 956 * Returns:
 957 *   0:     if no allocated clusters are available at the given offset.
 958 *          *bytes is normally unchanged. It is set to 0 if the cluster
 959 *          is allocated and doesn't need COW, but doesn't have the right
 960 *          physical offset.
 961 *
 962 *   1:     if allocated clusters that don't require a COW are available at
 963 *          the requested offset. *bytes may have decreased and describes
 964 *          the length of the area that can be written to.
 965 *
 966 *  -errno: in error cases
 967 */
 968static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
 969    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
 970{
 971    BDRVQcow2State *s = bs->opaque;
 972    int l2_index;
 973    uint64_t cluster_offset;
 974    uint64_t *l2_table;
 975    uint64_t nb_clusters;
 976    unsigned int keep_clusters;
 977    int ret;
 978
 979    trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
 980                              *bytes);
 981
 982    assert(*host_offset == 0 ||    offset_into_cluster(s, guest_offset)
 983                                == offset_into_cluster(s, *host_offset));
 984
 985    /*
 986     * Calculate the number of clusters to look for. We stop at L2 table
 987     * boundaries to keep things simple.
 988     */
 989    nb_clusters =
 990        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
 991
 992    l2_index = offset_to_l2_index(s, guest_offset);
 993    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
 994    assert(nb_clusters <= INT_MAX);
 995
 996    /* Find L2 entry for the first involved cluster */
 997    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
 998    if (ret < 0) {
 999        return ret;
1000    }
1001
1002    cluster_offset = be64_to_cpu(l2_table[l2_index]);
1003
1004    /* Check how many clusters are already allocated and don't need COW */
1005    if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
1006        && (cluster_offset & QCOW_OFLAG_COPIED))
1007    {
1008        /* If a specific host_offset is required, check it */
1009        bool offset_matches =
1010            (cluster_offset & L2E_OFFSET_MASK) == *host_offset;
1011
1012        if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) {
1013            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
 1014                                    "%#" PRIx64 " unaligned (guest offset: %#" PRIx64
1015                                    ")", cluster_offset & L2E_OFFSET_MASK,
1016                                    guest_offset);
1017            ret = -EIO;
1018            goto out;
1019        }
1020
1021        if (*host_offset != 0 && !offset_matches) {
1022            *bytes = 0;
1023            ret = 0;
1024            goto out;
1025        }
1026
1027        /* We keep all QCOW_OFLAG_COPIED clusters */
1028        keep_clusters =
1029            count_contiguous_clusters(nb_clusters, s->cluster_size,
1030                                      &l2_table[l2_index],
1031                                      QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
1032        assert(keep_clusters <= nb_clusters);
1033
1034        *bytes = MIN(*bytes,
1035                 keep_clusters * s->cluster_size
1036                 - offset_into_cluster(s, guest_offset));
1037
1038        ret = 1;
1039    } else {
1040        ret = 0;
1041    }
1042
1043    /* Cleanup */
1044out:
1045    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1046
1047    /* Only return a host offset if we actually made progress. Otherwise we
1048     * would make requirements for handle_alloc() that it can't fulfill */
1049    if (ret > 0) {
1050        *host_offset = (cluster_offset & L2E_OFFSET_MASK)
1051                     + offset_into_cluster(s, guest_offset);
1052    }
1053
1054    return ret;
1055}
1056
1057/*
1058 * Allocates new clusters for the given guest_offset.
1059 *
1060 * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
1061 * contain the number of clusters that have been allocated and are contiguous
1062 * in the image file.
1063 *
1064 * If *host_offset is non-zero, it specifies the offset in the image file at
1065 * which the new clusters must start. *nb_clusters can be 0 on return in this
1066 * case if the cluster at host_offset is already in use. If *host_offset is
1067 * zero, the clusters can be allocated anywhere in the image file.
1068 *
1069 * *host_offset is updated to contain the offset into the image file at which
1070 * the first allocated cluster starts.
1071 *
1072 * Return 0 on success and -errno in error cases. -EAGAIN means that the
1073 * function has been waiting for another request and the allocation must be
1074 * restarted, but the whole request should not be failed.
1075 */
1076static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
1077                                   uint64_t *host_offset, uint64_t *nb_clusters)
1078{
1079    BDRVQcow2State *s = bs->opaque;
1080
1081    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
1082                                         *host_offset, *nb_clusters);
1083
1084    /* Allocate new clusters */
1085    trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
1086    if (*host_offset == 0) {
1087        int64_t cluster_offset =
1088            qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
1089        if (cluster_offset < 0) {
1090            return cluster_offset;
1091        }
1092        *host_offset = cluster_offset;
1093        return 0;
1094    } else {
1095        int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
1096        if (ret < 0) {
1097            return ret;
1098        }
1099        *nb_clusters = ret;
1100        return 0;
1101    }
1102}
1103
1104/*
1105 * Allocates new clusters for an area that either is yet unallocated or needs a
1106 * copy on write. If *host_offset is non-zero, clusters are only allocated if
1107 * the new allocation can match the specified host offset.
1108 *
1109 * Note that guest_offset may not be cluster aligned. In this case, the
 1110 * returned *host_offset points to the exact byte referenced by guest_offset
 1111 * and therefore isn't cluster aligned either.
1112 *
1113 * Returns:
1114 *   0:     if no clusters could be allocated. *bytes is set to 0,
1115 *          *host_offset is left unchanged.
1116 *
1117 *   1:     if new clusters were allocated. *bytes may be decreased if the
1118 *          new allocation doesn't cover all of the requested area.
1119 *          *host_offset is updated to contain the host offset of the first
1120 *          newly allocated cluster.
1121 *
1122 *  -errno: in error cases
1123 */
1124static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
1125    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
1126{
1127    BDRVQcow2State *s = bs->opaque;
1128    int l2_index;
1129    uint64_t *l2_table;
1130    uint64_t entry;
1131    uint64_t nb_clusters;
1132    int ret;
1133
1134    uint64_t alloc_cluster_offset;
1135
1136    trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
1137                             *bytes);
1138    assert(*bytes > 0);
1139
1140    /*
1141     * Calculate the number of clusters to look for. We stop at L2 table
1142     * boundaries to keep things simple.
1143     */
1144    nb_clusters =
1145        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
1146
1147    l2_index = offset_to_l2_index(s, guest_offset);
1148    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
1149    assert(nb_clusters <= INT_MAX);
1150
1151    /* Find L2 entry for the first involved cluster */
1152    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
1153    if (ret < 0) {
1154        return ret;
1155    }
1156
1157    entry = be64_to_cpu(l2_table[l2_index]);
1158
1159    /* For the moment, overwrite compressed clusters one by one */
1160    if (entry & QCOW_OFLAG_COMPRESSED) {
1161        nb_clusters = 1;
1162    } else {
1163        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
1164    }
1165
1166    /* This function is only called when there were no non-COW clusters, so if
1167     * we can't find any unallocated or COW clusters either, something is
1168     * wrong with our code. */
1169    assert(nb_clusters > 0);
1170
1171    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1172
1173    /* Allocate, if necessary at a given offset in the image file */
1174    alloc_cluster_offset = start_of_cluster(s, *host_offset);
1175    ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
1176                                  &nb_clusters);
1177    if (ret < 0) {
1178        goto fail;
1179    }
1180
1181    /* Can't extend contiguous allocation */
1182    if (nb_clusters == 0) {
1183        *bytes = 0;
1184        return 0;
1185    }
1186
1187    /* !*host_offset would overwrite the image header and is reserved for "no
1188     * host offset preferred". If 0 was a valid host offset, it'd trigger the
1189     * following overlap check; do that now to avoid having an invalid value in
1190     * *host_offset. */
1191    if (!alloc_cluster_offset) {
1192        ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset,
1193                                            nb_clusters * s->cluster_size);
1194        assert(ret < 0);
1195        goto fail;
1196    }
1197
1198    /*
 1199     * Save info needed for the metadata update.
1200     *
1201     * requested_sectors: Number of sectors from the start of the first
1202     * newly allocated cluster to the end of the (possibly shortened
1203     * before) write request.
1204     *
1205     * avail_sectors: Number of sectors from the start of the first
1206     * newly allocated to the end of the last newly allocated cluster.
1207     *
1208     * nb_sectors: The number of sectors from the start of the first
1209     * newly allocated cluster to the end of the area that the write
1210     * request actually writes to (excluding COW at the end)
1211     */
1212    int requested_sectors =
1213        (*bytes + offset_into_cluster(s, guest_offset))
1214        >> BDRV_SECTOR_BITS;
1215    int avail_sectors = nb_clusters
1216                        << (s->cluster_bits - BDRV_SECTOR_BITS);
1217    int alloc_n_start = offset_into_cluster(s, guest_offset)
1218                        >> BDRV_SECTOR_BITS;
1219    int nb_sectors = MIN(requested_sectors, avail_sectors);
1220    QCowL2Meta *old_m = *m;
1221
1222    *m = g_malloc0(sizeof(**m));
1223
1224    **m = (QCowL2Meta) {
1225        .next           = old_m,
1226
1227        .alloc_offset   = alloc_cluster_offset,
1228        .offset         = start_of_cluster(s, guest_offset),
1229        .nb_clusters    = nb_clusters,
1230        .nb_available   = nb_sectors,
1231
1232        .cow_start = {
1233            .offset     = 0,
1234            .nb_sectors = alloc_n_start,
1235        },
1236        .cow_end = {
1237            .offset     = nb_sectors * BDRV_SECTOR_SIZE,
1238            .nb_sectors = avail_sectors - nb_sectors,
1239        },
1240    };
1241    qemu_co_queue_init(&(*m)->dependent_requests);
1242    QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
1243
1244    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
1245    *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
1246                         - offset_into_cluster(s, guest_offset));
1247    assert(*bytes != 0);
1248
1249    return 1;
1250
1251fail:
1252    if (*m && (*m)->nb_clusters > 0) {
1253        QLIST_REMOVE(*m, next_in_flight);
1254    }
1255    return ret;
1256}
1257
1258/*
1259 * alloc_cluster_offset
1260 *
1261 * For a given offset on the virtual disk, find the cluster offset in qcow2
1262 * file. If the offset is not found, allocate a new cluster.
1263 *
1264 * If the cluster was already allocated, m->nb_clusters is set to 0 and
1265 * other fields in m are meaningless.
1266 *
1267 * If the cluster is newly allocated, m->nb_clusters is set to the number of
1268 * contiguous clusters that have been allocated. In this case, the other
1269 * fields of m are valid and contain information about the first allocated
1270 * cluster.
1271 *
1272 * If the request conflicts with another write request in flight, the coroutine
1273 * is queued and will be reentered when the dependency has completed.
1274 *
1275 * Return 0 on success and -errno in error cases
1276 */
1277int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
1278    int *num, uint64_t *host_offset, QCowL2Meta **m)
1279{
1280    BDRVQcow2State *s = bs->opaque;
1281    uint64_t start, remaining;
1282    uint64_t cluster_offset;
1283    uint64_t cur_bytes;
1284    int ret;
1285
1286    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *num);
1287
1288    assert((offset & ~BDRV_SECTOR_MASK) == 0);
1289
1290again:
1291    start = offset;
1292    remaining = (uint64_t)*num << BDRV_SECTOR_BITS;
1293    cluster_offset = 0;
1294    *host_offset = 0;
1295    cur_bytes = 0;
1296    *m = NULL;
1297
1298    while (true) {
1299
1300        if (!*host_offset) {
1301            *host_offset = start_of_cluster(s, cluster_offset);
1302        }
1303
1304        assert(remaining >= cur_bytes);
1305
1306        start           += cur_bytes;
1307        remaining       -= cur_bytes;
1308        cluster_offset  += cur_bytes;
1309
1310        if (remaining == 0) {
1311            break;
1312        }
1313
1314        cur_bytes = remaining;
1315
1316        /*
1317         * Now start gathering as many contiguous clusters as possible:
1318         *
1319         * 1. Check for overlaps with in-flight allocations
1320         *
1321         *      a) Overlap not in the first cluster -> shorten this request and
1322         *         let the caller handle the rest in its next loop iteration.
1323         *
1324         *      b) Real overlaps of two requests. Yield and restart the search
1325         *         for contiguous clusters (the situation could have changed
1326         *         while we were sleeping)
1327         *
1328         *      c) TODO: Request starts in the same cluster as the in-flight
 1329         *         allocation ends. Shorten the COW of the in-flight allocation,
1330         *         set cluster_offset to write to the same cluster and set up
1331         *         the right synchronisation between the in-flight request and
1332         *         the new one.
1333         */
1334        ret = handle_dependencies(bs, start, &cur_bytes, m);
1335        if (ret == -EAGAIN) {
1336            /* Currently handle_dependencies() doesn't yield if we already had
1337             * an allocation. If it did, we would have to clean up the L2Meta
1338             * structs before starting over. */
1339            assert(*m == NULL);
1340            goto again;
1341        } else if (ret < 0) {
1342            return ret;
1343        } else if (cur_bytes == 0) {
1344            break;
1345        } else {
1346            /* handle_dependencies() may have decreased cur_bytes (shortened
1347             * the allocations below) so that the next dependency is processed
1348             * correctly during the next loop iteration. */
1349        }
1350
1351        /*
1352         * 2. Count contiguous COPIED clusters.
1353         */
1354        ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
1355        if (ret < 0) {
1356            return ret;
1357        } else if (ret) {
1358            continue;
1359        } else if (cur_bytes == 0) {
1360            break;
1361        }
1362
1363        /*
1364         * 3. If the request still hasn't completed, allocate new clusters,
1365         *    considering any cluster_offset of steps 1c or 2.
1366         */
1367        ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
1368        if (ret < 0) {
1369            return ret;
1370        } else if (ret) {
1371            continue;
1372        } else {
1373            assert(cur_bytes == 0);
1374            break;
1375        }
1376    }
1377
1378    *num -= remaining >> BDRV_SECTOR_BITS;
1379    assert(*num > 0);
1380    assert(*host_offset != 0);
1381
1382    return 0;
1383}
1384
1385static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
1386                             const uint8_t *buf, int buf_size)
1387{
1388    z_stream strm1, *strm = &strm1;
1389    int ret, out_len;
1390
1391    memset(strm, 0, sizeof(*strm));
1392
1393    strm->next_in = (uint8_t *)buf;
1394    strm->avail_in = buf_size;
1395    strm->next_out = out_buf;
1396    strm->avail_out = out_buf_size;
1397
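         /* Negative windowBits selects raw deflate data without a zlib header,
          * with a 2^12 = 4 KiB window; this is how qcow2 compresses clusters */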
1398    ret = inflateInit2(strm, -12);
1399    if (ret != Z_OK)
1400        return -1;
1401    ret = inflate(strm, Z_FINISH);
1402    out_len = strm->next_out - out_buf;
1403    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
1404        out_len != out_buf_size) {
1405        inflateEnd(strm);
1406        return -1;
1407    }
1408    inflateEnd(strm);
1409    return 0;
1410}
1411
1412int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
1413{
1414    BDRVQcow2State *s = bs->opaque;
1415    int ret, csize, nb_csectors, sector_offset;
1416    uint64_t coffset;
1417
1418    coffset = cluster_offset & s->cluster_offset_mask;
1419    if (s->cluster_cache_offset != coffset) {
1420        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
1421        sector_offset = coffset & 511;
1422        csize = nb_csectors * 512 - sector_offset;
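             /* The compressed data need not start on a sector boundary: whole
              * sectors are read, decompression skips the first sector_offset
              * bytes, and csize extends to the end of the last sector read */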
1423        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
1424        ret = bdrv_read(bs->file->bs, coffset >> 9, s->cluster_data,
1425                        nb_csectors);
1426        if (ret < 0) {
1427            return ret;
1428        }
1429        if (decompress_buffer(s->cluster_cache, s->cluster_size,
1430                              s->cluster_data + sector_offset, csize) < 0) {
1431            return -EIO;
1432        }
1433        s->cluster_cache_offset = coffset;
1434    }
1435    return 0;
1436}
1437
1438/*
1439 * This discards as many clusters of nb_clusters as possible at once (i.e.
1440 * all clusters in the same L2 table) and returns the number of discarded
1441 * clusters.
1442 */
1443static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
1444                             uint64_t nb_clusters, enum qcow2_discard_type type,
1445                             bool full_discard)
1446{
1447    BDRVQcow2State *s = bs->opaque;
1448    uint64_t *l2_table;
1449    int l2_index;
1450    int ret;
1451    int i;
1452
1453    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
1454    if (ret < 0) {
1455        return ret;
1456    }
1457
1458    /* Limit nb_clusters to one L2 table */
1459    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
1460    assert(nb_clusters <= INT_MAX);
1461
1462    for (i = 0; i < nb_clusters; i++) {
1463        uint64_t old_l2_entry;
1464
1465        old_l2_entry = be64_to_cpu(l2_table[l2_index + i]);
1466
1467        /*
1468         * If full_discard is false, make sure that a discarded area reads back
1469         * as zeroes for v3 images (we cannot do it for v2 without actually
1470         * writing a zero-filled buffer). We can skip the operation if the
1471         * cluster is already marked as zero, or if it's unallocated and we
1472         * don't have a backing file.
1473         *
1474         * TODO We might want to use bdrv_get_block_status(bs) here, but we're
1475         * holding s->lock, so that doesn't work today.
1476         *
1477         * If full_discard is true, the sector should not read back as zeroes,
1478         * but rather fall through to the backing file.
1479         */
1480        switch (qcow2_get_cluster_type(old_l2_entry)) {
1481            case QCOW2_CLUSTER_UNALLOCATED:
1482                if (full_discard || !bs->backing) {
1483                    continue;
1484                }
1485                break;
1486
1487            case QCOW2_CLUSTER_ZERO:
1488                if (!full_discard) {
1489                    continue;
1490                }
1491                break;
1492
1493            case QCOW2_CLUSTER_NORMAL:
1494            case QCOW2_CLUSTER_COMPRESSED:
1495                break;
1496
1497            default:
1498                abort();
1499        }
1500
1501        /* First remove L2 entries */
1502        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
1503        if (!full_discard && s->qcow_version >= 3) {
1504            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
1505        } else {
1506            l2_table[l2_index + i] = cpu_to_be64(0);
1507        }
1508
1509        /* Then decrease the refcount */
1510        qcow2_free_any_clusters(bs, old_l2_entry, 1, type);
1511    }
1512
1513    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1514
1515    return nb_clusters;
1516}
1517
1518int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
1519    int nb_sectors, enum qcow2_discard_type type, bool full_discard)
1520{
1521    BDRVQcow2State *s = bs->opaque;
1522    uint64_t end_offset;
1523    uint64_t nb_clusters;
1524    int ret;
1525
 1526    end_offset = offset + ((uint64_t) nb_sectors << BDRV_SECTOR_BITS);
1527
1528    /* Round start up and end down */
1529    offset = align_offset(offset, s->cluster_size);
1530    end_offset = start_of_cluster(s, end_offset);
1531
1532    if (offset > end_offset) {
1533        return 0;
1534    }
1535
1536    nb_clusters = size_to_clusters(s, end_offset - offset);
1537
1538    s->cache_discards = true;
1539
1540    /* Each L2 table is handled by its own loop iteration */
1541    while (nb_clusters > 0) {
1542        ret = discard_single_l2(bs, offset, nb_clusters, type, full_discard);
1543        if (ret < 0) {
1544            goto fail;
1545        }
1546
1547        nb_clusters -= ret;
1548        offset += (ret * s->cluster_size);
1549    }
1550
1551    ret = 0;
1552fail:
1553    s->cache_discards = false;
1554    qcow2_process_discards(bs, ret);
1555
1556    return ret;
1557}
1558
1559/*
1560 * This zeroes as many clusters of nb_clusters as possible at once (i.e.
1561 * all clusters in the same L2 table) and returns the number of zeroed
1562 * clusters.
1563 */
1564static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
1565                          uint64_t nb_clusters)
1566{
1567    BDRVQcow2State *s = bs->opaque;
1568    uint64_t *l2_table;
1569    int l2_index;
1570    int ret;
1571    int i;
1572
1573    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
1574    if (ret < 0) {
1575        return ret;
1576    }
1577
1578    /* Limit nb_clusters to one L2 table */
1579    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
1580    assert(nb_clusters <= INT_MAX);
1581
1582    for (i = 0; i < nb_clusters; i++) {
1583        uint64_t old_offset;
1584
1585        old_offset = be64_to_cpu(l2_table[l2_index + i]);
1586
1587        /* Update L2 entries */
1588        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
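             /* A compressed cluster cannot simply gain the zero flag: replace
              * the entry and free the compressed data. Normal clusters keep
              * their allocation and only get the flag set. */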
1589        if (old_offset & QCOW_OFLAG_COMPRESSED) {
1590            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
1591            qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
1592        } else {
1593            l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
1594        }
1595    }
1596
1597    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1598
1599    return nb_clusters;
1600}
1601
1602int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
1603{
1604    BDRVQcow2State *s = bs->opaque;
1605    uint64_t nb_clusters;
1606    int ret;
1607
1608    /* The zero flag is only supported by version 3 and newer */
1609    if (s->qcow_version < 3) {
1610        return -ENOTSUP;
1611    }
1612
1613    /* Each L2 table is handled by its own loop iteration */
1614    nb_clusters = size_to_clusters(s, (uint64_t) nb_sectors << BDRV_SECTOR_BITS);
1615
1616    s->cache_discards = true;
1617
1618    while (nb_clusters > 0) {
1619        ret = zero_single_l2(bs, offset, nb_clusters);
1620        if (ret < 0) {
1621            goto fail;
1622        }
1623
1624        nb_clusters -= ret;
1625        offset += (ret * s->cluster_size);
1626    }
1627
1628    ret = 0;
1629fail:
1630    s->cache_discards = false;
1631    qcow2_process_discards(bs, ret);
1632
1633    return ret;
1634}
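
/*
 * Illustrative sketch (not part of the driver): how a write-zeroes handler
 * might use qcow2_zero_clusters() above.  The zero flag covers whole clusters
 * only, so this hypothetical caller rejects unaligned requests with -ENOTSUP
 * and lets the generic block layer emulate them; cluster_sectors stands in
 * for s->cluster_sectors, and locking of the qcow2 state is omitted.
 */
static inline int example_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                                       int nb_sectors, int cluster_sectors)
{
    if (sector_num % cluster_sectors || nb_sectors % cluster_sectors) {
        return -ENOTSUP;   /* not cluster-aligned, fall back to plain writes */
    }

    return qcow2_zero_clusters(bs, (uint64_t) sector_num << BDRV_SECTOR_BITS,
                               nb_sectors);
}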
1635
1636/*
1637 * Expands all zero clusters in a specific L1 table (or deallocates them, for
1638 * non-backed non-pre-allocated zero clusters).
1639 *
1640 * l1_entries and *visited_l1_entries are used to keep track of progress for
1641 * status_cb(). l1_entries contains the total number of L1 entries and
1642 * *visited_l1_entries counts all visited L1 entries.
1643 */
1644static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
1645                                      int l1_size, int64_t *visited_l1_entries,
1646                                      int64_t l1_entries,
1647                                      BlockDriverAmendStatusCB *status_cb,
1648                                      void *cb_opaque)
1649{
1650    BDRVQcow2State *s = bs->opaque;
1651    bool is_active_l1 = (l1_table == s->l1_table);
1652    uint64_t *l2_table = NULL;
1653    int ret;
1654    int i, j;
1655
1656    if (!is_active_l1) {
1657        /* Inactive L2 tables are not kept in the L2 table cache, so they
1658         * need a local buffer to be read into when loaded from disk */
1659        l2_table = qemu_try_blockalign(bs->file->bs, s->cluster_size);
1660        if (l2_table == NULL) {
1661            return -ENOMEM;
1662        }
1663    }
1664
1665    for (i = 0; i < l1_size; i++) {
1666        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
1667        bool l2_dirty = false;
1668        uint64_t l2_refcount;
1669
1670        if (!l2_offset) {
1671            /* unallocated */
1672            (*visited_l1_entries)++;
1673            if (status_cb) {
1674                status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
1675            }
1676            continue;
1677        }
1678
1679        if (offset_into_cluster(s, l2_offset)) {
1680            qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
1681                                    PRIx64 " unaligned (L1 index: %#x)",
1682                                    l2_offset, i);
1683            ret = -EIO;
1684            goto fail;
1685        }
1686
1687        if (is_active_l1) {
1688            /* get active L2 tables from cache */
1689            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
1690                    (void **)&l2_table);
1691        } else {
1692            /* load inactive L2 tables from disk */
1693            ret = bdrv_read(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE,
1694                            (void *)l2_table, s->cluster_sectors);
1695        }
1696        if (ret < 0) {
1697            goto fail;
1698        }
1699
1700        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1701                                 &l2_refcount);
1702        if (ret < 0) {
1703            goto fail;
1704        }
1705
1706        for (j = 0; j < s->l2_size; j++) {
1707            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1708            int64_t offset = l2_entry & L2E_OFFSET_MASK;
1709            int cluster_type = qcow2_get_cluster_type(l2_entry);
1710            bool preallocated = offset != 0;
1711
1712            if (cluster_type != QCOW2_CLUSTER_ZERO) {
1713                continue;
1714            }
1715
1716            if (!preallocated) {
1717                if (!bs->backing) {
1718                    /* not backed; therefore we can simply deallocate the
1719                     * cluster */
1720                    l2_table[j] = 0;
1721                    l2_dirty = true;
1722                    continue;
1723                }
1724
1725                offset = qcow2_alloc_clusters(bs, s->cluster_size);
1726                if (offset < 0) {
1727                    ret = offset;
1728                    goto fail;
1729                }
1730
1731                if (l2_refcount > 1) {
1732                    /* For shared L2 tables, set the refcount accordingly (it is
1733                     * already 1 and needs to be l2_refcount) */
1734                    ret = qcow2_update_cluster_refcount(bs,
1735                            offset >> s->cluster_bits,
1736                            refcount_diff(1, l2_refcount), false,
1737                            QCOW2_DISCARD_OTHER);
1738                    if (ret < 0) {
1739                        qcow2_free_clusters(bs, offset, s->cluster_size,
1740                                            QCOW2_DISCARD_OTHER);
1741                        goto fail;
1742                    }
1743                }
1744            }
1745
1746            if (offset_into_cluster(s, offset)) {
1747                qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
1748                                        "%#" PRIx64 " unaligned (L2 offset: %#"
1749                                        PRIx64 ", L2 index: %#x)", offset,
1750                                        l2_offset, j);
1751                if (!preallocated) {
1752                    qcow2_free_clusters(bs, offset, s->cluster_size,
1753                                        QCOW2_DISCARD_ALWAYS);
1754                }
1755                ret = -EIO;
1756                goto fail;
1757            }
1758
1759            ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
1760            if (ret < 0) {
1761                if (!preallocated) {
1762                    qcow2_free_clusters(bs, offset, s->cluster_size,
1763                                        QCOW2_DISCARD_ALWAYS);
1764                }
1765                goto fail;
1766            }
1767
1768            ret = bdrv_write_zeroes(bs->file->bs, offset / BDRV_SECTOR_SIZE,
1769                                    s->cluster_sectors, 0);
1770            if (ret < 0) {
1771                if (!preallocated) {
1772                    qcow2_free_clusters(bs, offset, s->cluster_size,
1773                                        QCOW2_DISCARD_ALWAYS);
1774                }
1775                goto fail;
1776            }
1777
1778            if (l2_refcount == 1) {
1779                l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
1780            } else {
1781                l2_table[j] = cpu_to_be64(offset);
1782            }
1783            l2_dirty = true;
1784        }
1785
1786        if (is_active_l1) {
1787            if (l2_dirty) {
1788                qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
1789                qcow2_cache_depends_on_flush(s->l2_table_cache);
1790            }
1791            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1792        } else {
1793            if (l2_dirty) {
1794                ret = qcow2_pre_write_overlap_check(bs,
1795                        QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset,
1796                        s->cluster_size);
1797                if (ret < 0) {
1798                    goto fail;
1799                }
1800
1801                ret = bdrv_write(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE,
1802                                 (void *)l2_table, s->cluster_sectors);
1803                if (ret < 0) {
1804                    goto fail;
1805                }
1806            }
1807        }
1808
1809        (*visited_l1_entries)++;
1810        if (status_cb) {
1811            status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
1812        }
1813    }
1814
1815    ret = 0;
1816
1817fail:
1818    if (l2_table) {
1819        if (!is_active_l1) {
1820            qemu_vfree(l2_table);
1821        } else {
1822            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
1823        }
1824    }
1825    return ret;
1826}
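
/*
 * Illustrative sketch (not part of the driver): the per-entry decision taken
 * by expand_zero_clusters_in_l1() above.  Only zero clusters are touched; a
 * preallocated one keeps its host cluster, an unallocated one either gets a
 * freshly allocated cluster (if the image is backed) or is simply dropped
 * (if it is not).  In both kept cases the host cluster is overwritten with
 * literal zeros so the entry no longer needs the v3-only zero flag.  The
 * names below are hypothetical and only label the three outcomes.
 */
enum example_zero_expansion {
    EXAMPLE_DEALLOCATE,       /* clear the L2 entry, write no data */
    EXAMPLE_ALLOCATE_ZEROED,  /* allocate a new cluster and zero it */
    EXAMPLE_REWRITE_ZEROED,   /* keep the preallocated cluster, zero it */
};

static inline enum example_zero_expansion
example_expand_zero_entry(bool preallocated, bool has_backing_file)
{
    if (preallocated) {
        return EXAMPLE_REWRITE_ZEROED;
    }
    return has_backing_file ? EXAMPLE_ALLOCATE_ZEROED : EXAMPLE_DEALLOCATE;
}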
1827
1828/*
1829 * For backed images, expands all zero clusters on the image. For non-backed
1830 * images, deallocates all non-pre-allocated zero clusters (and claims the
1831 * allocation for pre-allocated ones). This is important for downgrading to a
1832 * qcow2 version which doesn't yet support metadata zero clusters.
1833 */
1834int qcow2_expand_zero_clusters(BlockDriverState *bs,
1835                               BlockDriverAmendStatusCB *status_cb,
1836                               void *cb_opaque)
1837{
1838    BDRVQcow2State *s = bs->opaque;
1839    uint64_t *l1_table = NULL;
1840    int64_t l1_entries = 0, visited_l1_entries = 0;
1841    int ret;
1842    int i, j;
1843
1844    if (status_cb) {
1845        l1_entries = s->l1_size;
1846        for (i = 0; i < s->nb_snapshots; i++) {
1847            l1_entries += s->snapshots[i].l1_size;
1848        }
1849    }
1850
1851    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
1852                                     &visited_l1_entries, l1_entries,
1853                                     status_cb, cb_opaque);
1854    if (ret < 0) {
1855        goto fail;
1856    }
1857
1858    /* Inactive L1 tables may point to active L2 tables - therefore it is
1859     * necessary to flush the L2 table cache before trying to access the L2
1860     * tables pointed to by inactive L1 entries (else we might try to expand
1861     * zero clusters that have already been expanded); furthermore, it is also
1862     * necessary to empty the L2 table cache, since it may contain tables which
1863     * are now going to be modified directly on disk, bypassing the cache.
1864     * qcow2_cache_empty() does both for us. */
1865    ret = qcow2_cache_empty(bs, s->l2_table_cache);
1866    if (ret < 0) {
1867        goto fail;
1868    }
1869
1870    for (i = 0; i < s->nb_snapshots; i++) {
1871        int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) +
1872                BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE;
1873
1874        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);
1875
1876        ret = bdrv_read(bs->file->bs,
1877                        s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE,
1878                        (void *)l1_table, l1_sectors);
1879        if (ret < 0) {
1880            goto fail;
1881        }
1882
1883        for (j = 0; j < s->snapshots[i].l1_size; j++) {
1884            be64_to_cpus(&l1_table[j]);
1885        }
1886
1887        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
1888                                         &visited_l1_entries, l1_entries,
1889                                         status_cb, cb_opaque);
1890        if (ret < 0) {
1891            goto fail;
1892        }
1893    }
1894
1895    ret = 0;
1896
1897fail:
1898    g_free(l1_table);
1899    return ret;
1900}
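
/*
 * Illustrative sketch (not part of the driver): how a downgrade path might
 * use qcow2_expand_zero_clusters() above.  Because qcow2 v2 has no zero flag,
 * every zero cluster has to be expanded (or dropped) before the header's
 * version field can be lowered.  example_print_progress and
 * example_prepare_downgrade are hypothetical; the callback signature matches
 * BlockDriverAmendStatusCB.
 */
static void example_print_progress(BlockDriverState *bs, int64_t offset,
                                   int64_t total, void *opaque)
{
    fprintf(stderr, "zero cluster expansion: %" PRId64 "/%" PRId64
            " L1 entries visited\n", offset, total);
}

static inline int example_prepare_downgrade(BlockDriverState *bs)
{
    /* must run before the version field in the image header is rewritten */
    return qcow2_expand_zero_clusters(bs, example_print_progress, NULL);
}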
1901