qemu/block/qcow.c
/*
 * Block driver for the QCOW format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "qemu/error-report.h"
#include "block/block_int.h"
#include "sysemu/block-backend.h"
#include "qemu/module.h"
#include <zlib.h>
#include "qapi/qmp/qerror.h"
#include "crypto/cipher.h"
#include "migration/migration.h"

/**************************************************************/
/* QEMU COW block driver with compression and encryption support */

#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
#define QCOW_VERSION 1

#define QCOW_CRYPT_NONE 0
#define QCOW_CRYPT_AES  1

#define QCOW_OFLAG_COMPRESSED (1LL << 63)

typedef struct QCowHeader {
    uint32_t magic;
    uint32_t version;
    uint64_t backing_file_offset;
    uint32_t backing_file_size;
    uint32_t mtime;
    uint64_t size; /* in bytes */
    uint8_t cluster_bits;
    uint8_t l2_bits;
    uint16_t padding;
    uint32_t crypt_method;
    uint64_t l1_table_offset;
} QEMU_PACKED QCowHeader;

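/* Number of L2 tables kept in the in-memory cache.  Each slot holds one
 * complete L2 table; the slot with the lowest hit count is recycled when a
 * new table has to be loaded (see get_cluster_offset()). */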
#define L2_CACHE_SIZE 16

typedef struct BDRVQcowState {
    int cluster_bits;
    int cluster_size;
    int cluster_sectors;
    int l2_bits;
    int l2_size;
    unsigned int l1_size;
    uint64_t cluster_offset_mask;
    uint64_t l1_table_offset;
    uint64_t *l1_table;
    uint64_t *l2_cache;
    uint64_t l2_cache_offsets[L2_CACHE_SIZE];
    uint32_t l2_cache_counts[L2_CACHE_SIZE];
    uint8_t *cluster_cache;
    uint8_t *cluster_data;
    uint64_t cluster_cache_offset;
    QCryptoCipher *cipher; /* NULL if no key yet */
    uint32_t crypt_method_header;
    CoMutex lock;
    Error *migration_blocker;
} BDRVQcowState;

static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);

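/* Report how confident we are that the buffer holds a qcow (version 1)
 * image: 100 if the header magic and version match, 0 otherwise. */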
static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    const QCowHeader *cow_header = (const void *)buf;

    if (buf_size >= sizeof(QCowHeader) &&
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
        be32_to_cpu(cow_header->version) == QCOW_VERSION)
        return 100;
    else
        return 0;
}

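/* Open an existing qcow image: read and validate the header, load the L1
 * table, allocate the L2 table cache and the per-cluster buffers, and read
 * the backing file name if one is present.  Live migration is blocked while
 * a qcow image is in use. */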
static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
{
    BDRVQcowState *s = bs->opaque;
    unsigned int len, i, shift;
    int ret;
    QCowHeader header;

    ret = bdrv_pread(bs->file->bs, 0, &header, sizeof(header));
    if (ret < 0) {
        goto fail;
    }
    be32_to_cpus(&header.magic);
    be32_to_cpus(&header.version);
    be64_to_cpus(&header.backing_file_offset);
    be32_to_cpus(&header.backing_file_size);
    be32_to_cpus(&header.mtime);
    be64_to_cpus(&header.size);
    be32_to_cpus(&header.crypt_method);
    be64_to_cpus(&header.l1_table_offset);

    if (header.magic != QCOW_MAGIC) {
        error_setg(errp, "Image not in qcow format");
        ret = -EINVAL;
        goto fail;
    }
    if (header.version != QCOW_VERSION) {
        error_setg(errp, "Unsupported qcow version %" PRIu32, header.version);
        ret = -ENOTSUP;
        goto fail;
    }

    if (header.size <= 1) {
        error_setg(errp, "Image size is too small (must be at least 2 bytes)");
        ret = -EINVAL;
        goto fail;
    }
    if (header.cluster_bits < 9 || header.cluster_bits > 16) {
        error_setg(errp, "Cluster size must be between 512 and 64k");
        ret = -EINVAL;
        goto fail;
    }

    /* l2_bits specifies number of entries; storing a uint64_t in each entry,
     * so bytes = num_entries << 3. */
    if (header.l2_bits < 9 - 3 || header.l2_bits > 16 - 3) {
        error_setg(errp, "L2 table size must be between 512 and 64k");
        ret = -EINVAL;
        goto fail;
    }

    if (header.crypt_method > QCOW_CRYPT_AES) {
        error_setg(errp, "invalid encryption method in qcow header");
        ret = -EINVAL;
        goto fail;
    }
    if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128)) {
        error_setg(errp, "AES cipher not available");
        ret = -EINVAL;
        goto fail;
    }
    s->crypt_method_header = header.crypt_method;
    if (s->crypt_method_header) {
        if (bdrv_uses_whitelist() &&
            s->crypt_method_header == QCOW_CRYPT_AES) {
            error_report("qcow built-in AES encryption is deprecated");
            error_printf("Support for it will be removed in a future release.\n"
                         "You can use 'qemu-img convert' to switch to an\n"
                         "unencrypted qcow image, or a LUKS raw image.\n");
        }

        bs->encrypted = 1;
    }
    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;
    s->cluster_sectors = 1 << (s->cluster_bits - 9);
    s->l2_bits = header.l2_bits;
    s->l2_size = 1 << s->l2_bits;
    bs->total_sectors = header.size / 512;
    s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;

    /* read the level 1 table */
    shift = s->cluster_bits + s->l2_bits;
    if (header.size > UINT64_MAX - (1LL << shift)) {
        error_setg(errp, "Image too large");
        ret = -EINVAL;
        goto fail;
    } else {
        uint64_t l1_size = (header.size + (1LL << shift) - 1) >> shift;
        if (l1_size > INT_MAX / sizeof(uint64_t)) {
            error_setg(errp, "Image too large");
            ret = -EINVAL;
            goto fail;
        }
        s->l1_size = l1_size;
    }

    s->l1_table_offset = header.l1_table_offset;
    s->l1_table = g_try_new(uint64_t, s->l1_size);
    if (s->l1_table == NULL) {
        error_setg(errp, "Could not allocate memory for L1 table");
        ret = -ENOMEM;
        goto fail;
    }

    ret = bdrv_pread(bs->file->bs, s->l1_table_offset, s->l1_table,
               s->l1_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail;
    }

    for(i = 0;i < s->l1_size; i++) {
        be64_to_cpus(&s->l1_table[i]);
    }

    /* alloc L2 cache (max. 8k entries * 16 tables * 8 bytes = 1 MB) */
    s->l2_cache =
        qemu_try_blockalign(bs->file->bs,
                            s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    if (s->l2_cache == NULL) {
        error_setg(errp, "Could not allocate L2 table cache");
        ret = -ENOMEM;
        goto fail;
    }
    s->cluster_cache = g_malloc(s->cluster_size);
    s->cluster_data = g_malloc(s->cluster_size);
    s->cluster_cache_offset = -1;

    /* read the backing file name */
    if (header.backing_file_offset != 0) {
        len = header.backing_file_size;
        if (len > 1023 || len >= sizeof(bs->backing_file)) {
            error_setg(errp, "Backing file name too long");
            ret = -EINVAL;
            goto fail;
        }
        ret = bdrv_pread(bs->file->bs, header.backing_file_offset,
                   bs->backing_file, len);
        if (ret < 0) {
            goto fail;
        }
        bs->backing_file[len] = '\0';
    }

    /* Disable migration when qcow images are used */
    error_setg(&s->migration_blocker, "The qcow format used by node '%s' "
               "does not support live migration",
               bdrv_get_device_or_node_name(bs));
    migrate_add_blocker(s->migration_blocker);

    qemu_co_mutex_init(&s->lock);
    return 0;

 fail:
    g_free(s->l1_table);
    qemu_vfree(s->l2_cache);
    g_free(s->cluster_cache);
    g_free(s->cluster_data);
    return ret;
}


/* We have nothing to do for QCOW reopen, stubs just return
 * success */
static int qcow_reopen_prepare(BDRVReopenState *state,
                               BlockReopenQueue *queue, Error **errp)
{
    return 0;
}

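/* Derive the AES key from the user-supplied password: the password is copied
 * (truncated or zero-padded) into a 16-byte buffer that is used directly as
 * an AES-128-CBC key.  Returns 0 on success, -1 if the cipher cannot be
 * created. */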
static int qcow_set_key(BlockDriverState *bs, const char *key)
{
    BDRVQcowState *s = bs->opaque;
    uint8_t keybuf[16];
    int len, i;
    Error *err;

    memset(keybuf, 0, 16);
    len = strlen(key);
    if (len > 16)
        len = 16;
    /* XXX: we could compress the chars to 7 bits to increase
       entropy */
    for(i = 0;i < len;i++) {
        keybuf[i] = key[i];
    }
    assert(bs->encrypted);

    qcrypto_cipher_free(s->cipher);
    s->cipher = qcrypto_cipher_new(
        QCRYPTO_CIPHER_ALG_AES_128,
        QCRYPTO_CIPHER_MODE_CBC,
        keybuf, G_N_ELEMENTS(keybuf),
        &err);

    if (!s->cipher) {
        /* XXX would be nice if errors in this method could
         * be properly propagated to the caller. Would need
         * the bdrv_set_key() API signature to be fixed. */
        error_free(err);
        return -1;
    }
    return 0;
}

/* The crypt function is compatible with the linux cryptoloop
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
   supported */
static int encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
                           uint8_t *out_buf, const uint8_t *in_buf,
                           int nb_sectors, bool enc, Error **errp)
{
    union {
        uint64_t ll[2];
        uint8_t b[16];
    } ivec;
    int i;
    int ret;

    for(i = 0; i < nb_sectors; i++) {
        ivec.ll[0] = cpu_to_le64(sector_num);
        ivec.ll[1] = 0;
        if (qcrypto_cipher_setiv(s->cipher,
                                 ivec.b, G_N_ELEMENTS(ivec.b),
                                 errp) < 0) {
            return -1;
        }
        if (enc) {
            ret = qcrypto_cipher_encrypt(s->cipher,
                                         in_buf,
                                         out_buf,
                                         512,
                                         errp);
        } else {
            ret = qcrypto_cipher_decrypt(s->cipher,
                                         in_buf,
                                         out_buf,
                                         512,
                                         errp);
        }
        if (ret < 0) {
            return -1;
        }
        sector_num++;
        in_buf += 512;
        out_buf += 512;
    }
    return 0;
}

/* 'allocate' is:
 *
 * 0 to not allocate.
 *
 * 1 to allocate a normal cluster (for sector indexes 'n_start' to
 * 'n_end')
 *
 * 2 to allocate a compressed cluster of size
 * 'compressed_size'. 'compressed_size' must be > 0 and <
 * cluster_size
 *
 * return 0 if not allocated.
 */
static uint64_t get_cluster_offset(BlockDriverState *bs,
                                   uint64_t offset, int allocate,
                                   int compressed_size,
                                   int n_start, int n_end)
{
    BDRVQcowState *s = bs->opaque;
    int min_index, i, j, l1_index, l2_index;
    uint64_t l2_offset, *l2_table, cluster_offset, tmp;
    uint32_t min_count;
    int new_l2_table;

    l1_index = offset >> (s->l2_bits + s->cluster_bits);
    l2_offset = s->l1_table[l1_index];
    new_l2_table = 0;
    if (!l2_offset) {
        if (!allocate)
            return 0;
        /* allocate a new l2 entry */
        l2_offset = bdrv_getlength(bs->file->bs);
        /* round to cluster size */
        l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1);
        /* update the L1 entry */
        s->l1_table[l1_index] = l2_offset;
        tmp = cpu_to_be64(l2_offset);
        if (bdrv_pwrite_sync(bs->file->bs,
                s->l1_table_offset + l1_index * sizeof(tmp),
                &tmp, sizeof(tmp)) < 0)
            return 0;
        new_l2_table = 1;
    }
    for(i = 0; i < L2_CACHE_SIZE; i++) {
        if (l2_offset == s->l2_cache_offsets[i]) {
            /* increment the hit count */
            if (++s->l2_cache_counts[i] == 0xffffffff) {
                for(j = 0; j < L2_CACHE_SIZE; j++) {
                    s->l2_cache_counts[j] >>= 1;
                }
            }
            l2_table = s->l2_cache + (i << s->l2_bits);
            goto found;
        }
    }
    /* not found: load a new entry in the least used one */
    min_index = 0;
    min_count = 0xffffffff;
    for(i = 0; i < L2_CACHE_SIZE; i++) {
        if (s->l2_cache_counts[i] < min_count) {
            min_count = s->l2_cache_counts[i];
            min_index = i;
        }
    }
    l2_table = s->l2_cache + (min_index << s->l2_bits);
    if (new_l2_table) {
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
        if (bdrv_pwrite_sync(bs->file->bs, l2_offset, l2_table,
                s->l2_size * sizeof(uint64_t)) < 0)
            return 0;
    } else {
        if (bdrv_pread(bs->file->bs, l2_offset, l2_table,
                       s->l2_size * sizeof(uint64_t)) !=
            s->l2_size * sizeof(uint64_t))
            return 0;
    }
    s->l2_cache_offsets[min_index] = l2_offset;
    s->l2_cache_counts[min_index] = 1;
 found:
    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
    if (!cluster_offset ||
        ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
        if (!allocate)
            return 0;
        /* allocate a new cluster */
        if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
            (n_end - n_start) < s->cluster_sectors) {
            /* if the cluster is already compressed, we must
               decompress it in case it is not completely
               overwritten */
            if (decompress_cluster(bs, cluster_offset) < 0)
                return 0;
            cluster_offset = bdrv_getlength(bs->file->bs);
            cluster_offset = (cluster_offset + s->cluster_size - 1) &
                ~(s->cluster_size - 1);
            /* write the cluster content */
            if (bdrv_pwrite(bs->file->bs, cluster_offset, s->cluster_cache,
                            s->cluster_size) !=
                s->cluster_size)
                return -1;
        } else {
            cluster_offset = bdrv_getlength(bs->file->bs);
            if (allocate == 1) {
                /* round to cluster size */
                cluster_offset = (cluster_offset + s->cluster_size - 1) &
                    ~(s->cluster_size - 1);
                bdrv_truncate(bs->file->bs, cluster_offset + s->cluster_size);
                /* if encrypted, we must initialize the cluster
                   content which won't be written */
                if (bs->encrypted &&
                    (n_end - n_start) < s->cluster_sectors) {
                    uint64_t start_sect;
                    assert(s->cipher);
                    start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
                    memset(s->cluster_data + 512, 0x00, 512);
                    for(i = 0; i < s->cluster_sectors; i++) {
                        if (i < n_start || i >= n_end) {
                            Error *err = NULL;
                            if (encrypt_sectors(s, start_sect + i,
                                                s->cluster_data,
                                                s->cluster_data + 512, 1,
                                                true, &err) < 0) {
                                error_free(err);
                                errno = EIO;
                                return -1;
                            }
                            if (bdrv_pwrite(bs->file->bs,
                                            cluster_offset + i * 512,
                                            s->cluster_data, 512) != 512)
                                return -1;
                        }
                    }
                }
            } else if (allocate == 2) {
                cluster_offset |= QCOW_OFLAG_COMPRESSED |
                    (uint64_t)compressed_size << (63 - s->cluster_bits);
            }
        }
        /* update L2 table */
        tmp = cpu_to_be64(cluster_offset);
        l2_table[l2_index] = tmp;
        if (bdrv_pwrite_sync(bs->file->bs, l2_offset + l2_index * sizeof(tmp),
                &tmp, sizeof(tmp)) < 0)
            return 0;
    }
    return cluster_offset;
}

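/* Coroutine wrapper around get_cluster_offset() for bdrv_get_block_status():
 * reports whether the sectors are allocated in this layer and, for plain
 * (uncompressed, unencrypted) clusters, where they live in the underlying
 * file. */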
static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
{
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster, n;
    uint64_t cluster_offset;

    qemu_co_mutex_lock(&s->lock);
    cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
    qemu_co_mutex_unlock(&s->lock);
    index_in_cluster = sector_num & (s->cluster_sectors - 1);
    n = s->cluster_sectors - index_in_cluster;
    if (n > nb_sectors)
        n = nb_sectors;
    *pnum = n;
    if (!cluster_offset) {
        return 0;
    }
    if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->cipher) {
        return BDRV_BLOCK_DATA;
    }
    cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS);
    *file = bs->file->bs;
    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | cluster_offset;
}

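/* Inflate a raw deflate stream (no zlib header) into out_buf.  Returns 0
 * only if exactly out_buf_size bytes are produced, -1 otherwise. */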
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
                             const uint8_t *buf, int buf_size)
{
    z_stream strm1, *strm = &strm1;
    int ret, out_len;

    memset(strm, 0, sizeof(*strm));

    strm->next_in = (uint8_t *)buf;
    strm->avail_in = buf_size;
    strm->next_out = out_buf;
    strm->avail_out = out_buf_size;

    ret = inflateInit2(strm, -12);
    if (ret != Z_OK)
        return -1;
    ret = inflate(strm, Z_FINISH);
    out_len = strm->next_out - out_buf;
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
        out_len != out_buf_size) {
        inflateEnd(strm);
        return -1;
    }
    inflateEnd(strm);
    return 0;
}

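/* Read and decompress the compressed cluster described by an L2 entry into
 * s->cluster_cache.  The result is cached, keyed on the host offset, so
 * repeated reads from the same cluster avoid re-inflating it. */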
static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
{
    BDRVQcowState *s = bs->opaque;
    int ret, csize;
    uint64_t coffset;

    coffset = cluster_offset & s->cluster_offset_mask;
    if (s->cluster_cache_offset != coffset) {
        csize = cluster_offset >> (63 - s->cluster_bits);
        csize &= (s->cluster_size - 1);
        ret = bdrv_pread(bs->file->bs, coffset, s->cluster_data, csize);
        if (ret != csize)
            return -1;
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
                              s->cluster_data, csize) < 0) {
            return -1;
        }
        s->cluster_cache_offset = coffset;
    }
    return 0;
}

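/* Read guest sectors, one cluster at a time: unallocated clusters come from
 * the backing file (or read as zeroes), compressed clusters go through the
 * decompression cache, and everything else is read from the image file and
 * decrypted if necessary.  Multi-element iovecs are bounced through a
 * linear buffer. */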
static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
                         int nb_sectors, QEMUIOVector *qiov)
{
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster;
    int ret = 0, n;
    uint64_t cluster_offset;
    struct iovec hd_iov;
    QEMUIOVector hd_qiov;
    uint8_t *buf;
    void *orig_buf;
    Error *err = NULL;

    if (qiov->niov > 1) {
        buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
        if (buf == NULL) {
            return -ENOMEM;
        }
    } else {
        orig_buf = NULL;
        buf = (uint8_t *)qiov->iov->iov_base;
    }

    qemu_co_mutex_lock(&s->lock);

    while (nb_sectors != 0) {
        /* prepare next request */
        cluster_offset = get_cluster_offset(bs, sector_num << 9,
                                                 0, 0, 0, 0);
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        n = s->cluster_sectors - index_in_cluster;
        if (n > nb_sectors) {
            n = nb_sectors;
        }

        if (!cluster_offset) {
            if (bs->backing) {
                /* read from the base image */
                hd_iov.iov_base = (void *)buf;
                hd_iov.iov_len = n * 512;
                qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
                qemu_co_mutex_unlock(&s->lock);
                ret = bdrv_co_readv(bs->backing->bs, sector_num,
                                    n, &hd_qiov);
                qemu_co_mutex_lock(&s->lock);
                if (ret < 0) {
                    goto fail;
                }
            } else {
                /* Note: in this case, no need to wait */
                memset(buf, 0, 512 * n);
            }
        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
            /* add AIO support for compressed blocks ? */
            if (decompress_cluster(bs, cluster_offset) < 0) {
                goto fail;
            }
            memcpy(buf,
                   s->cluster_cache + index_in_cluster * 512, 512 * n);
        } else {
            if ((cluster_offset & 511) != 0) {
                goto fail;
            }
            hd_iov.iov_base = (void *)buf;
            hd_iov.iov_len = n * 512;
            qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
            qemu_co_mutex_unlock(&s->lock);
            ret = bdrv_co_readv(bs->file->bs,
                                (cluster_offset >> 9) + index_in_cluster,
                                n, &hd_qiov);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                break;
            }
            if (bs->encrypted) {
                assert(s->cipher);
                if (encrypt_sectors(s, sector_num, buf, buf,
                                    n, false, &err) < 0) {
                    goto fail;
                }
            }
        }
        ret = 0;

        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }

done:
    qemu_co_mutex_unlock(&s->lock);

    if (qiov->niov > 1) {
        qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size);
        qemu_vfree(orig_buf);
    }

    return ret;

fail:
    error_free(err);
    ret = -EIO;
    goto done;
}

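/* Write guest sectors, one cluster at a time: clusters are allocated (and
 * partially overwritten encrypted clusters pre-initialised) through
 * get_cluster_offset(), data is encrypted into a bounce buffer when the
 * image is encrypted, and then written to the image file. */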
static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors, QEMUIOVector *qiov)
{
    BDRVQcowState *s = bs->opaque;
    int index_in_cluster;
    uint64_t cluster_offset;
    const uint8_t *src_buf;
    int ret = 0, n;
    uint8_t *cluster_data = NULL;
    struct iovec hd_iov;
    QEMUIOVector hd_qiov;
    uint8_t *buf;
    void *orig_buf;

    s->cluster_cache_offset = -1; /* disable compressed cache */

    if (qiov->niov > 1) {
        buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
        if (buf == NULL) {
            return -ENOMEM;
        }
        qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
    } else {
        orig_buf = NULL;
        buf = (uint8_t *)qiov->iov->iov_base;
    }

    qemu_co_mutex_lock(&s->lock);

    while (nb_sectors != 0) {

        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        n = s->cluster_sectors - index_in_cluster;
        if (n > nb_sectors) {
            n = nb_sectors;
        }
        cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0,
                                            index_in_cluster,
                                            index_in_cluster + n);
        if (!cluster_offset || (cluster_offset & 511) != 0) {
            ret = -EIO;
            break;
        }
        if (bs->encrypted) {
            Error *err = NULL;
            assert(s->cipher);
            if (!cluster_data) {
                cluster_data = g_malloc0(s->cluster_size);
            }
            if (encrypt_sectors(s, sector_num, cluster_data, buf,
                                n, true, &err) < 0) {
                error_free(err);
                ret = -EIO;
                break;
            }
            src_buf = cluster_data;
        } else {
            src_buf = buf;
        }

        hd_iov.iov_base = (void *)src_buf;
        hd_iov.iov_len = n * 512;
        qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
        qemu_co_mutex_unlock(&s->lock);
        ret = bdrv_co_writev(bs->file->bs,
                             (cluster_offset >> 9) + index_in_cluster,
                             n, &hd_qiov);
        qemu_co_mutex_lock(&s->lock);
        if (ret < 0) {
            break;
        }
        ret = 0;

        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }
    qemu_co_mutex_unlock(&s->lock);

    if (qiov->niov > 1) {
        qemu_vfree(orig_buf);
    }
    g_free(cluster_data);

    return ret;
}

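/* Release everything qcow_open() set up: cipher, L1 table, L2 cache,
 * per-cluster buffers and the migration blocker. */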
static void qcow_close(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;

    qcrypto_cipher_free(s->cipher);
    s->cipher = NULL;
    g_free(s->l1_table);
    qemu_vfree(s->l2_cache);
    g_free(s->cluster_cache);
    g_free(s->cluster_data);

    migrate_del_blocker(s->migration_blocker);
    error_free(s->migration_blocker);
}

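/* Create a new qcow image: write the header (plus the optional backing file
 * name) followed by an all-zero L1 table.  Cluster and L2 table sizes are
 * fixed at 512 bytes/32 KB when a backing file is used and 4 KB/4 KB
 * otherwise. */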
static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
{
    int header_size, backing_filename_len, l1_size, shift, i;
    QCowHeader header;
    uint8_t *tmp;
    int64_t total_size = 0;
    char *backing_file = NULL;
    int flags = 0;
    Error *local_err = NULL;
    int ret;
    BlockBackend *qcow_blk;

    /* Read out options */
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);
    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) {
        flags |= BLOCK_FLAG_ENCRYPT;
    }

    ret = bdrv_create_file(filename, opts, &local_err);
    if (ret < 0) {
        error_propagate(errp, local_err);
        goto cleanup;
    }

    qcow_blk = blk_new_open(filename, NULL, NULL,
                            BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
    if (qcow_blk == NULL) {
        error_propagate(errp, local_err);
        ret = -EIO;
        goto cleanup;
    }

    blk_set_allow_write_beyond_eof(qcow_blk, true);

    ret = blk_truncate(qcow_blk, 0);
    if (ret < 0) {
        goto exit;
    }

    memset(&header, 0, sizeof(header));
    header.magic = cpu_to_be32(QCOW_MAGIC);
    header.version = cpu_to_be32(QCOW_VERSION);
    header.size = cpu_to_be64(total_size);
    header_size = sizeof(header);
    backing_filename_len = 0;
    if (backing_file) {
        if (strcmp(backing_file, "fat:")) {
            header.backing_file_offset = cpu_to_be64(header_size);
            backing_filename_len = strlen(backing_file);
            header.backing_file_size = cpu_to_be32(backing_filename_len);
            header_size += backing_filename_len;
        } else {
            /* special backing file for vvfat */
            backing_file = NULL;
        }
        header.cluster_bits = 9; /* 512 byte cluster to avoid copying
                                    unmodified sectors */
        header.l2_bits = 12; /* 32 KB L2 tables */
    } else {
        header.cluster_bits = 12; /* 4 KB clusters */
        header.l2_bits = 9; /* 4 KB L2 tables */
    }
    header_size = (header_size + 7) & ~7;
    shift = header.cluster_bits + header.l2_bits;
    l1_size = (total_size + (1LL << shift) - 1) >> shift;

    header.l1_table_offset = cpu_to_be64(header_size);
    if (flags & BLOCK_FLAG_ENCRYPT) {
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
    } else {
        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
    }

    /* write all the data */
    ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header));
    if (ret != sizeof(header)) {
        goto exit;
    }

    if (backing_file) {
        ret = blk_pwrite(qcow_blk, sizeof(header),
            backing_file, backing_filename_len);
        if (ret != backing_filename_len) {
            goto exit;
        }
    }

    tmp = g_malloc0(BDRV_SECTOR_SIZE);
    for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/
        BDRV_SECTOR_SIZE); i++) {
        ret = blk_pwrite(qcow_blk, header_size +
            BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE);
        if (ret != BDRV_SECTOR_SIZE) {
            g_free(tmp);
            goto exit;
        }
    }

    g_free(tmp);
    ret = 0;
exit:
    blk_unref(qcow_blk);
cleanup:
    g_free(backing_file);
    return ret;
}

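/* Discard all guest data by zeroing the L1 table on disk, truncating the
 * image right after it, and invalidating the in-memory L2 cache. */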
static int qcow_make_empty(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
    int ret;

    memset(s->l1_table, 0, l1_length);
    if (bdrv_pwrite_sync(bs->file->bs, s->l1_table_offset, s->l1_table,
            l1_length) < 0)
        return -1;
    ret = bdrv_truncate(bs->file->bs, s->l1_table_offset + l1_length);
    if (ret < 0)
        return ret;

    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));

    return 0;
}

/* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
                                 const uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    z_stream strm;
    int ret, out_len;
    uint8_t *out_buf;
    uint64_t cluster_offset;

    if (nb_sectors != s->cluster_sectors) {
        ret = -EINVAL;

        /* Zero-pad last write if image size is not cluster aligned */
        if (sector_num + nb_sectors == bs->total_sectors &&
            nb_sectors < s->cluster_sectors) {
            uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
            memset(pad_buf, 0, s->cluster_size);
            memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
            ret = qcow_write_compressed(bs, sector_num,
                                        pad_buf, s->cluster_sectors);
            qemu_vfree(pad_buf);
        }
        return ret;
    }

    out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);

    /* default compression level, small window, no zlib header */
    memset(&strm, 0, sizeof(strm));
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
                       Z_DEFLATED, -12,
                       9, Z_DEFAULT_STRATEGY);
    if (ret != 0) {
        ret = -EINVAL;
        goto fail;
    }

    strm.avail_in = s->cluster_size;
    strm.next_in = (uint8_t *)buf;
    strm.avail_out = s->cluster_size;
    strm.next_out = out_buf;

    ret = deflate(&strm, Z_FINISH);
    if (ret != Z_STREAM_END && ret != Z_OK) {
        deflateEnd(&strm);
        ret = -EINVAL;
        goto fail;
    }
    out_len = strm.next_out - out_buf;

    deflateEnd(&strm);

    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
        /* could not compress: write normal cluster */
        ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
        if (ret < 0) {
            goto fail;
        }
    } else {
        cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
                                            out_len, 0, 0);
        if (cluster_offset == 0) {
            ret = -EIO;
            goto fail;
        }

        cluster_offset &= s->cluster_offset_mask;
        ret = bdrv_pwrite(bs->file->bs, cluster_offset, out_buf, out_len);
        if (ret < 0) {
            goto fail;
        }
    }

    ret = 0;
fail:
    g_free(out_buf);
    return ret;
}

static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BDRVQcowState *s = bs->opaque;
    bdi->cluster_size = s->cluster_size;
    return 0;
}

static QemuOptsList qcow_create_opts = {
    .name = "qcow-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qcow_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_BACKING_FILE,
            .type = QEMU_OPT_STRING,
            .help = "File name of a base image"
        },
        {
            .name = BLOCK_OPT_ENCRYPT,
            .type = QEMU_OPT_BOOL,
            .help = "Encrypt the image",
            .def_value_str = "off"
        },
        { /* end of list */ }
    }
};

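/* Method table for the qcow format driver; registered with the block layer
 * by bdrv_qcow_init() below. */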
static BlockDriver bdrv_qcow = {
    .format_name        = "qcow",
    .instance_size      = sizeof(BDRVQcowState),
    .bdrv_probe         = qcow_probe,
    .bdrv_open          = qcow_open,
    .bdrv_close         = qcow_close,
    .bdrv_reopen_prepare    = qcow_reopen_prepare,
    .bdrv_create            = qcow_create,
    .bdrv_has_zero_init     = bdrv_has_zero_init_1,
    .supports_backing       = true,

    .bdrv_co_readv          = qcow_co_readv,
    .bdrv_co_writev         = qcow_co_writev,
    .bdrv_co_get_block_status   = qcow_co_get_block_status,

    .bdrv_set_key           = qcow_set_key,
    .bdrv_make_empty        = qcow_make_empty,
    .bdrv_write_compressed  = qcow_write_compressed,
    .bdrv_get_info          = qcow_get_info,

    .create_opts            = &qcow_create_opts,
};

static void bdrv_qcow_init(void)
{
    bdrv_register(&bdrv_qcow);
}

block_init(bdrv_qcow_init);