qemu/block/qcow.c
<<
>>
Prefs
   1/*
   2 * Block driver for the QCOW format
   3 *
   4 * Copyright (c) 2004-2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
#include "qemu-common.h"
#include "block/block_int.h"
#include "qemu/module.h"
#include <limits.h>
#include <zlib.h>
#include "block/aes.h"
#include "migration/migration.h"
  30
  31/**************************************************************/
  32/* QEMU COW block driver with compression and encryption support */
  33
  34#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
  35#define QCOW_VERSION 1
  36
  37#define QCOW_CRYPT_NONE 0
  38#define QCOW_CRYPT_AES  1
  39
  40#define QCOW_OFLAG_COMPRESSED (1LL << 63)
  41
/* Image header as read from / written to offset 0 of the image file.
 * Multi-byte fields are big-endian on disk; qcow_open() converts them to
 * host byte order after reading. */
typedef struct QCowHeader {
    uint32_t magic;               /* must equal QCOW_MAGIC */
    uint32_t version;             /* must equal QCOW_VERSION */
    uint64_t backing_file_offset; /* file offset of the backing file name,
                                     0 if there is none */
    uint32_t backing_file_size;   /* length in bytes of the backing file name */
    uint32_t mtime;
    uint64_t size; /* in bytes */
    uint8_t cluster_bits;         /* cluster size is 1 << cluster_bits bytes */
    uint8_t l2_bits;              /* an L2 table has 1 << l2_bits entries */
    uint32_t crypt_method;        /* QCOW_CRYPT_NONE or QCOW_CRYPT_AES */
    uint64_t l1_table_offset;     /* file offset of the L1 table */
} QCowHeader;
  54
/* Number of L2 tables kept in the in-memory L2 cache. */
#define L2_CACHE_SIZE 16

/* Per-image driver state (bs->opaque). */
typedef struct BDRVQcowState {
    int cluster_bits;      /* copied from the header */
    int cluster_size;      /* 1 << cluster_bits, in bytes */
    int cluster_sectors;   /* cluster size in 512-byte sectors */
    int l2_bits;           /* copied from the header */
    int l2_size;           /* 1 << l2_bits, entries per L2 table */
    int l1_size;           /* number of entries in the L1 table */
    /* low (63 - cluster_bits) bits of an L2 entry; used to extract the file
     * offset of a compressed cluster */
    uint64_t cluster_offset_mask;
    uint64_t l1_table_offset; /* file offset of the L1 table */
    uint64_t *l1_table;    /* in-memory L1 table, host byte order */
    /* L2_CACHE_SIZE tables of l2_size entries each; entries are kept in
     * on-disk (big-endian) byte order */
    uint64_t *l2_cache;
    /* file offset of the L2 table held in each cache slot */
    uint64_t l2_cache_offsets[L2_CACHE_SIZE];
    /* hit counter per slot; the slot with the lowest count is evicted */
    uint32_t l2_cache_counts[L2_CACHE_SIZE];
    uint8_t *cluster_cache; /* one cluster of decompressed data */
    uint8_t *cluster_data;  /* one cluster of scratch space */
    /* file offset of the compressed cluster currently held in cluster_cache,
     * or -1 when the cache is invalid */
    uint64_t cluster_cache_offset;
    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
    uint32_t crypt_method_header; /* crypt method announced by the header */
    AES_KEY aes_encrypt_key;
    AES_KEY aes_decrypt_key;
    CoMutex lock;          /* taken by the coroutine read/write paths */
    Error *migration_blocker; /* registered in qcow_open(), removed in
                                 qcow_close() */
} BDRVQcowState;
  80
  81static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
  82
  83static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
  84{
  85    const QCowHeader *cow_header = (const void *)buf;
  86
  87    if (buf_size >= sizeof(QCowHeader) &&
  88        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
  89        be32_to_cpu(cow_header->version) == QCOW_VERSION)
  90        return 100;
  91    else
  92        return 0;
  93}
  94
  95static int qcow_open(BlockDriverState *bs, int flags)
  96{
  97    BDRVQcowState *s = bs->opaque;
  98    int len, i, shift, ret;
  99    QCowHeader header;
 100
 101    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
 102    if (ret < 0) {
 103        goto fail;
 104    }
 105    be32_to_cpus(&header.magic);
 106    be32_to_cpus(&header.version);
 107    be64_to_cpus(&header.backing_file_offset);
 108    be32_to_cpus(&header.backing_file_size);
 109    be32_to_cpus(&header.mtime);
 110    be64_to_cpus(&header.size);
 111    be32_to_cpus(&header.crypt_method);
 112    be64_to_cpus(&header.l1_table_offset);
 113
 114    if (header.magic != QCOW_MAGIC) {
 115        ret = -EMEDIUMTYPE;
 116        goto fail;
 117    }
 118    if (header.version != QCOW_VERSION) {
 119        char version[64];
 120        snprintf(version, sizeof(version), "QCOW version %d", header.version);
 121        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
 122            bs->device_name, "qcow", version);
 123        ret = -ENOTSUP;
 124        goto fail;
 125    }
 126
 127    if (header.size <= 1 || header.cluster_bits < 9) {
 128        ret = -EINVAL;
 129        goto fail;
 130    }
 131    if (header.crypt_method > QCOW_CRYPT_AES) {
 132        ret = -EINVAL;
 133        goto fail;
 134    }
 135    s->crypt_method_header = header.crypt_method;
 136    if (s->crypt_method_header) {
 137        bs->encrypted = 1;
 138    }
 139    s->cluster_bits = header.cluster_bits;
 140    s->cluster_size = 1 << s->cluster_bits;
 141    s->cluster_sectors = 1 << (s->cluster_bits - 9);
 142    s->l2_bits = header.l2_bits;
 143    s->l2_size = 1 << s->l2_bits;
 144    bs->total_sectors = header.size / 512;
 145    s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
 146
 147    /* read the level 1 table */
 148    shift = s->cluster_bits + s->l2_bits;
 149    s->l1_size = (header.size + (1LL << shift) - 1) >> shift;
 150
 151    s->l1_table_offset = header.l1_table_offset;
 152    s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
 153
 154    ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
 155               s->l1_size * sizeof(uint64_t));
 156    if (ret < 0) {
 157        goto fail;
 158    }
 159
 160    for(i = 0;i < s->l1_size; i++) {
 161        be64_to_cpus(&s->l1_table[i]);
 162    }
 163    /* alloc L2 cache */
 164    s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
 165    s->cluster_cache = g_malloc(s->cluster_size);
 166    s->cluster_data = g_malloc(s->cluster_size);
 167    s->cluster_cache_offset = -1;
 168
 169    /* read the backing file name */
 170    if (header.backing_file_offset != 0) {
 171        len = header.backing_file_size;
 172        if (len > 1023) {
 173            len = 1023;
 174        }
 175        ret = bdrv_pread(bs->file, header.backing_file_offset,
 176                   bs->backing_file, len);
 177        if (ret < 0) {
 178            goto fail;
 179        }
 180        bs->backing_file[len] = '\0';
 181    }
 182
 183    /* Disable migration when qcow images are used */
 184    error_set(&s->migration_blocker,
 185              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
 186              "qcow", bs->device_name, "live migration");
 187    migrate_add_blocker(s->migration_blocker);
 188
 189    qemu_co_mutex_init(&s->lock);
 190    return 0;
 191
 192 fail:
 193    g_free(s->l1_table);
 194    g_free(s->l2_cache);
 195    g_free(s->cluster_cache);
 196    g_free(s->cluster_data);
 197    return ret;
 198}
 199
 200
 201/* We have nothing to do for QCOW reopen, stubs just return
 202 * success */
 203static int qcow_reopen_prepare(BDRVReopenState *state,
 204                               BlockReopenQueue *queue, Error **errp)
 205{
 206    return 0;
 207}
 208
 209static int qcow_set_key(BlockDriverState *bs, const char *key)
 210{
 211    BDRVQcowState *s = bs->opaque;
 212    uint8_t keybuf[16];
 213    int len, i;
 214
 215    memset(keybuf, 0, 16);
 216    len = strlen(key);
 217    if (len > 16)
 218        len = 16;
 219    /* XXX: we could compress the chars to 7 bits to increase
 220       entropy */
 221    for(i = 0;i < len;i++) {
 222        keybuf[i] = key[i];
 223    }
 224    s->crypt_method = s->crypt_method_header;
 225
 226    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
 227        return -1;
 228    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
 229        return -1;
 230    return 0;
 231}
 232
 233/* The crypt function is compatible with the linux cryptoloop
 234   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
 235   supported */
 236static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
 237                            uint8_t *out_buf, const uint8_t *in_buf,
 238                            int nb_sectors, int enc,
 239                            const AES_KEY *key)
 240{
 241    union {
 242        uint64_t ll[2];
 243        uint8_t b[16];
 244    } ivec;
 245    int i;
 246
 247    for(i = 0; i < nb_sectors; i++) {
 248        ivec.ll[0] = cpu_to_le64(sector_num);
 249        ivec.ll[1] = 0;
 250        AES_cbc_encrypt(in_buf, out_buf, 512, key,
 251                        ivec.b, enc);
 252        sector_num++;
 253        in_buf += 512;
 254        out_buf += 512;
 255    }
 256}
 257
 258/* 'allocate' is:
 259 *
 260 * 0 to not allocate.
 261 *
 262 * 1 to allocate a normal cluster (for sector indexes 'n_start' to
 263 * 'n_end')
 264 *
 265 * 2 to allocate a compressed cluster of size
 266 * 'compressed_size'. 'compressed_size' must be > 0 and <
 267 * cluster_size
 268 *
 269 * return 0 if not allocated.
 270 */
 271static uint64_t get_cluster_offset(BlockDriverState *bs,
 272                                   uint64_t offset, int allocate,
 273                                   int compressed_size,
 274                                   int n_start, int n_end)
 275{
 276    BDRVQcowState *s = bs->opaque;
 277    int min_index, i, j, l1_index, l2_index;
 278    uint64_t l2_offset, *l2_table, cluster_offset, tmp;
 279    uint32_t min_count;
 280    int new_l2_table;
 281
 282    l1_index = offset >> (s->l2_bits + s->cluster_bits);
 283    l2_offset = s->l1_table[l1_index];
 284    new_l2_table = 0;
 285    if (!l2_offset) {
 286        if (!allocate)
 287            return 0;
 288        /* allocate a new l2 entry */
 289        l2_offset = bdrv_getlength(bs->file);
 290        /* round to cluster size */
 291        l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1);
 292        /* update the L1 entry */
 293        s->l1_table[l1_index] = l2_offset;
 294        tmp = cpu_to_be64(l2_offset);
 295        if (bdrv_pwrite_sync(bs->file,
 296                s->l1_table_offset + l1_index * sizeof(tmp),
 297                &tmp, sizeof(tmp)) < 0)
 298            return 0;
 299        new_l2_table = 1;
 300    }
 301    for(i = 0; i < L2_CACHE_SIZE; i++) {
 302        if (l2_offset == s->l2_cache_offsets[i]) {
 303            /* increment the hit count */
 304            if (++s->l2_cache_counts[i] == 0xffffffff) {
 305                for(j = 0; j < L2_CACHE_SIZE; j++) {
 306                    s->l2_cache_counts[j] >>= 1;
 307                }
 308            }
 309            l2_table = s->l2_cache + (i << s->l2_bits);
 310            goto found;
 311        }
 312    }
 313    /* not found: load a new entry in the least used one */
 314    min_index = 0;
 315    min_count = 0xffffffff;
 316    for(i = 0; i < L2_CACHE_SIZE; i++) {
 317        if (s->l2_cache_counts[i] < min_count) {
 318            min_count = s->l2_cache_counts[i];
 319            min_index = i;
 320        }
 321    }
 322    l2_table = s->l2_cache + (min_index << s->l2_bits);
 323    if (new_l2_table) {
 324        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
 325        if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table,
 326                s->l2_size * sizeof(uint64_t)) < 0)
 327            return 0;
 328    } else {
 329        if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
 330            s->l2_size * sizeof(uint64_t))
 331            return 0;
 332    }
 333    s->l2_cache_offsets[min_index] = l2_offset;
 334    s->l2_cache_counts[min_index] = 1;
 335 found:
 336    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
 337    cluster_offset = be64_to_cpu(l2_table[l2_index]);
 338    if (!cluster_offset ||
 339        ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
 340        if (!allocate)
 341            return 0;
 342        /* allocate a new cluster */
 343        if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
 344            (n_end - n_start) < s->cluster_sectors) {
 345            /* if the cluster is already compressed, we must
 346               decompress it in the case it is not completely
 347               overwritten */
 348            if (decompress_cluster(bs, cluster_offset) < 0)
 349                return 0;
 350            cluster_offset = bdrv_getlength(bs->file);
 351            cluster_offset = (cluster_offset + s->cluster_size - 1) &
 352                ~(s->cluster_size - 1);
 353            /* write the cluster content */
 354            if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) !=
 355                s->cluster_size)
 356                return -1;
 357        } else {
 358            cluster_offset = bdrv_getlength(bs->file);
 359            if (allocate == 1) {
 360                /* round to cluster size */
 361                cluster_offset = (cluster_offset + s->cluster_size - 1) &
 362                    ~(s->cluster_size - 1);
 363                bdrv_truncate(bs->file, cluster_offset + s->cluster_size);
 364                /* if encrypted, we must initialize the cluster
 365                   content which won't be written */
 366                if (s->crypt_method &&
 367                    (n_end - n_start) < s->cluster_sectors) {
 368                    uint64_t start_sect;
 369                    start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
 370                    memset(s->cluster_data + 512, 0x00, 512);
 371                    for(i = 0; i < s->cluster_sectors; i++) {
 372                        if (i < n_start || i >= n_end) {
 373                            encrypt_sectors(s, start_sect + i,
 374                                            s->cluster_data,
 375                                            s->cluster_data + 512, 1, 1,
 376                                            &s->aes_encrypt_key);
 377                            if (bdrv_pwrite(bs->file, cluster_offset + i * 512,
 378                                            s->cluster_data, 512) != 512)
 379                                return -1;
 380                        }
 381                    }
 382                }
 383            } else if (allocate == 2) {
 384                cluster_offset |= QCOW_OFLAG_COMPRESSED |
 385                    (uint64_t)compressed_size << (63 - s->cluster_bits);
 386            }
 387        }
 388        /* update L2 table */
 389        tmp = cpu_to_be64(cluster_offset);
 390        l2_table[l2_index] = tmp;
 391        if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp),
 392                &tmp, sizeof(tmp)) < 0)
 393            return 0;
 394    }
 395    return cluster_offset;
 396}
 397
 398static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs,
 399        int64_t sector_num, int nb_sectors, int *pnum)
 400{
 401    BDRVQcowState *s = bs->opaque;
 402    int index_in_cluster, n;
 403    uint64_t cluster_offset;
 404
 405    qemu_co_mutex_lock(&s->lock);
 406    cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
 407    qemu_co_mutex_unlock(&s->lock);
 408    index_in_cluster = sector_num & (s->cluster_sectors - 1);
 409    n = s->cluster_sectors - index_in_cluster;
 410    if (n > nb_sectors)
 411        n = nb_sectors;
 412    *pnum = n;
 413    return (cluster_offset != 0);
 414}
 415
 416static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
 417                             const uint8_t *buf, int buf_size)
 418{
 419    z_stream strm1, *strm = &strm1;
 420    int ret, out_len;
 421
 422    memset(strm, 0, sizeof(*strm));
 423
 424    strm->next_in = (uint8_t *)buf;
 425    strm->avail_in = buf_size;
 426    strm->next_out = out_buf;
 427    strm->avail_out = out_buf_size;
 428
 429    ret = inflateInit2(strm, -12);
 430    if (ret != Z_OK)
 431        return -1;
 432    ret = inflate(strm, Z_FINISH);
 433    out_len = strm->next_out - out_buf;
 434    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
 435        out_len != out_buf_size) {
 436        inflateEnd(strm);
 437        return -1;
 438    }
 439    inflateEnd(strm);
 440    return 0;
 441}
 442
 443static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
 444{
 445    BDRVQcowState *s = bs->opaque;
 446    int ret, csize;
 447    uint64_t coffset;
 448
 449    coffset = cluster_offset & s->cluster_offset_mask;
 450    if (s->cluster_cache_offset != coffset) {
 451        csize = cluster_offset >> (63 - s->cluster_bits);
 452        csize &= (s->cluster_size - 1);
 453        ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize);
 454        if (ret != csize)
 455            return -1;
 456        if (decompress_buffer(s->cluster_cache, s->cluster_size,
 457                              s->cluster_data, csize) < 0) {
 458            return -1;
 459        }
 460        s->cluster_cache_offset = coffset;
 461    }
 462    return 0;
 463}
 464
 465static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
 466                         int nb_sectors, QEMUIOVector *qiov)
 467{
 468    BDRVQcowState *s = bs->opaque;
 469    int index_in_cluster;
 470    int ret = 0, n;
 471    uint64_t cluster_offset;
 472    struct iovec hd_iov;
 473    QEMUIOVector hd_qiov;
 474    uint8_t *buf;
 475    void *orig_buf;
 476
 477    if (qiov->niov > 1) {
 478        buf = orig_buf = qemu_blockalign(bs, qiov->size);
 479    } else {
 480        orig_buf = NULL;
 481        buf = (uint8_t *)qiov->iov->iov_base;
 482    }
 483
 484    qemu_co_mutex_lock(&s->lock);
 485
 486    while (nb_sectors != 0) {
 487        /* prepare next request */
 488        cluster_offset = get_cluster_offset(bs, sector_num << 9,
 489                                                 0, 0, 0, 0);
 490        index_in_cluster = sector_num & (s->cluster_sectors - 1);
 491        n = s->cluster_sectors - index_in_cluster;
 492        if (n > nb_sectors) {
 493            n = nb_sectors;
 494        }
 495
 496        if (!cluster_offset) {
 497            if (bs->backing_hd) {
 498                /* read from the base image */
 499                hd_iov.iov_base = (void *)buf;
 500                hd_iov.iov_len = n * 512;
 501                qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
 502                qemu_co_mutex_unlock(&s->lock);
 503                ret = bdrv_co_readv(bs->backing_hd, sector_num,
 504                                    n, &hd_qiov);
 505                qemu_co_mutex_lock(&s->lock);
 506                if (ret < 0) {
 507                    goto fail;
 508                }
 509            } else {
 510                /* Note: in this case, no need to wait */
 511                memset(buf, 0, 512 * n);
 512            }
 513        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
 514            /* add AIO support for compressed blocks ? */
 515            if (decompress_cluster(bs, cluster_offset) < 0) {
 516                goto fail;
 517            }
 518            memcpy(buf,
 519                   s->cluster_cache + index_in_cluster * 512, 512 * n);
 520        } else {
 521            if ((cluster_offset & 511) != 0) {
 522                goto fail;
 523            }
 524            hd_iov.iov_base = (void *)buf;
 525            hd_iov.iov_len = n * 512;
 526            qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
 527            qemu_co_mutex_unlock(&s->lock);
 528            ret = bdrv_co_readv(bs->file,
 529                                (cluster_offset >> 9) + index_in_cluster,
 530                                n, &hd_qiov);
 531            qemu_co_mutex_lock(&s->lock);
 532            if (ret < 0) {
 533                break;
 534            }
 535            if (s->crypt_method) {
 536                encrypt_sectors(s, sector_num, buf, buf,
 537                                n, 0,
 538                                &s->aes_decrypt_key);
 539            }
 540        }
 541        ret = 0;
 542
 543        nb_sectors -= n;
 544        sector_num += n;
 545        buf += n * 512;
 546    }
 547
 548done:
 549    qemu_co_mutex_unlock(&s->lock);
 550
 551    if (qiov->niov > 1) {
 552        qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size);
 553        qemu_vfree(orig_buf);
 554    }
 555
 556    return ret;
 557
 558fail:
 559    ret = -EIO;
 560    goto done;
 561}
 562
 563static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
 564                          int nb_sectors, QEMUIOVector *qiov)
 565{
 566    BDRVQcowState *s = bs->opaque;
 567    int index_in_cluster;
 568    uint64_t cluster_offset;
 569    const uint8_t *src_buf;
 570    int ret = 0, n;
 571    uint8_t *cluster_data = NULL;
 572    struct iovec hd_iov;
 573    QEMUIOVector hd_qiov;
 574    uint8_t *buf;
 575    void *orig_buf;
 576
 577    s->cluster_cache_offset = -1; /* disable compressed cache */
 578
 579    if (qiov->niov > 1) {
 580        buf = orig_buf = qemu_blockalign(bs, qiov->size);
 581        qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
 582    } else {
 583        orig_buf = NULL;
 584        buf = (uint8_t *)qiov->iov->iov_base;
 585    }
 586
 587    qemu_co_mutex_lock(&s->lock);
 588
 589    while (nb_sectors != 0) {
 590
 591        index_in_cluster = sector_num & (s->cluster_sectors - 1);
 592        n = s->cluster_sectors - index_in_cluster;
 593        if (n > nb_sectors) {
 594            n = nb_sectors;
 595        }
 596        cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0,
 597                                            index_in_cluster,
 598                                            index_in_cluster + n);
 599        if (!cluster_offset || (cluster_offset & 511) != 0) {
 600            ret = -EIO;
 601            break;
 602        }
 603        if (s->crypt_method) {
 604            if (!cluster_data) {
 605                cluster_data = g_malloc0(s->cluster_size);
 606            }
 607            encrypt_sectors(s, sector_num, cluster_data, buf,
 608                            n, 1, &s->aes_encrypt_key);
 609            src_buf = cluster_data;
 610        } else {
 611            src_buf = buf;
 612        }
 613
 614        hd_iov.iov_base = (void *)src_buf;
 615        hd_iov.iov_len = n * 512;
 616        qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
 617        qemu_co_mutex_unlock(&s->lock);
 618        ret = bdrv_co_writev(bs->file,
 619                             (cluster_offset >> 9) + index_in_cluster,
 620                             n, &hd_qiov);
 621        qemu_co_mutex_lock(&s->lock);
 622        if (ret < 0) {
 623            break;
 624        }
 625        ret = 0;
 626
 627        nb_sectors -= n;
 628        sector_num += n;
 629        buf += n * 512;
 630    }
 631    qemu_co_mutex_unlock(&s->lock);
 632
 633    if (qiov->niov > 1) {
 634        qemu_vfree(orig_buf);
 635    }
 636    g_free(cluster_data);
 637
 638    return ret;
 639}
 640
 641static void qcow_close(BlockDriverState *bs)
 642{
 643    BDRVQcowState *s = bs->opaque;
 644
 645    g_free(s->l1_table);
 646    g_free(s->l2_cache);
 647    g_free(s->cluster_cache);
 648    g_free(s->cluster_data);
 649
 650    migrate_del_blocker(s->migration_blocker);
 651    error_free(s->migration_blocker);
 652}
 653
 654static int qcow_create(const char *filename, QEMUOptionParameter *options)
 655{
 656    int header_size, backing_filename_len, l1_size, shift, i;
 657    QCowHeader header;
 658    uint8_t *tmp;
 659    int64_t total_size = 0;
 660    const char *backing_file = NULL;
 661    int flags = 0;
 662    int ret;
 663    BlockDriverState *qcow_bs;
 664
 665    /* Read out options */
 666    while (options && options->name) {
 667        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
 668            total_size = options->value.n / 512;
 669        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
 670            backing_file = options->value.s;
 671        } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
 672            flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
 673        }
 674        options++;
 675    }
 676
 677    ret = bdrv_create_file(filename, options);
 678    if (ret < 0) {
 679        return ret;
 680    }
 681
 682    ret = bdrv_file_open(&qcow_bs, filename, BDRV_O_RDWR);
 683    if (ret < 0) {
 684        return ret;
 685    }
 686
 687    ret = bdrv_truncate(qcow_bs, 0);
 688    if (ret < 0) {
 689        goto exit;
 690    }
 691
 692    memset(&header, 0, sizeof(header));
 693    header.magic = cpu_to_be32(QCOW_MAGIC);
 694    header.version = cpu_to_be32(QCOW_VERSION);
 695    header.size = cpu_to_be64(total_size * 512);
 696    header_size = sizeof(header);
 697    backing_filename_len = 0;
 698    if (backing_file) {
 699        if (strcmp(backing_file, "fat:")) {
 700            header.backing_file_offset = cpu_to_be64(header_size);
 701            backing_filename_len = strlen(backing_file);
 702            header.backing_file_size = cpu_to_be32(backing_filename_len);
 703            header_size += backing_filename_len;
 704        } else {
 705            /* special backing file for vvfat */
 706            backing_file = NULL;
 707        }
 708        header.cluster_bits = 9; /* 512 byte cluster to avoid copying
 709                                    unmodifyed sectors */
 710        header.l2_bits = 12; /* 32 KB L2 tables */
 711    } else {
 712        header.cluster_bits = 12; /* 4 KB clusters */
 713        header.l2_bits = 9; /* 4 KB L2 tables */
 714    }
 715    header_size = (header_size + 7) & ~7;
 716    shift = header.cluster_bits + header.l2_bits;
 717    l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift;
 718
 719    header.l1_table_offset = cpu_to_be64(header_size);
 720    if (flags & BLOCK_FLAG_ENCRYPT) {
 721        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
 722    } else {
 723        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
 724    }
 725
 726    /* write all the data */
 727    ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header));
 728    if (ret != sizeof(header)) {
 729        goto exit;
 730    }
 731
 732    if (backing_file) {
 733        ret = bdrv_pwrite(qcow_bs, sizeof(header),
 734            backing_file, backing_filename_len);
 735        if (ret != backing_filename_len) {
 736            goto exit;
 737        }
 738    }
 739
 740    tmp = g_malloc0(BDRV_SECTOR_SIZE);
 741    for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/
 742        BDRV_SECTOR_SIZE); i++) {
 743        ret = bdrv_pwrite(qcow_bs, header_size +
 744            BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE);
 745        if (ret != BDRV_SECTOR_SIZE) {
 746            g_free(tmp);
 747            goto exit;
 748        }
 749    }
 750
 751    g_free(tmp);
 752    ret = 0;
 753exit:
 754    bdrv_delete(qcow_bs);
 755    return ret;
 756}
 757
 758static int qcow_make_empty(BlockDriverState *bs)
 759{
 760    BDRVQcowState *s = bs->opaque;
 761    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
 762    int ret;
 763
 764    memset(s->l1_table, 0, l1_length);
 765    if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table,
 766            l1_length) < 0)
 767        return -1;
 768    ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
 769    if (ret < 0)
 770        return ret;
 771
 772    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
 773    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
 774    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
 775
 776    return 0;
 777}
 778
 779/* XXX: put compressed sectors first, then all the cluster aligned
 780   tables to avoid losing bytes in alignment */
 781static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
 782                                 const uint8_t *buf, int nb_sectors)
 783{
 784    BDRVQcowState *s = bs->opaque;
 785    z_stream strm;
 786    int ret, out_len;
 787    uint8_t *out_buf;
 788    uint64_t cluster_offset;
 789
 790    if (nb_sectors != s->cluster_sectors)
 791        return -EINVAL;
 792
 793    out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
 794
 795    /* best compression, small window, no zlib header */
 796    memset(&strm, 0, sizeof(strm));
 797    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
 798                       Z_DEFLATED, -12,
 799                       9, Z_DEFAULT_STRATEGY);
 800    if (ret != 0) {
 801        ret = -EINVAL;
 802        goto fail;
 803    }
 804
 805    strm.avail_in = s->cluster_size;
 806    strm.next_in = (uint8_t *)buf;
 807    strm.avail_out = s->cluster_size;
 808    strm.next_out = out_buf;
 809
 810    ret = deflate(&strm, Z_FINISH);
 811    if (ret != Z_STREAM_END && ret != Z_OK) {
 812        deflateEnd(&strm);
 813        ret = -EINVAL;
 814        goto fail;
 815    }
 816    out_len = strm.next_out - out_buf;
 817
 818    deflateEnd(&strm);
 819
 820    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
 821        /* could not compress: write normal cluster */
 822        ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
 823        if (ret < 0) {
 824            goto fail;
 825        }
 826    } else {
 827        cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
 828                                            out_len, 0, 0);
 829        if (cluster_offset == 0) {
 830            ret = -EIO;
 831            goto fail;
 832        }
 833
 834        cluster_offset &= s->cluster_offset_mask;
 835        ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
 836        if (ret < 0) {
 837            goto fail;
 838        }
 839    }
 840
 841    ret = 0;
 842fail:
 843    g_free(out_buf);
 844    return ret;
 845}
 846
 847static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 848{
 849    BDRVQcowState *s = bs->opaque;
 850    bdi->cluster_size = s->cluster_size;
 851    return 0;
 852}
 853
 854
/* Creation options understood by qcow_create(); the list ends with an
 * entry whose name is NULL. */
static QEMUOptionParameter qcow_create_options[] = {
    {
        .name = BLOCK_OPT_SIZE,
        .type = OPT_SIZE,
        .help = "Virtual disk size"
    },
    {
        .name = BLOCK_OPT_BACKING_FILE,
        .type = OPT_STRING,
        .help = "File name of a base image"
    },
    {
        .name = BLOCK_OPT_ENCRYPT,
        .type = OPT_FLAG,
        .help = "Encrypt the image"
    },
    { NULL }
};
 873
/* Driver description for the "qcow" format, registered by bdrv_qcow_init(). */
static BlockDriver bdrv_qcow = {
    .format_name        = "qcow",
    .instance_size      = sizeof(BDRVQcowState),
    .bdrv_probe         = qcow_probe,
    .bdrv_open          = qcow_open,
    .bdrv_close         = qcow_close,
    .bdrv_reopen_prepare = qcow_reopen_prepare,
    .bdrv_create        = qcow_create,

    /* coroutine-based I/O entry points */
    .bdrv_co_readv          = qcow_co_readv,
    .bdrv_co_writev         = qcow_co_writev,
    .bdrv_co_is_allocated   = qcow_co_is_allocated,

    .bdrv_set_key           = qcow_set_key,
    .bdrv_make_empty        = qcow_make_empty,
    .bdrv_write_compressed  = qcow_write_compressed,
    .bdrv_get_info          = qcow_get_info,

    .create_options = qcow_create_options,
};
 894
/* Register the qcow driver with the block layer. */
static void bdrv_qcow_init(void)
{
    bdrv_register(&bdrv_qcow);
}

/* Hook bdrv_qcow_init() into the block-driver initialisation via the
 * block_init() macro. */
block_init(bdrv_qcow_init);
 901