qemu/block-qcow.c
<<
>>
Prefs
   1/*
   2 * Block driver for the QCOW format
   3 *
   4 * Copyright (c) 2004-2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24#include "qemu-common.h"
  25#include "block_int.h"
  26#include <zlib.h>
  27#include "aes.h"
  28
  29/**************************************************************/
  30/* QEMU COW block driver with compression and encryption support */
  31
  32#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
  33#define QCOW_VERSION 1
  34
  35#define QCOW_CRYPT_NONE 0
  36#define QCOW_CRYPT_AES  1
  37
  38#define QCOW_OFLAG_COMPRESSED (1LL << 63)
  39
/* Image header as read from offset 0 of the file.  All multi-byte fields are
   stored big-endian; qcow_open() converts them to host order after reading.
   The struct is read and written directly with sizeof(QCowHeader), so the
   field order and types define the on-disk layout. */
typedef struct QCowHeader {
    uint32_t magic;               /* must equal QCOW_MAGIC */
    uint32_t version;             /* must equal QCOW_VERSION */
    uint64_t backing_file_offset; /* file offset of the backing file name,
                                     0 if there is no backing file */
    uint32_t backing_file_size;   /* length of the backing file name in bytes */
    uint32_t mtime;               /* byte-swapped on open, otherwise unused
                                     in this file */
    uint64_t size; /* in bytes */
    uint8_t cluster_bits;         /* log2 of the cluster size in bytes */
    uint8_t l2_bits;              /* log2 of the number of entries in an
                                     L2 table */
    uint32_t crypt_method;        /* QCOW_CRYPT_NONE or QCOW_CRYPT_AES */
    uint64_t l1_table_offset;     /* file offset of the L1 table */
} QCowHeader;
  52
/* Number of L2 tables kept in memory at once. */
#define L2_CACHE_SIZE 16

/* Per-image driver state, stored in BlockDriverState.opaque. */
typedef struct BDRVQcowState {
    BlockDriverState *hd;       /* underlying image file */
    int cluster_bits;           /* log2 of cluster_size */
    int cluster_size;           /* cluster size in bytes */
    int cluster_sectors;        /* cluster size in 512-byte sectors */
    int l2_bits;                /* log2 of l2_size */
    int l2_size;                /* number of entries in one L2 table */
    int l1_size;                /* number of entries in the L1 table */
    uint64_t cluster_offset_mask; /* low (63 - cluster_bits) bits of an L2
                                     entry: file offset of a compressed
                                     cluster */
    uint64_t l1_table_offset;   /* file offset of the L1 table */
    uint64_t *l1_table;         /* in-memory L1 table, host byte order */
    uint64_t *l2_cache;         /* L2_CACHE_SIZE consecutive L2 tables; entries
                                   are kept big-endian and converted on
                                   lookup */
    uint64_t l2_cache_offsets[L2_CACHE_SIZE]; /* file offset of the L2 table
                                                 held in each cache slot */
    uint32_t l2_cache_counts[L2_CACHE_SIZE];  /* hit counter per slot; the
                                                 slot with the lowest count
                                                 is replaced on a miss */
    uint8_t *cluster_cache;     /* one cluster: last decompressed cluster */
    uint8_t *cluster_data;      /* one cluster: scratch buffer for compressed
                                   input and for encrypted output */
    uint64_t cluster_cache_offset; /* file offset of the compressed cluster
                                      held in cluster_cache, or (uint64_t)-1
                                      when the cache is invalid */
    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
    uint32_t crypt_method_header; /* crypt method recorded in the header */
    AES_KEY aes_encrypt_key;    /* key schedules set up by qcow_set_key() */
    AES_KEY aes_decrypt_key;
} BDRVQcowState;
  77
/* Forward declaration: get_cluster_offset() calls this before its
   definition further down. */
static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
  79
  80static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
  81{
  82    const QCowHeader *cow_header = (const void *)buf;
  83
  84    if (buf_size >= sizeof(QCowHeader) &&
  85        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
  86        be32_to_cpu(cow_header->version) == QCOW_VERSION)
  87        return 100;
  88    else
  89        return 0;
  90}
  91
  92static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
  93{
  94    BDRVQcowState *s = bs->opaque;
  95    int len, i, shift, ret;
  96    QCowHeader header;
  97
  98    ret = bdrv_file_open(&s->hd, filename, flags);
  99    if (ret < 0)
 100        return ret;
 101    if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
 102        goto fail;
 103    be32_to_cpus(&header.magic);
 104    be32_to_cpus(&header.version);
 105    be64_to_cpus(&header.backing_file_offset);
 106    be32_to_cpus(&header.backing_file_size);
 107    be32_to_cpus(&header.mtime);
 108    be64_to_cpus(&header.size);
 109    be32_to_cpus(&header.crypt_method);
 110    be64_to_cpus(&header.l1_table_offset);
 111
 112    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
 113        goto fail;
 114    if (header.size <= 1 || header.cluster_bits < 9)
 115        goto fail;
 116    if (header.crypt_method > QCOW_CRYPT_AES)
 117        goto fail;
 118    s->crypt_method_header = header.crypt_method;
 119    if (s->crypt_method_header)
 120        bs->encrypted = 1;
 121    s->cluster_bits = header.cluster_bits;
 122    s->cluster_size = 1 << s->cluster_bits;
 123    s->cluster_sectors = 1 << (s->cluster_bits - 9);
 124    s->l2_bits = header.l2_bits;
 125    s->l2_size = 1 << s->l2_bits;
 126    bs->total_sectors = header.size / 512;
 127    s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
 128
 129    /* read the level 1 table */
 130    shift = s->cluster_bits + s->l2_bits;
 131    s->l1_size = (header.size + (1LL << shift) - 1) >> shift;
 132
 133    s->l1_table_offset = header.l1_table_offset;
 134    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
 135    if (!s->l1_table)
 136        goto fail;
 137    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
 138        s->l1_size * sizeof(uint64_t))
 139        goto fail;
 140    for(i = 0;i < s->l1_size; i++) {
 141        be64_to_cpus(&s->l1_table[i]);
 142    }
 143    /* alloc L2 cache */
 144    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
 145    if (!s->l2_cache)
 146        goto fail;
 147    s->cluster_cache = qemu_malloc(s->cluster_size);
 148    if (!s->cluster_cache)
 149        goto fail;
 150    s->cluster_data = qemu_malloc(s->cluster_size);
 151    if (!s->cluster_data)
 152        goto fail;
 153    s->cluster_cache_offset = -1;
 154
 155    /* read the backing file name */
 156    if (header.backing_file_offset != 0) {
 157        len = header.backing_file_size;
 158        if (len > 1023)
 159            len = 1023;
 160        if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
 161            goto fail;
 162        bs->backing_file[len] = '\0';
 163    }
 164    return 0;
 165
 166 fail:
 167    qemu_free(s->l1_table);
 168    qemu_free(s->l2_cache);
 169    qemu_free(s->cluster_cache);
 170    qemu_free(s->cluster_data);
 171    bdrv_delete(s->hd);
 172    return -1;
 173}
 174
 175static int qcow_set_key(BlockDriverState *bs, const char *key)
 176{
 177    BDRVQcowState *s = bs->opaque;
 178    uint8_t keybuf[16];
 179    int len, i;
 180
 181    memset(keybuf, 0, 16);
 182    len = strlen(key);
 183    if (len > 16)
 184        len = 16;
 185    /* XXX: we could compress the chars to 7 bits to increase
 186       entropy */
 187    for(i = 0;i < len;i++) {
 188        keybuf[i] = key[i];
 189    }
 190    s->crypt_method = s->crypt_method_header;
 191
 192    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
 193        return -1;
 194    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
 195        return -1;
 196#if 0
 197    /* test */
 198    {
 199        uint8_t in[16];
 200        uint8_t out[16];
 201        uint8_t tmp[16];
 202        for(i=0;i<16;i++)
 203            in[i] = i;
 204        AES_encrypt(in, tmp, &s->aes_encrypt_key);
 205        AES_decrypt(tmp, out, &s->aes_decrypt_key);
 206        for(i = 0; i < 16; i++)
 207            printf(" %02x", tmp[i]);
 208        printf("\n");
 209        for(i = 0; i < 16; i++)
 210            printf(" %02x", out[i]);
 211        printf("\n");
 212    }
 213#endif
 214    return 0;
 215}
 216
 217/* The crypt function is compatible with the linux cryptoloop
 218   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
 219   supported */
 220static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
 221                            uint8_t *out_buf, const uint8_t *in_buf,
 222                            int nb_sectors, int enc,
 223                            const AES_KEY *key)
 224{
 225    union {
 226        uint64_t ll[2];
 227        uint8_t b[16];
 228    } ivec;
 229    int i;
 230
 231    for(i = 0; i < nb_sectors; i++) {
 232        ivec.ll[0] = cpu_to_le64(sector_num);
 233        ivec.ll[1] = 0;
 234        AES_cbc_encrypt(in_buf, out_buf, 512, key,
 235                        ivec.b, enc);
 236        sector_num++;
 237        in_buf += 512;
 238        out_buf += 512;
 239    }
 240}
 241
 242/* 'allocate' is:
 243 *
 244 * 0 to not allocate.
 245 *
 246 * 1 to allocate a normal cluster (for sector indexes 'n_start' to
 247 * 'n_end')
 248 *
 249 * 2 to allocate a compressed cluster of size
 250 * 'compressed_size'. 'compressed_size' must be > 0 and <
 251 * cluster_size
 252 *
 253 * return 0 if not allocated.
 254 */
 255static uint64_t get_cluster_offset(BlockDriverState *bs,
 256                                   uint64_t offset, int allocate,
 257                                   int compressed_size,
 258                                   int n_start, int n_end)
 259{
 260    BDRVQcowState *s = bs->opaque;
 261    int min_index, i, j, l1_index, l2_index;
 262    uint64_t l2_offset, *l2_table, cluster_offset, tmp;
 263    uint32_t min_count;
 264    int new_l2_table;
 265
 266    l1_index = offset >> (s->l2_bits + s->cluster_bits);
 267    l2_offset = s->l1_table[l1_index];
 268    new_l2_table = 0;
 269    if (!l2_offset) {
 270        if (!allocate)
 271            return 0;
 272        /* allocate a new l2 entry */
 273        l2_offset = bdrv_getlength(s->hd);
 274        /* round to cluster size */
 275        l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1);
 276        /* update the L1 entry */
 277        s->l1_table[l1_index] = l2_offset;
 278        tmp = cpu_to_be64(l2_offset);
 279        if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
 280                        &tmp, sizeof(tmp)) != sizeof(tmp))
 281            return 0;
 282        new_l2_table = 1;
 283    }
 284    for(i = 0; i < L2_CACHE_SIZE; i++) {
 285        if (l2_offset == s->l2_cache_offsets[i]) {
 286            /* increment the hit count */
 287            if (++s->l2_cache_counts[i] == 0xffffffff) {
 288                for(j = 0; j < L2_CACHE_SIZE; j++) {
 289                    s->l2_cache_counts[j] >>= 1;
 290                }
 291            }
 292            l2_table = s->l2_cache + (i << s->l2_bits);
 293            goto found;
 294        }
 295    }
 296    /* not found: load a new entry in the least used one */
 297    min_index = 0;
 298    min_count = 0xffffffff;
 299    for(i = 0; i < L2_CACHE_SIZE; i++) {
 300        if (s->l2_cache_counts[i] < min_count) {
 301            min_count = s->l2_cache_counts[i];
 302            min_index = i;
 303        }
 304    }
 305    l2_table = s->l2_cache + (min_index << s->l2_bits);
 306    if (new_l2_table) {
 307        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
 308        if (bdrv_pwrite(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
 309            s->l2_size * sizeof(uint64_t))
 310            return 0;
 311    } else {
 312        if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
 313            s->l2_size * sizeof(uint64_t))
 314            return 0;
 315    }
 316    s->l2_cache_offsets[min_index] = l2_offset;
 317    s->l2_cache_counts[min_index] = 1;
 318 found:
 319    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
 320    cluster_offset = be64_to_cpu(l2_table[l2_index]);
 321    if (!cluster_offset ||
 322        ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
 323        if (!allocate)
 324            return 0;
 325        /* allocate a new cluster */
 326        if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
 327            (n_end - n_start) < s->cluster_sectors) {
 328            /* if the cluster is already compressed, we must
 329               decompress it in the case it is not completely
 330               overwritten */
 331            if (decompress_cluster(s, cluster_offset) < 0)
 332                return 0;
 333            cluster_offset = bdrv_getlength(s->hd);
 334            cluster_offset = (cluster_offset + s->cluster_size - 1) &
 335                ~(s->cluster_size - 1);
 336            /* write the cluster content */
 337            if (bdrv_pwrite(s->hd, cluster_offset, s->cluster_cache, s->cluster_size) !=
 338                s->cluster_size)
 339                return -1;
 340        } else {
 341            cluster_offset = bdrv_getlength(s->hd);
 342            if (allocate == 1) {
 343                /* round to cluster size */
 344                cluster_offset = (cluster_offset + s->cluster_size - 1) &
 345                    ~(s->cluster_size - 1);
 346                bdrv_truncate(s->hd, cluster_offset + s->cluster_size);
 347                /* if encrypted, we must initialize the cluster
 348                   content which won't be written */
 349                if (s->crypt_method &&
 350                    (n_end - n_start) < s->cluster_sectors) {
 351                    uint64_t start_sect;
 352                    start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
 353                    memset(s->cluster_data + 512, 0x00, 512);
 354                    for(i = 0; i < s->cluster_sectors; i++) {
 355                        if (i < n_start || i >= n_end) {
 356                            encrypt_sectors(s, start_sect + i,
 357                                            s->cluster_data,
 358                                            s->cluster_data + 512, 1, 1,
 359                                            &s->aes_encrypt_key);
 360                            if (bdrv_pwrite(s->hd, cluster_offset + i * 512,
 361                                            s->cluster_data, 512) != 512)
 362                                return -1;
 363                        }
 364                    }
 365                }
 366            } else if (allocate == 2) {
 367                cluster_offset |= QCOW_OFLAG_COMPRESSED |
 368                    (uint64_t)compressed_size << (63 - s->cluster_bits);
 369            }
 370        }
 371        /* update L2 table */
 372        tmp = cpu_to_be64(cluster_offset);
 373        l2_table[l2_index] = tmp;
 374        if (bdrv_pwrite(s->hd,
 375                        l2_offset + l2_index * sizeof(tmp), &tmp, sizeof(tmp)) != sizeof(tmp))
 376            return 0;
 377    }
 378    return cluster_offset;
 379}
 380
 381static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
 382                             int nb_sectors, int *pnum)
 383{
 384    BDRVQcowState *s = bs->opaque;
 385    int index_in_cluster, n;
 386    uint64_t cluster_offset;
 387
 388    cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
 389    index_in_cluster = sector_num & (s->cluster_sectors - 1);
 390    n = s->cluster_sectors - index_in_cluster;
 391    if (n > nb_sectors)
 392        n = nb_sectors;
 393    *pnum = n;
 394    return (cluster_offset != 0);
 395}
 396
 397static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
 398                             const uint8_t *buf, int buf_size)
 399{
 400    z_stream strm1, *strm = &strm1;
 401    int ret, out_len;
 402
 403    memset(strm, 0, sizeof(*strm));
 404
 405    strm->next_in = (uint8_t *)buf;
 406    strm->avail_in = buf_size;
 407    strm->next_out = out_buf;
 408    strm->avail_out = out_buf_size;
 409
 410    ret = inflateInit2(strm, -12);
 411    if (ret != Z_OK)
 412        return -1;
 413    ret = inflate(strm, Z_FINISH);
 414    out_len = strm->next_out - out_buf;
 415    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
 416        out_len != out_buf_size) {
 417        inflateEnd(strm);
 418        return -1;
 419    }
 420    inflateEnd(strm);
 421    return 0;
 422}
 423
 424static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
 425{
 426    int ret, csize;
 427    uint64_t coffset;
 428
 429    coffset = cluster_offset & s->cluster_offset_mask;
 430    if (s->cluster_cache_offset != coffset) {
 431        csize = cluster_offset >> (63 - s->cluster_bits);
 432        csize &= (s->cluster_size - 1);
 433        ret = bdrv_pread(s->hd, coffset, s->cluster_data, csize);
 434        if (ret != csize)
 435            return -1;
 436        if (decompress_buffer(s->cluster_cache, s->cluster_size,
 437                              s->cluster_data, csize) < 0) {
 438            return -1;
 439        }
 440        s->cluster_cache_offset = coffset;
 441    }
 442    return 0;
 443}
 444
/* Synchronous read path.  Compiled out with '#if 0'; reads are served by
   qcow_aio_read() instead. */
#if 0

/* Read 'nb_sectors' sectors starting at 'sector_num' into 'buf', one
   cluster-bounded run at a time.  Returns 0 on success, -1 on error. */
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
                     uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    int ret, index_in_cluster, n;
    uint64_t cluster_offset;

    while (nb_sectors > 0) {
        cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        /* never cross a cluster boundary in one step */
        n = s->cluster_sectors - index_in_cluster;
        if (n > nb_sectors)
            n = nb_sectors;
        if (!cluster_offset) {
            /* unallocated: fall back to the backing image, or zeros */
            if (bs->backing_hd) {
                /* read from the base image */
                ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
                if (ret < 0)
                    return -1;
            } else {
                memset(buf, 0, 512 * n);
            }
        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
            /* compressed: inflate into the cluster cache and copy out */
            if (decompress_cluster(s, cluster_offset) < 0)
                return -1;
            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
        } else {
            ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
            if (ret != n * 512)
                return -1;
            if (s->crypt_method) {
                /* decrypt in place */
                encrypt_sectors(s, sector_num, buf, buf, n, 0,
                                &s->aes_decrypt_key);
            }
        }
        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }
    return 0;
}
#endif
 489
 490static int qcow_write(BlockDriverState *bs, int64_t sector_num,
 491                     const uint8_t *buf, int nb_sectors)
 492{
 493    BDRVQcowState *s = bs->opaque;
 494    int ret, index_in_cluster, n;
 495    uint64_t cluster_offset;
 496
 497    while (nb_sectors > 0) {
 498        index_in_cluster = sector_num & (s->cluster_sectors - 1);
 499        n = s->cluster_sectors - index_in_cluster;
 500        if (n > nb_sectors)
 501            n = nb_sectors;
 502        cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0,
 503                                            index_in_cluster,
 504                                            index_in_cluster + n);
 505        if (!cluster_offset)
 506            return -1;
 507        if (s->crypt_method) {
 508            encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1,
 509                            &s->aes_encrypt_key);
 510            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512,
 511                              s->cluster_data, n * 512);
 512        } else {
 513            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
 514        }
 515        if (ret != n * 512)
 516            return -1;
 517        nb_sectors -= n;
 518        sector_num += n;
 519        buf += n * 512;
 520    }
 521    s->cluster_cache_offset = -1; /* disable compressed cache */
 522    return 0;
 523}
 524
/* State of one asynchronous read or write request, advanced one
   cluster-bounded piece at a time by qcow_aio_read_cb() and
   qcow_aio_write_cb(). */
typedef struct QCowAIOCB {
    BlockDriverAIOCB common;     /* generic AIO control block; its address is
                                    what the aio entry points return */
    int64_t sector_num;          /* first sector of the current piece */
    uint8_t *buf;                /* caller buffer position for that piece */
    int nb_sectors;              /* sectors still to be processed */
    int n;                       /* sectors in the piece in flight */
    uint64_t cluster_offset;     /* L2 entry of the piece in flight
                                    (read path only) */
    uint8_t *cluster_data;       /* encryption bounce buffer, allocated on
                                    first use by the write path */
    BlockDriverAIOCB *hd_aiocb;  /* sub-request in flight, NULL if none */
} QCowAIOCB;
 535
 536static void qcow_aio_read_cb(void *opaque, int ret)
 537{
 538    QCowAIOCB *acb = opaque;
 539    BlockDriverState *bs = acb->common.bs;
 540    BDRVQcowState *s = bs->opaque;
 541    int index_in_cluster;
 542
 543    acb->hd_aiocb = NULL;
 544    if (ret < 0) {
 545    fail:
 546        acb->common.cb(acb->common.opaque, ret);
 547        qemu_aio_release(acb);
 548        return;
 549    }
 550
 551 redo:
 552    /* post process the read buffer */
 553    if (!acb->cluster_offset) {
 554        /* nothing to do */
 555    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
 556        /* nothing to do */
 557    } else {
 558        if (s->crypt_method) {
 559            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
 560                            acb->n, 0,
 561                            &s->aes_decrypt_key);
 562        }
 563    }
 564
 565    acb->nb_sectors -= acb->n;
 566    acb->sector_num += acb->n;
 567    acb->buf += acb->n * 512;
 568
 569    if (acb->nb_sectors == 0) {
 570        /* request completed */
 571        acb->common.cb(acb->common.opaque, 0);
 572        qemu_aio_release(acb);
 573        return;
 574    }
 575
 576    /* prepare next AIO request */
 577    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9,
 578                                             0, 0, 0, 0);
 579    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
 580    acb->n = s->cluster_sectors - index_in_cluster;
 581    if (acb->n > acb->nb_sectors)
 582        acb->n = acb->nb_sectors;
 583
 584    if (!acb->cluster_offset) {
 585        if (bs->backing_hd) {
 586            /* read from the base image */
 587            acb->hd_aiocb = bdrv_aio_read(bs->backing_hd,
 588                acb->sector_num, acb->buf, acb->n, qcow_aio_read_cb, acb);
 589            if (acb->hd_aiocb == NULL)
 590                goto fail;
 591        } else {
 592            /* Note: in this case, no need to wait */
 593            memset(acb->buf, 0, 512 * acb->n);
 594            goto redo;
 595        }
 596    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
 597        /* add AIO support for compressed blocks ? */
 598        if (decompress_cluster(s, acb->cluster_offset) < 0)
 599            goto fail;
 600        memcpy(acb->buf,
 601               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
 602        goto redo;
 603    } else {
 604        if ((acb->cluster_offset & 511) != 0) {
 605            ret = -EIO;
 606            goto fail;
 607        }
 608        acb->hd_aiocb = bdrv_aio_read(s->hd,
 609                            (acb->cluster_offset >> 9) + index_in_cluster,
 610                            acb->buf, acb->n, qcow_aio_read_cb, acb);
 611        if (acb->hd_aiocb == NULL)
 612            goto fail;
 613    }
 614}
 615
 616static BlockDriverAIOCB *qcow_aio_read(BlockDriverState *bs,
 617        int64_t sector_num, uint8_t *buf, int nb_sectors,
 618        BlockDriverCompletionFunc *cb, void *opaque)
 619{
 620    QCowAIOCB *acb;
 621
 622    acb = qemu_aio_get(bs, cb, opaque);
 623    if (!acb)
 624        return NULL;
 625    acb->hd_aiocb = NULL;
 626    acb->sector_num = sector_num;
 627    acb->buf = buf;
 628    acb->nb_sectors = nb_sectors;
 629    acb->n = 0;
 630    acb->cluster_offset = 0;
 631
 632    qcow_aio_read_cb(acb, 0);
 633    return &acb->common;
 634}
 635
 636static void qcow_aio_write_cb(void *opaque, int ret)
 637{
 638    QCowAIOCB *acb = opaque;
 639    BlockDriverState *bs = acb->common.bs;
 640    BDRVQcowState *s = bs->opaque;
 641    int index_in_cluster;
 642    uint64_t cluster_offset;
 643    const uint8_t *src_buf;
 644
 645    acb->hd_aiocb = NULL;
 646
 647    if (ret < 0) {
 648    fail:
 649        acb->common.cb(acb->common.opaque, ret);
 650        qemu_aio_release(acb);
 651        return;
 652    }
 653
 654    acb->nb_sectors -= acb->n;
 655    acb->sector_num += acb->n;
 656    acb->buf += acb->n * 512;
 657
 658    if (acb->nb_sectors == 0) {
 659        /* request completed */
 660        acb->common.cb(acb->common.opaque, 0);
 661        qemu_aio_release(acb);
 662        return;
 663    }
 664
 665    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
 666    acb->n = s->cluster_sectors - index_in_cluster;
 667    if (acb->n > acb->nb_sectors)
 668        acb->n = acb->nb_sectors;
 669    cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, 1, 0,
 670                                        index_in_cluster,
 671                                        index_in_cluster + acb->n);
 672    if (!cluster_offset || (cluster_offset & 511) != 0) {
 673        ret = -EIO;
 674        goto fail;
 675    }
 676    if (s->crypt_method) {
 677        if (!acb->cluster_data) {
 678            acb->cluster_data = qemu_mallocz(s->cluster_size);
 679            if (!acb->cluster_data) {
 680                ret = -ENOMEM;
 681                goto fail;
 682            }
 683        }
 684        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
 685                        acb->n, 1, &s->aes_encrypt_key);
 686        src_buf = acb->cluster_data;
 687    } else {
 688        src_buf = acb->buf;
 689    }
 690    acb->hd_aiocb = bdrv_aio_write(s->hd,
 691                                   (cluster_offset >> 9) + index_in_cluster,
 692                                   src_buf, acb->n,
 693                                   qcow_aio_write_cb, acb);
 694    if (acb->hd_aiocb == NULL)
 695        goto fail;
 696}
 697
 698static BlockDriverAIOCB *qcow_aio_write(BlockDriverState *bs,
 699        int64_t sector_num, const uint8_t *buf, int nb_sectors,
 700        BlockDriverCompletionFunc *cb, void *opaque)
 701{
 702    BDRVQcowState *s = bs->opaque;
 703    QCowAIOCB *acb;
 704
 705    s->cluster_cache_offset = -1; /* disable compressed cache */
 706
 707    acb = qemu_aio_get(bs, cb, opaque);
 708    if (!acb)
 709        return NULL;
 710    acb->hd_aiocb = NULL;
 711    acb->sector_num = sector_num;
 712    acb->buf = (uint8_t *)buf;
 713    acb->nb_sectors = nb_sectors;
 714    acb->n = 0;
 715
 716    qcow_aio_write_cb(acb, 0);
 717    return &acb->common;
 718}
 719
 720static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
 721{
 722    QCowAIOCB *acb = (QCowAIOCB *)blockacb;
 723    if (acb->hd_aiocb)
 724        bdrv_aio_cancel(acb->hd_aiocb);
 725    qemu_aio_release(acb);
 726}
 727
 728static void qcow_close(BlockDriverState *bs)
 729{
 730    BDRVQcowState *s = bs->opaque;
 731    qemu_free(s->l1_table);
 732    qemu_free(s->l2_cache);
 733    qemu_free(s->cluster_cache);
 734    qemu_free(s->cluster_data);
 735    bdrv_delete(s->hd);
 736}
 737
 738static int qcow_create(const char *filename, int64_t total_size,
 739                      const char *backing_file, int flags)
 740{
 741    int fd, header_size, backing_filename_len, l1_size, i, shift;
 742    QCowHeader header;
 743    uint64_t tmp;
 744
 745    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
 746    if (fd < 0)
 747        return -1;
 748    memset(&header, 0, sizeof(header));
 749    header.magic = cpu_to_be32(QCOW_MAGIC);
 750    header.version = cpu_to_be32(QCOW_VERSION);
 751    header.size = cpu_to_be64(total_size * 512);
 752    header_size = sizeof(header);
 753    backing_filename_len = 0;
 754    if (backing_file) {
 755        if (strcmp(backing_file, "fat:")) {
 756            header.backing_file_offset = cpu_to_be64(header_size);
 757            backing_filename_len = strlen(backing_file);
 758            header.backing_file_size = cpu_to_be32(backing_filename_len);
 759            header_size += backing_filename_len;
 760        } else {
 761            /* special backing file for vvfat */
 762            backing_file = NULL;
 763        }
 764        header.cluster_bits = 9; /* 512 byte cluster to avoid copying
 765                                    unmodifyed sectors */
 766        header.l2_bits = 12; /* 32 KB L2 tables */
 767    } else {
 768        header.cluster_bits = 12; /* 4 KB clusters */
 769        header.l2_bits = 9; /* 4 KB L2 tables */
 770    }
 771    header_size = (header_size + 7) & ~7;
 772    shift = header.cluster_bits + header.l2_bits;
 773    l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift;
 774
 775    header.l1_table_offset = cpu_to_be64(header_size);
 776    if (flags & BLOCK_FLAG_ENCRYPT) {
 777        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
 778    } else {
 779        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
 780    }
 781
 782    /* write all the data */
 783    write(fd, &header, sizeof(header));
 784    if (backing_file) {
 785        write(fd, backing_file, backing_filename_len);
 786    }
 787    lseek(fd, header_size, SEEK_SET);
 788    tmp = 0;
 789    for(i = 0;i < l1_size; i++) {
 790        write(fd, &tmp, sizeof(tmp));
 791    }
 792    close(fd);
 793    return 0;
 794}
 795
 796static int qcow_make_empty(BlockDriverState *bs)
 797{
 798    BDRVQcowState *s = bs->opaque;
 799    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
 800    int ret;
 801
 802    memset(s->l1_table, 0, l1_length);
 803    if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0)
 804        return -1;
 805    ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length);
 806    if (ret < 0)
 807        return ret;
 808
 809    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
 810    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
 811    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
 812
 813    return 0;
 814}
 815
 816/* XXX: put compressed sectors first, then all the cluster aligned
 817   tables to avoid losing bytes in alignment */
 818static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
 819                                 const uint8_t *buf, int nb_sectors)
 820{
 821    BDRVQcowState *s = bs->opaque;
 822    z_stream strm;
 823    int ret, out_len;
 824    uint8_t *out_buf;
 825    uint64_t cluster_offset;
 826
 827    if (nb_sectors != s->cluster_sectors)
 828        return -EINVAL;
 829
 830    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
 831    if (!out_buf)
 832        return -1;
 833
 834    /* best compression, small window, no zlib header */
 835    memset(&strm, 0, sizeof(strm));
 836    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
 837                       Z_DEFLATED, -12,
 838                       9, Z_DEFAULT_STRATEGY);
 839    if (ret != 0) {
 840        qemu_free(out_buf);
 841        return -1;
 842    }
 843
 844    strm.avail_in = s->cluster_size;
 845    strm.next_in = (uint8_t *)buf;
 846    strm.avail_out = s->cluster_size;
 847    strm.next_out = out_buf;
 848
 849    ret = deflate(&strm, Z_FINISH);
 850    if (ret != Z_STREAM_END && ret != Z_OK) {
 851        qemu_free(out_buf);
 852        deflateEnd(&strm);
 853        return -1;
 854    }
 855    out_len = strm.next_out - out_buf;
 856
 857    deflateEnd(&strm);
 858
 859    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
 860        /* could not compress: write normal cluster */
 861        qcow_write(bs, sector_num, buf, s->cluster_sectors);
 862    } else {
 863        cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
 864                                            out_len, 0, 0);
 865        cluster_offset &= s->cluster_offset_mask;
 866        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) {
 867            qemu_free(out_buf);
 868            return -1;
 869        }
 870    }
 871
 872    qemu_free(out_buf);
 873    return 0;
 874}
 875
 876static void qcow_flush(BlockDriverState *bs)
 877{
 878    BDRVQcowState *s = bs->opaque;
 879    bdrv_flush(s->hd);
 880}
 881
 882static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 883{
 884    BDRVQcowState *s = bs->opaque;
 885    bdi->cluster_size = s->cluster_size;
 886    return 0;
 887}
 888
/* Driver description registered for the "qcow" format.  The first entries
   are positional, so their order must match the leading members of
   BlockDriver (declared outside this file); the remaining hooks are set by
   name. */
BlockDriver bdrv_qcow = {
    "qcow",                 /* format name */
    sizeof(BDRVQcowState),  /* size of the per-image state */
    qcow_probe,
    qcow_open,
    NULL,                   /* NOTE(review): presumably the synchronous */
    NULL,                   /* read/write hooks, unused here - confirm */
    qcow_close,
    qcow_create,
    qcow_flush,
    qcow_is_allocated,
    qcow_set_key,
    qcow_make_empty,

    .bdrv_aio_read = qcow_aio_read,
    .bdrv_aio_write = qcow_aio_write,
    .bdrv_aio_cancel = qcow_aio_cancel,
    .aiocb_size = sizeof(QCowAIOCB),
    .bdrv_write_compressed = qcow_write_compressed,
    .bdrv_get_info = qcow_get_info,
};
 910