qemu/block/vdi.c
<<
>>
Prefs
   1/*
   2 * Block driver for the Virtual Disk Image (VDI) format
   3 *
   4 * Copyright (c) 2009, 2012 Stefan Weil
   5 *
   6 * This program is free software: you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License as published by
   8 * the Free Software Foundation, either version 2 of the License, or
   9 * (at your option) version 3 or any later version.
  10 *
  11 * This program is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 * GNU General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 *
  19 * Reference:
  20 * http://forums.virtualbox.org/viewtopic.php?t=8046
  21 *
  22 * This driver supports create / read / write operations on VDI images.
  23 *
  24 * Todo (see also TODO in code):
  25 *
  26 * Some features like snapshots are still missing.
  27 *
  28 * Deallocation of zero-filled blocks and shrinking images are missing, too
  29 * (might be added to common block layer).
  30 *
  31 * Allocation of blocks could be optimized (less writes to block map and
  32 * header).
  33 *
  34 * Read and write of adjacent blocks could be done in one operation
 * (current code uses one operation per block (1 MiB)).
  36 *
  37 * The code is not thread safe (missing locks for changes in header and
  38 * block table, no problem with current QEMU).
  39 *
  40 * Hints:
  41 *
  42 * Blocks (VDI documentation) correspond to clusters (QEMU).
  43 * QEMU's backing files could be implemented using VDI snapshot files (TODO).
  44 * VDI snapshot files may also contain the complete machine state.
  45 * Maybe this machine state can be converted to QEMU PC machine snapshot data.
  46 *
  47 * The driver keeps a block cache (little endian entries) in memory.
  48 * For the standard block size (1 MiB), a 1 TiB disk will use 4 MiB RAM,
  49 * so this seems to be reasonable.
  50 */
  51
  52#include "qemu/osdep.h"
  53#include "qapi/error.h"
  54#include "block/block_int.h"
  55#include "sysemu/block-backend.h"
  56#include "qemu/module.h"
  57#include "migration/migration.h"
  58#include "qemu/coroutine.h"
  59#include "qemu/cutils.h"
  60
  61#if defined(CONFIG_UUID)
  62#include <uuid/uuid.h>
  63#else
  64/* TODO: move uuid emulation to some central place in QEMU. */
  65#include "sysemu/sysemu.h"     /* UUID_FMT */
  66typedef unsigned char uuid_t[16];
  67#endif
  68
  69/* Code configuration options. */
  70
  71/* Enable debug messages. */
  72//~ #define CONFIG_VDI_DEBUG
  73
  74/* Support write operations on VDI images. */
  75#define CONFIG_VDI_WRITE
  76
  77/* Support non-standard block (cluster) size. This is untested.
  78 * Maybe it will be needed for very large images.
  79 */
  80//~ #define CONFIG_VDI_BLOCK_SIZE
  81
  82/* Support static (fixed, pre-allocated) images. */
  83#define CONFIG_VDI_STATIC_IMAGE
  84
  85/* Command line option for static images. */
  86#define BLOCK_OPT_STATIC "static"
  87
  88#define KiB     1024
  89#define MiB     (KiB * KiB)
  90
  91#define SECTOR_SIZE 512
  92#define DEFAULT_CLUSTER_SIZE (1 * MiB)
  93
  94#if defined(CONFIG_VDI_DEBUG)
  95#define logout(fmt, ...) \
  96                fprintf(stderr, "vdi\t%-24s" fmt, __func__, ##__VA_ARGS__)
  97#else
  98#define logout(fmt, ...) ((void)0)
  99#endif
 100
 101/* Image signature. */
 102#define VDI_SIGNATURE 0xbeda107f
 103
 104/* Image version. */
 105#define VDI_VERSION_1_1 0x00010001
 106
 107/* Image type. */
 108#define VDI_TYPE_DYNAMIC 1
 109#define VDI_TYPE_STATIC  2
 110
 111/* Innotek / SUN images use these strings in header.text:
 112 * "<<< innotek VirtualBox Disk Image >>>\n"
 113 * "<<< Sun xVM VirtualBox Disk Image >>>\n"
 114 * "<<< Sun VirtualBox Disk Image >>>\n"
 115 * The value does not matter, so QEMU created images use a different text.
 116 */
 117#define VDI_TEXT "<<< QEMU VM Virtual Disk Image >>>\n"
 118
 119/* A never-allocated block; semantically arbitrary content. */
 120#define VDI_UNALLOCATED 0xffffffffU
 121
 122/* A discarded (no longer allocated) block; semantically zero-filled. */
 123#define VDI_DISCARDED   0xfffffffeU
 124
 125#define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED)
 126
 127/* The bmap will take up VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t) bytes; since
 128 * the bmap is read and written in a single operation, its size needs to be
 129 * limited to INT_MAX; furthermore, when opening an image, the bmap size is
 130 * rounded up to be aligned on BDRV_SECTOR_SIZE.
 131 * Therefore this should satisfy the following:
 132 * VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t) + BDRV_SECTOR_SIZE == INT_MAX + 1
 133 * (INT_MAX + 1 is the first value not representable as an int)
 134 * This guarantees that any value below or equal to the constant will, when
 135 * multiplied by sizeof(uint32_t) and rounded up to a BDRV_SECTOR_SIZE boundary,
 136 * still be below or equal to INT_MAX. */
 137#define VDI_BLOCKS_IN_IMAGE_MAX \
 138    ((unsigned)((INT_MAX + 1u - BDRV_SECTOR_SIZE) / sizeof(uint32_t)))
 139#define VDI_DISK_SIZE_MAX        ((uint64_t)VDI_BLOCKS_IN_IMAGE_MAX * \
 140                                  (uint64_t)DEFAULT_CLUSTER_SIZE)
 141
 142#if !defined(CONFIG_UUID)
 143static inline void uuid_generate(uuid_t out)
 144{
 145    memset(out, 0, sizeof(uuid_t));
 146}
 147
 148static inline int uuid_is_null(const uuid_t uu)
 149{
 150    uuid_t null_uuid = { 0 };
 151    return memcmp(uu, null_uuid, sizeof(uuid_t)) == 0;
 152}
 153
 154# if defined(CONFIG_VDI_DEBUG)
 155static inline void uuid_unparse(const uuid_t uu, char *out)
 156{
 157    snprintf(out, 37, UUID_FMT,
 158            uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7],
 159            uu[8], uu[9], uu[10], uu[11], uu[12], uu[13], uu[14], uu[15]);
 160}
 161# endif
 162#endif
 163
/* Image header as it is read from / written to the first sector of the
 * image file. The struct is packed; all integer fields are little endian on
 * disk and are converted by vdi_header_to_cpu() / vdi_header_to_le(). */
typedef struct {
    char text[0x40];            /* free-form text, see VDI_TEXT */
    uint32_t signature;         /* must be VDI_SIGNATURE */
    uint32_t version;           /* only VDI_VERSION_1_1 is accepted */
    uint32_t header_size;
    uint32_t image_type;        /* VDI_TYPE_DYNAMIC or VDI_TYPE_STATIC */
    uint32_t image_flags;
    char description[256];
    uint32_t offset_bmap;       /* byte offset of the block map */
    uint32_t offset_data;       /* byte offset of the first data block */
    uint32_t cylinders;         /* disk geometry, unused here */
    uint32_t heads;             /* disk geometry, unused here */
    uint32_t sectors;           /* disk geometry, unused here */
    uint32_t sector_size;       /* must be SECTOR_SIZE */
    uint32_t unused1;
    uint64_t disk_size;         /* virtual disk size in bytes */
    uint32_t block_size;        /* size of one block in bytes */
    uint32_t block_extra;       /* unused here */
    uint32_t blocks_in_image;   /* number of block map entries */
    uint32_t blocks_allocated;  /* number of blocks with data in the file */
    uuid_t uuid_image;
    uuid_t uuid_last_snap;
    uuid_t uuid_link;           /* must be the null UUID to open the image */
    uuid_t uuid_parent;         /* must be the null UUID to open the image */
    uint64_t unused2[7];
} QEMU_PACKED VdiHeader;
 190
/* Per-image driver state (stored in BlockDriverState.opaque). */
typedef struct {
    /* The block map entries are little endian (even in memory). */
    uint32_t *bmap;
    /* Size of block (bytes). */
    uint32_t block_size;
    /* Size of block (sectors). */
    uint32_t block_sectors;
    /* First sector of block map. */
    uint32_t bmap_sector;
    /* VDI header (converted to host endianness). */
    VdiHeader header;

    /* Orders writes to a freshly allocated block, see vdi_co_write(). */
    CoMutex write_lock;

    /* Registered in vdi_open() and removed again in vdi_close(). */
    Error *migration_blocker;
} BDRVVdiState;
 207
 208/* Change UUID from little endian (IPRT = VirtualBox format) to big endian
 209 * format (network byte order, standard, see RFC 4122) and vice versa.
 210 */
 211static void uuid_convert(uuid_t uuid)
 212{
 213    bswap32s((uint32_t *)&uuid[0]);
 214    bswap16s((uint16_t *)&uuid[4]);
 215    bswap16s((uint16_t *)&uuid[6]);
 216}
 217
 218static void vdi_header_to_cpu(VdiHeader *header)
 219{
 220    le32_to_cpus(&header->signature);
 221    le32_to_cpus(&header->version);
 222    le32_to_cpus(&header->header_size);
 223    le32_to_cpus(&header->image_type);
 224    le32_to_cpus(&header->image_flags);
 225    le32_to_cpus(&header->offset_bmap);
 226    le32_to_cpus(&header->offset_data);
 227    le32_to_cpus(&header->cylinders);
 228    le32_to_cpus(&header->heads);
 229    le32_to_cpus(&header->sectors);
 230    le32_to_cpus(&header->sector_size);
 231    le64_to_cpus(&header->disk_size);
 232    le32_to_cpus(&header->block_size);
 233    le32_to_cpus(&header->block_extra);
 234    le32_to_cpus(&header->blocks_in_image);
 235    le32_to_cpus(&header->blocks_allocated);
 236    uuid_convert(header->uuid_image);
 237    uuid_convert(header->uuid_last_snap);
 238    uuid_convert(header->uuid_link);
 239    uuid_convert(header->uuid_parent);
 240}
 241
 242static void vdi_header_to_le(VdiHeader *header)
 243{
 244    cpu_to_le32s(&header->signature);
 245    cpu_to_le32s(&header->version);
 246    cpu_to_le32s(&header->header_size);
 247    cpu_to_le32s(&header->image_type);
 248    cpu_to_le32s(&header->image_flags);
 249    cpu_to_le32s(&header->offset_bmap);
 250    cpu_to_le32s(&header->offset_data);
 251    cpu_to_le32s(&header->cylinders);
 252    cpu_to_le32s(&header->heads);
 253    cpu_to_le32s(&header->sectors);
 254    cpu_to_le32s(&header->sector_size);
 255    cpu_to_le64s(&header->disk_size);
 256    cpu_to_le32s(&header->block_size);
 257    cpu_to_le32s(&header->block_extra);
 258    cpu_to_le32s(&header->blocks_in_image);
 259    cpu_to_le32s(&header->blocks_allocated);
 260    uuid_convert(header->uuid_image);
 261    uuid_convert(header->uuid_last_snap);
 262    uuid_convert(header->uuid_link);
 263    uuid_convert(header->uuid_parent);
 264}
 265
 266#if defined(CONFIG_VDI_DEBUG)
 267static void vdi_header_print(VdiHeader *header)
 268{
 269    char uuid[37];
 270    logout("text        %s", header->text);
 271    logout("signature   0x%08x\n", header->signature);
 272    logout("header size 0x%04x\n", header->header_size);
 273    logout("image type  0x%04x\n", header->image_type);
 274    logout("image flags 0x%04x\n", header->image_flags);
 275    logout("description %s\n", header->description);
 276    logout("offset bmap 0x%04x\n", header->offset_bmap);
 277    logout("offset data 0x%04x\n", header->offset_data);
 278    logout("cylinders   0x%04x\n", header->cylinders);
 279    logout("heads       0x%04x\n", header->heads);
 280    logout("sectors     0x%04x\n", header->sectors);
 281    logout("sector size 0x%04x\n", header->sector_size);
 282    logout("image size  0x%" PRIx64 " B (%" PRIu64 " MiB)\n",
 283           header->disk_size, header->disk_size / MiB);
 284    logout("block size  0x%04x\n", header->block_size);
 285    logout("block extra 0x%04x\n", header->block_extra);
 286    logout("blocks tot. 0x%04x\n", header->blocks_in_image);
 287    logout("blocks all. 0x%04x\n", header->blocks_allocated);
 288    uuid_unparse(header->uuid_image, uuid);
 289    logout("uuid image  %s\n", uuid);
 290    uuid_unparse(header->uuid_last_snap, uuid);
 291    logout("uuid snap   %s\n", uuid);
 292    uuid_unparse(header->uuid_link, uuid);
 293    logout("uuid link   %s\n", uuid);
 294    uuid_unparse(header->uuid_parent, uuid);
 295    logout("uuid parent %s\n", uuid);
 296}
 297#endif
 298
 299static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res,
 300                     BdrvCheckMode fix)
 301{
 302    /* TODO: additional checks possible. */
 303    BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
 304    uint32_t blocks_allocated = 0;
 305    uint32_t block;
 306    uint32_t *bmap;
 307    logout("\n");
 308
 309    if (fix) {
 310        return -ENOTSUP;
 311    }
 312
 313    bmap = g_try_new(uint32_t, s->header.blocks_in_image);
 314    if (s->header.blocks_in_image && bmap == NULL) {
 315        res->check_errors++;
 316        return -ENOMEM;
 317    }
 318
 319    memset(bmap, 0xff, s->header.blocks_in_image * sizeof(uint32_t));
 320
 321    /* Check block map and value of blocks_allocated. */
 322    for (block = 0; block < s->header.blocks_in_image; block++) {
 323        uint32_t bmap_entry = le32_to_cpu(s->bmap[block]);
 324        if (VDI_IS_ALLOCATED(bmap_entry)) {
 325            if (bmap_entry < s->header.blocks_in_image) {
 326                blocks_allocated++;
 327                if (!VDI_IS_ALLOCATED(bmap[bmap_entry])) {
 328                    bmap[bmap_entry] = bmap_entry;
 329                } else {
 330                    fprintf(stderr, "ERROR: block index %" PRIu32
 331                            " also used by %" PRIu32 "\n", bmap[bmap_entry], bmap_entry);
 332                    res->corruptions++;
 333                }
 334            } else {
 335                fprintf(stderr, "ERROR: block index %" PRIu32
 336                        " too large, is %" PRIu32 "\n", block, bmap_entry);
 337                res->corruptions++;
 338            }
 339        }
 340    }
 341    if (blocks_allocated != s->header.blocks_allocated) {
 342        fprintf(stderr, "ERROR: allocated blocks mismatch, is %" PRIu32
 343               ", should be %" PRIu32 "\n",
 344               blocks_allocated, s->header.blocks_allocated);
 345        res->corruptions++;
 346    }
 347
 348    g_free(bmap);
 349
 350    return 0;
 351}
 352
 353static int vdi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 354{
 355    /* TODO: vdi_get_info would be needed for machine snapshots.
 356       vm_state_offset is still missing. */
 357    BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
 358    logout("\n");
 359    bdi->cluster_size = s->block_size;
 360    bdi->vm_state_offset = 0;
 361    bdi->unallocated_blocks_are_zero = true;
 362    return 0;
 363}
 364
/* Placeholder: nothing is done to the image, the call only logs and reports
 * success. */
static int vdi_make_empty(BlockDriverState *bs)
{
    /* TODO: missing code. */
    logout("\n");
    /* The return value for missing code must be 0, see block.c. */
    return 0;
}
 372
 373static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename)
 374{
 375    const VdiHeader *header = (const VdiHeader *)buf;
 376    int ret = 0;
 377
 378    logout("\n");
 379
 380    if (buf_size < sizeof(*header)) {
 381        /* Header too small, no VDI. */
 382    } else if (le32_to_cpu(header->signature) == VDI_SIGNATURE) {
 383        ret = 100;
 384    }
 385
 386    if (ret == 0) {
 387        logout("no vdi image\n");
 388    } else {
 389        logout("%s", header->text);
 390    }
 391
 392    return ret;
 393}
 394
 395static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
 396                    Error **errp)
 397{
 398    BDRVVdiState *s = bs->opaque;
 399    VdiHeader header;
 400    size_t bmap_size;
 401    int ret;
 402
 403    logout("\n");
 404
 405    ret = bdrv_read(bs->file->bs, 0, (uint8_t *)&header, 1);
 406    if (ret < 0) {
 407        goto fail;
 408    }
 409
 410    vdi_header_to_cpu(&header);
 411#if defined(CONFIG_VDI_DEBUG)
 412    vdi_header_print(&header);
 413#endif
 414
 415    if (header.disk_size > VDI_DISK_SIZE_MAX) {
 416        error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64
 417                          ", max supported is 0x%" PRIx64 ")",
 418                          header.disk_size, VDI_DISK_SIZE_MAX);
 419        ret = -ENOTSUP;
 420        goto fail;
 421    }
 422
 423    if (header.disk_size % SECTOR_SIZE != 0) {
 424        /* 'VBoxManage convertfromraw' can create images with odd disk sizes.
 425           We accept them but round the disk size to the next multiple of
 426           SECTOR_SIZE. */
 427        logout("odd disk size %" PRIu64 " B, round up\n", header.disk_size);
 428        header.disk_size = ROUND_UP(header.disk_size, SECTOR_SIZE);
 429    }
 430
 431    if (header.signature != VDI_SIGNATURE) {
 432        error_setg(errp, "Image not in VDI format (bad signature %08" PRIx32
 433                   ")", header.signature);
 434        ret = -EINVAL;
 435        goto fail;
 436    } else if (header.version != VDI_VERSION_1_1) {
 437        error_setg(errp, "unsupported VDI image (version %" PRIu32 ".%" PRIu32
 438                   ")", header.version >> 16, header.version & 0xffff);
 439        ret = -ENOTSUP;
 440        goto fail;
 441    } else if (header.offset_bmap % SECTOR_SIZE != 0) {
 442        /* We only support block maps which start on a sector boundary. */
 443        error_setg(errp, "unsupported VDI image (unaligned block map offset "
 444                   "0x%" PRIx32 ")", header.offset_bmap);
 445        ret = -ENOTSUP;
 446        goto fail;
 447    } else if (header.offset_data % SECTOR_SIZE != 0) {
 448        /* We only support data blocks which start on a sector boundary. */
 449        error_setg(errp, "unsupported VDI image (unaligned data offset 0x%"
 450                   PRIx32 ")", header.offset_data);
 451        ret = -ENOTSUP;
 452        goto fail;
 453    } else if (header.sector_size != SECTOR_SIZE) {
 454        error_setg(errp, "unsupported VDI image (sector size %" PRIu32
 455                   " is not %u)", header.sector_size, SECTOR_SIZE);
 456        ret = -ENOTSUP;
 457        goto fail;
 458    } else if (header.block_size != DEFAULT_CLUSTER_SIZE) {
 459        error_setg(errp, "unsupported VDI image (block size %" PRIu32
 460                   " is not %u)", header.block_size, DEFAULT_CLUSTER_SIZE);
 461        ret = -ENOTSUP;
 462        goto fail;
 463    } else if (header.disk_size >
 464               (uint64_t)header.blocks_in_image * header.block_size) {
 465        error_setg(errp, "unsupported VDI image (disk size %" PRIu64 ", "
 466                   "image bitmap has room for %" PRIu64 ")",
 467                   header.disk_size,
 468                   (uint64_t)header.blocks_in_image * header.block_size);
 469        ret = -ENOTSUP;
 470        goto fail;
 471    } else if (!uuid_is_null(header.uuid_link)) {
 472        error_setg(errp, "unsupported VDI image (non-NULL link UUID)");
 473        ret = -ENOTSUP;
 474        goto fail;
 475    } else if (!uuid_is_null(header.uuid_parent)) {
 476        error_setg(errp, "unsupported VDI image (non-NULL parent UUID)");
 477        ret = -ENOTSUP;
 478        goto fail;
 479    } else if (header.blocks_in_image > VDI_BLOCKS_IN_IMAGE_MAX) {
 480        error_setg(errp, "unsupported VDI image "
 481                         "(too many blocks %u, max is %u)",
 482                          header.blocks_in_image, VDI_BLOCKS_IN_IMAGE_MAX);
 483        ret = -ENOTSUP;
 484        goto fail;
 485    }
 486
 487    bs->total_sectors = header.disk_size / SECTOR_SIZE;
 488
 489    s->block_size = header.block_size;
 490    s->block_sectors = header.block_size / SECTOR_SIZE;
 491    s->bmap_sector = header.offset_bmap / SECTOR_SIZE;
 492    s->header = header;
 493
 494    bmap_size = header.blocks_in_image * sizeof(uint32_t);
 495    bmap_size = DIV_ROUND_UP(bmap_size, SECTOR_SIZE);
 496    s->bmap = qemu_try_blockalign(bs->file->bs, bmap_size * SECTOR_SIZE);
 497    if (s->bmap == NULL) {
 498        ret = -ENOMEM;
 499        goto fail;
 500    }
 501
 502    ret = bdrv_read(bs->file->bs, s->bmap_sector, (uint8_t *)s->bmap,
 503                    bmap_size);
 504    if (ret < 0) {
 505        goto fail_free_bmap;
 506    }
 507
 508    /* Disable migration when vdi images are used */
 509    error_setg(&s->migration_blocker, "The vdi format used by node '%s' "
 510               "does not support live migration",
 511               bdrv_get_device_or_node_name(bs));
 512    migrate_add_blocker(s->migration_blocker);
 513
 514    qemu_co_mutex_init(&s->write_lock);
 515
 516    return 0;
 517
 518 fail_free_bmap:
 519    qemu_vfree(s->bmap);
 520
 521 fail:
 522    return ret;
 523}
 524
/* Reopen hook: nothing is checked or prepared, every reopen request is
 * accepted by returning 0. */
static int vdi_reopen_prepare(BDRVReopenState *state,
                              BlockReopenQueue *queue, Error **errp)
{
    return 0;
}
 530
 531static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs,
 532        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
 533{
 534    /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). */
 535    BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
 536    size_t bmap_index = sector_num / s->block_sectors;
 537    size_t sector_in_block = sector_num % s->block_sectors;
 538    int n_sectors = s->block_sectors - sector_in_block;
 539    uint32_t bmap_entry = le32_to_cpu(s->bmap[bmap_index]);
 540    uint64_t offset;
 541    int result;
 542
 543    logout("%p, %" PRId64 ", %d, %p\n", bs, sector_num, nb_sectors, pnum);
 544    if (n_sectors > nb_sectors) {
 545        n_sectors = nb_sectors;
 546    }
 547    *pnum = n_sectors;
 548    result = VDI_IS_ALLOCATED(bmap_entry);
 549    if (!result) {
 550        return 0;
 551    }
 552
 553    offset = s->header.offset_data +
 554                              (uint64_t)bmap_entry * s->block_size +
 555                              sector_in_block * SECTOR_SIZE;
 556    *file = bs->file->bs;
 557    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
 558}
 559
 560static int vdi_co_read(BlockDriverState *bs,
 561        int64_t sector_num, uint8_t *buf, int nb_sectors)
 562{
 563    BDRVVdiState *s = bs->opaque;
 564    uint32_t bmap_entry;
 565    uint32_t block_index;
 566    uint32_t sector_in_block;
 567    uint32_t n_sectors;
 568    int ret = 0;
 569
 570    logout("\n");
 571
 572    while (ret >= 0 && nb_sectors > 0) {
 573        block_index = sector_num / s->block_sectors;
 574        sector_in_block = sector_num % s->block_sectors;
 575        n_sectors = s->block_sectors - sector_in_block;
 576        if (n_sectors > nb_sectors) {
 577            n_sectors = nb_sectors;
 578        }
 579
 580        logout("will read %u sectors starting at sector %" PRIu64 "\n",
 581               n_sectors, sector_num);
 582
 583        /* prepare next AIO request */
 584        bmap_entry = le32_to_cpu(s->bmap[block_index]);
 585        if (!VDI_IS_ALLOCATED(bmap_entry)) {
 586            /* Block not allocated, return zeros, no need to wait. */
 587            memset(buf, 0, n_sectors * SECTOR_SIZE);
 588            ret = 0;
 589        } else {
 590            uint64_t offset = s->header.offset_data / SECTOR_SIZE +
 591                              (uint64_t)bmap_entry * s->block_sectors +
 592                              sector_in_block;
 593            ret = bdrv_read(bs->file->bs, offset, buf, n_sectors);
 594        }
 595        logout("%u sectors read\n", n_sectors);
 596
 597        nb_sectors -= n_sectors;
 598        sector_num += n_sectors;
 599        buf += n_sectors * SECTOR_SIZE;
 600    }
 601
 602    return ret;
 603}
 604
 605static int vdi_co_write(BlockDriverState *bs,
 606        int64_t sector_num, const uint8_t *buf, int nb_sectors)
 607{
 608    BDRVVdiState *s = bs->opaque;
 609    uint32_t bmap_entry;
 610    uint32_t block_index;
 611    uint32_t sector_in_block;
 612    uint32_t n_sectors;
 613    uint32_t bmap_first = VDI_UNALLOCATED;
 614    uint32_t bmap_last = VDI_UNALLOCATED;
 615    uint8_t *block = NULL;
 616    int ret = 0;
 617
 618    logout("\n");
 619
 620    while (ret >= 0 && nb_sectors > 0) {
 621        block_index = sector_num / s->block_sectors;
 622        sector_in_block = sector_num % s->block_sectors;
 623        n_sectors = s->block_sectors - sector_in_block;
 624        if (n_sectors > nb_sectors) {
 625            n_sectors = nb_sectors;
 626        }
 627
 628        logout("will write %u sectors starting at sector %" PRIu64 "\n",
 629               n_sectors, sector_num);
 630
 631        /* prepare next AIO request */
 632        bmap_entry = le32_to_cpu(s->bmap[block_index]);
 633        if (!VDI_IS_ALLOCATED(bmap_entry)) {
 634            /* Allocate new block and write to it. */
 635            uint64_t offset;
 636            bmap_entry = s->header.blocks_allocated;
 637            s->bmap[block_index] = cpu_to_le32(bmap_entry);
 638            s->header.blocks_allocated++;
 639            offset = s->header.offset_data / SECTOR_SIZE +
 640                     (uint64_t)bmap_entry * s->block_sectors;
 641            if (block == NULL) {
 642                block = g_malloc(s->block_size);
 643                bmap_first = block_index;
 644            }
 645            bmap_last = block_index;
 646            /* Copy data to be written to new block and zero unused parts. */
 647            memset(block, 0, sector_in_block * SECTOR_SIZE);
 648            memcpy(block + sector_in_block * SECTOR_SIZE,
 649                   buf, n_sectors * SECTOR_SIZE);
 650            memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0,
 651                   (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE);
 652
 653            /* Note that this coroutine does not yield anywhere from reading the
 654             * bmap entry until here, so in regards to all the coroutines trying
 655             * to write to this cluster, the one doing the allocation will
 656             * always be the first to try to acquire the lock.
 657             * Therefore, it is also the first that will actually be able to
 658             * acquire the lock and thus the padded cluster is written before
 659             * the other coroutines can write to the affected area. */
 660            qemu_co_mutex_lock(&s->write_lock);
 661            ret = bdrv_write(bs->file->bs, offset, block, s->block_sectors);
 662            qemu_co_mutex_unlock(&s->write_lock);
 663        } else {
 664            uint64_t offset = s->header.offset_data / SECTOR_SIZE +
 665                              (uint64_t)bmap_entry * s->block_sectors +
 666                              sector_in_block;
 667            qemu_co_mutex_lock(&s->write_lock);
 668            /* This lock is only used to make sure the following write operation
 669             * is executed after the write issued by the coroutine allocating
 670             * this cluster, therefore we do not need to keep it locked.
 671             * As stated above, the allocating coroutine will always try to lock
 672             * the mutex before all the other concurrent accesses to that
 673             * cluster, therefore at this point we can be absolutely certain
 674             * that that write operation has returned (there may be other writes
 675             * in flight, but they do not concern this very operation). */
 676            qemu_co_mutex_unlock(&s->write_lock);
 677            ret = bdrv_write(bs->file->bs, offset, buf, n_sectors);
 678        }
 679
 680        nb_sectors -= n_sectors;
 681        sector_num += n_sectors;
 682        buf += n_sectors * SECTOR_SIZE;
 683
 684        logout("%u sectors written\n", n_sectors);
 685    }
 686
 687    logout("finished data write\n");
 688    if (ret < 0) {
 689        return ret;
 690    }
 691
 692    if (block) {
 693        /* One or more new blocks were allocated. */
 694        VdiHeader *header = (VdiHeader *) block;
 695        uint8_t *base;
 696        uint64_t offset;
 697
 698        logout("now writing modified header\n");
 699        assert(VDI_IS_ALLOCATED(bmap_first));
 700        *header = s->header;
 701        vdi_header_to_le(header);
 702        ret = bdrv_write(bs->file->bs, 0, block, 1);
 703        g_free(block);
 704        block = NULL;
 705
 706        if (ret < 0) {
 707            return ret;
 708        }
 709
 710        logout("now writing modified block map entry %u...%u\n",
 711               bmap_first, bmap_last);
 712        /* Write modified sectors from block map. */
 713        bmap_first /= (SECTOR_SIZE / sizeof(uint32_t));
 714        bmap_last /= (SECTOR_SIZE / sizeof(uint32_t));
 715        n_sectors = bmap_last - bmap_first + 1;
 716        offset = s->bmap_sector + bmap_first;
 717        base = ((uint8_t *)&s->bmap[0]) + bmap_first * SECTOR_SIZE;
 718        logout("will write %u block map sectors starting from entry %u\n",
 719               n_sectors, bmap_first);
 720        ret = bdrv_write(bs->file->bs, offset, base, n_sectors);
 721    }
 722
 723    return ret;
 724}
 725
 726static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
 727{
 728    int ret = 0;
 729    uint64_t bytes = 0;
 730    uint32_t blocks;
 731    size_t block_size = DEFAULT_CLUSTER_SIZE;
 732    uint32_t image_type = VDI_TYPE_DYNAMIC;
 733    VdiHeader header;
 734    size_t i;
 735    size_t bmap_size;
 736    int64_t offset = 0;
 737    Error *local_err = NULL;
 738    BlockBackend *blk = NULL;
 739    uint32_t *bmap = NULL;
 740
 741    logout("\n");
 742
 743    /* Read out options. */
 744    bytes = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
 745                     BDRV_SECTOR_SIZE);
 746#if defined(CONFIG_VDI_BLOCK_SIZE)
 747    /* TODO: Additional checks (SECTOR_SIZE * 2^n, ...). */
 748    block_size = qemu_opt_get_size_del(opts,
 749                                       BLOCK_OPT_CLUSTER_SIZE,
 750                                       DEFAULT_CLUSTER_SIZE);
 751#endif
 752#if defined(CONFIG_VDI_STATIC_IMAGE)
 753    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_STATIC, false)) {
 754        image_type = VDI_TYPE_STATIC;
 755    }
 756#endif
 757
 758    if (bytes > VDI_DISK_SIZE_MAX) {
 759        ret = -ENOTSUP;
 760        error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64
 761                          ", max supported is 0x%" PRIx64 ")",
 762                          bytes, VDI_DISK_SIZE_MAX);
 763        goto exit;
 764    }
 765
 766    ret = bdrv_create_file(filename, opts, &local_err);
 767    if (ret < 0) {
 768        error_propagate(errp, local_err);
 769        goto exit;
 770    }
 771
 772    blk = blk_new_open(filename, NULL, NULL,
 773                       BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err);
 774    if (blk == NULL) {
 775        error_propagate(errp, local_err);
 776        ret = -EIO;
 777        goto exit;
 778    }
 779
 780    blk_set_allow_write_beyond_eof(blk, true);
 781
 782    /* We need enough blocks to store the given disk size,
 783       so always round up. */
 784    blocks = DIV_ROUND_UP(bytes, block_size);
 785
 786    bmap_size = blocks * sizeof(uint32_t);
 787    bmap_size = ROUND_UP(bmap_size, SECTOR_SIZE);
 788
 789    memset(&header, 0, sizeof(header));
 790    pstrcpy(header.text, sizeof(header.text), VDI_TEXT);
 791    header.signature = VDI_SIGNATURE;
 792    header.version = VDI_VERSION_1_1;
 793    header.header_size = 0x180;
 794    header.image_type = image_type;
 795    header.offset_bmap = 0x200;
 796    header.offset_data = 0x200 + bmap_size;
 797    header.sector_size = SECTOR_SIZE;
 798    header.disk_size = bytes;
 799    header.block_size = block_size;
 800    header.blocks_in_image = blocks;
 801    if (image_type == VDI_TYPE_STATIC) {
 802        header.blocks_allocated = blocks;
 803    }
 804    uuid_generate(header.uuid_image);
 805    uuid_generate(header.uuid_last_snap);
 806    /* There is no need to set header.uuid_link or header.uuid_parent here. */
 807#if defined(CONFIG_VDI_DEBUG)
 808    vdi_header_print(&header);
 809#endif
 810    vdi_header_to_le(&header);
 811    ret = blk_pwrite(blk, offset, &header, sizeof(header));
 812    if (ret < 0) {
 813        error_setg(errp, "Error writing header to %s", filename);
 814        goto exit;
 815    }
 816    offset += sizeof(header);
 817
 818    if (bmap_size > 0) {
 819        bmap = g_try_malloc0(bmap_size);
 820        if (bmap == NULL) {
 821            ret = -ENOMEM;
 822            error_setg(errp, "Could not allocate bmap");
 823            goto exit;
 824        }
 825        for (i = 0; i < blocks; i++) {
 826            if (image_type == VDI_TYPE_STATIC) {
 827                bmap[i] = i;
 828            } else {
 829                bmap[i] = VDI_UNALLOCATED;
 830            }
 831        }
 832        ret = blk_pwrite(blk, offset, bmap, bmap_size);
 833        if (ret < 0) {
 834            error_setg(errp, "Error writing bmap to %s", filename);
 835            goto exit;
 836        }
 837        offset += bmap_size;
 838    }
 839
 840    if (image_type == VDI_TYPE_STATIC) {
 841        ret = blk_truncate(blk, offset + blocks * block_size);
 842        if (ret < 0) {
 843            error_setg(errp, "Failed to statically allocate %s", filename);
 844            goto exit;
 845        }
 846    }
 847
 848exit:
 849    blk_unref(blk);
 850    g_free(bmap);
 851    return ret;
 852}
 853
 854static void vdi_close(BlockDriverState *bs)
 855{
 856    BDRVVdiState *s = bs->opaque;
 857
 858    qemu_vfree(s->bmap);
 859
 860    migrate_del_blocker(s->migration_blocker);
 861    error_free(s->migration_blocker);
 862}
 863
/* Options accepted when creating an image (read by vdi_create()).
 * The cluster size and static options are only offered when the matching
 * CONFIG_VDI_* macro is defined. */
static QemuOptsList vdi_create_opts = {
    .name = "vdi-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vdi_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
#if defined(CONFIG_VDI_BLOCK_SIZE)
        {
            .name = BLOCK_OPT_CLUSTER_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "VDI cluster (block) size",
            .def_value_str = stringify(DEFAULT_CLUSTER_SIZE)
        },
#endif
#if defined(CONFIG_VDI_STATIC_IMAGE)
        {
            .name = BLOCK_OPT_STATIC,
            .type = QEMU_OPT_BOOL,
            .help = "VDI static (pre-allocated) image",
            .def_value_str = "off"
        },
#endif
        /* TODO: An additional option to set UUID values might be useful. */
        { /* end of list */ }
    }
};
 893
 894static BlockDriver bdrv_vdi = {
 895    .format_name = "vdi",
 896    .instance_size = sizeof(BDRVVdiState),
 897    .bdrv_probe = vdi_probe,
 898    .bdrv_open = vdi_open,
 899    .bdrv_close = vdi_close,
 900    .bdrv_reopen_prepare = vdi_reopen_prepare,
 901    .bdrv_create = vdi_create,
 902    .bdrv_has_zero_init = bdrv_has_zero_init_1,
 903    .bdrv_co_get_block_status = vdi_co_get_block_status,
 904    .bdrv_make_empty = vdi_make_empty,
 905
 906    .bdrv_read = vdi_co_read,
 907#if defined(CONFIG_VDI_WRITE)
 908    .bdrv_write = vdi_co_write,
 909#endif
 910
 911    .bdrv_get_info = vdi_get_info,
 912
 913    .create_opts = &vdi_create_opts,
 914    .bdrv_check = vdi_check,
 915};
 916
/* Module init hook: registers the VDI driver with the block layer. */
static void bdrv_vdi_init(void)
{
    logout("\n");
    bdrv_register(&bdrv_vdi);
}

/* Hooks bdrv_vdi_init() into the block module initialization. */
block_init(bdrv_vdi_init);
 924