qemu/block/vpc.c
<<
>>
Prefs
   1/*
   2 * Block driver for Connectix / Microsoft Virtual PC images
   3 *
   4 * Copyright (c) 2005 Alex Beregszaszi
   5 * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
   6 *
   7 * Permission is hereby granted, free of charge, to any person obtaining a copy
   8 * of this software and associated documentation files (the "Software"), to deal
   9 * in the Software without restriction, including without limitation the rights
  10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 * copies of the Software, and to permit persons to whom the Software is
  12 * furnished to do so, subject to the following conditions:
  13 *
  14 * The above copyright notice and this permission notice shall be included in
  15 * all copies or substantial portions of the Software.
  16 *
  17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23 * THE SOFTWARE.
  24 */
  25
  26#include "qemu/osdep.h"
  27#include "qapi/error.h"
  28#include "block/block_int.h"
  29#include "block/qdict.h"
  30#include "sysemu/block-backend.h"
  31#include "qemu/module.h"
  32#include "qemu/option.h"
  33#include "migration/blocker.h"
  34#include "qemu/bswap.h"
  35#include "qemu/uuid.h"
  36#include "qapi/qmp/qdict.h"
  37#include "qapi/qobject-input-visitor.h"
  38#include "qapi/qapi-visit-block-core.h"
  39
  40/**************************************************************/
  41
  42#define HEADER_SIZE 512
  43
  44//#define CACHE
  45
  46enum vhd_type {
  47    VHD_FIXED           = 2,
  48    VHD_DYNAMIC         = 3,
  49    VHD_DIFFERENCING    = 4,
  50};
  51
  52/* Seconds since Jan 1, 2000 0:00:00 (UTC) */
  53#define VHD_TIMESTAMP_BASE 946684800
  54
  55#define VHD_CHS_MAX_C   65535LL
  56#define VHD_CHS_MAX_H   16
  57#define VHD_CHS_MAX_S   255
  58
  59#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
  60#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)
  61
  62#define VPC_OPT_FORCE_SIZE "force_size"
  63
  64/* always big-endian */
  65typedef struct vhd_footer {
  66    char        creator[8]; /* "conectix" */
  67    uint32_t    features;
  68    uint32_t    version;
  69
  70    /* Offset of next header structure, 0xFFFFFFFF if none */
  71    uint64_t    data_offset;
  72
  73    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
  74    uint32_t    timestamp;
  75
  76    char        creator_app[4]; /*  e.g., "vpc " */
  77    uint16_t    major;
  78    uint16_t    minor;
  79    char        creator_os[4]; /* "Wi2k" */
  80
  81    uint64_t    orig_size;
  82    uint64_t    current_size;
  83
  84    uint16_t    cyls;
  85    uint8_t     heads;
  86    uint8_t     secs_per_cyl;
  87
  88    uint32_t    type;
  89
  90    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
  91       the bytes in the footer without the checksum field") */
  92    uint32_t    checksum;
  93
  94    /* UUID used to identify a parent hard disk (backing file) */
  95    QemuUUID    uuid;
  96
  97    uint8_t     in_saved_state;
  98} QEMU_PACKED VHDFooter;
  99
 100typedef struct vhd_dyndisk_header {
 101    char        magic[8]; /* "cxsparse" */
 102
 103    /* Offset of next header structure, 0xFFFFFFFF if none */
 104    uint64_t    data_offset;
 105
 106    /* Offset of the Block Allocation Table (BAT) */
 107    uint64_t    table_offset;
 108
 109    uint32_t    version;
 110    uint32_t    max_table_entries; /* 32bit/entry */
 111
 112    /* 2 MB by default, must be a power of two */
 113    uint32_t    block_size;
 114
 115    uint32_t    checksum;
 116    uint8_t     parent_uuid[16];
 117    uint32_t    parent_timestamp;
 118    uint32_t    reserved;
 119
 120    /* Backing file name (in UTF-16) */
 121    uint8_t     parent_name[512];
 122
 123    struct {
 124        uint32_t    platform;
 125        uint32_t    data_space;
 126        uint32_t    data_length;
 127        uint32_t    reserved;
 128        uint64_t    data_offset;
 129    } parent_locator[8];
 130} QEMU_PACKED VHDDynDiskHeader;
 131
 132typedef struct BDRVVPCState {
 133    CoMutex lock;
 134    uint8_t footer_buf[HEADER_SIZE];
 135    uint64_t free_data_block_offset;
 136    int max_table_entries;
 137    uint32_t *pagetable;
 138    uint64_t bat_offset;
 139    uint64_t last_bitmap_offset;
 140
 141    uint32_t block_size;
 142    uint32_t bitmap_size;
 143    bool force_use_chs;
 144    bool force_use_sz;
 145
 146#ifdef CACHE
 147    uint8_t *pageentry_u8;
 148    uint32_t *pageentry_u32;
 149    uint16_t *pageentry_u16;
 150
 151    uint64_t last_bitmap;
 152#endif
 153
 154    Error *migration_blocker;
 155} BDRVVPCState;
 156
 157#define VPC_OPT_SIZE_CALC "force_size_calc"
 158static QemuOptsList vpc_runtime_opts = {
 159    .name = "vpc-runtime-opts",
 160    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
 161    .desc = {
 162        {
 163            .name = VPC_OPT_SIZE_CALC,
 164            .type = QEMU_OPT_STRING,
 165            .help = "Force disk size calculation to use either CHS geometry, "
 166                    "or use the disk current_size specified in the VHD footer. "
 167                    "{chs, current_size}"
 168        },
 169        { /* end of list */ }
 170    }
 171};
 172
 173static QemuOptsList vpc_create_opts;
 174
 175static uint32_t vpc_checksum(uint8_t* buf, size_t size)
 176{
 177    uint32_t res = 0;
 178    int i;
 179
 180    for (i = 0; i < size; i++)
 181        res += buf[i];
 182
 183    return ~res;
 184}
 185
 186
 187static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
 188{
 189    if (buf_size >= 8 && !strncmp((char *)buf, "conectix", 8))
 190        return 100;
 191    return 0;
 192}
 193
 194static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
 195                              Error **errp)
 196{
 197    BDRVVPCState *s = bs->opaque;
 198    const char *size_calc;
 199
 200    size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
 201
 202    if (!size_calc) {
 203       /* no override, use autodetect only */
 204    } else if (!strcmp(size_calc, "current_size")) {
 205        s->force_use_sz = true;
 206    } else if (!strcmp(size_calc, "chs")) {
 207        s->force_use_chs = true;
 208    } else {
 209        error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
 210    }
 211}
 212
 213static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
 214                    Error **errp)
 215{
 216    BDRVVPCState *s = bs->opaque;
 217    int i;
 218    VHDFooter *footer;
 219    VHDDynDiskHeader *dyndisk_header;
 220    QemuOpts *opts = NULL;
 221    Error *local_err = NULL;
 222    bool use_chs;
 223    uint8_t buf[HEADER_SIZE];
 224    uint32_t checksum;
 225    uint64_t computed_size;
 226    uint64_t pagetable_size;
 227    int disk_type = VHD_DYNAMIC;
 228    int ret;
 229    int64_t bs_size;
 230
 231    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
 232                               false, errp);
 233    if (!bs->file) {
 234        return -EINVAL;
 235    }
 236
 237    opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
 238    qemu_opts_absorb_qdict(opts, options, &local_err);
 239    if (local_err) {
 240        error_propagate(errp, local_err);
 241        ret = -EINVAL;
 242        goto fail;
 243    }
 244
 245    vpc_parse_options(bs, opts, &local_err);
 246    if (local_err) {
 247        error_propagate(errp, local_err);
 248        ret = -EINVAL;
 249        goto fail;
 250    }
 251
 252    ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE);
 253    if (ret < 0) {
 254        error_setg(errp, "Unable to read VHD header");
 255        goto fail;
 256    }
 257
 258    footer = (VHDFooter *) s->footer_buf;
 259    if (strncmp(footer->creator, "conectix", 8)) {
 260        int64_t offset = bdrv_getlength(bs->file->bs);
 261        if (offset < 0) {
 262            ret = offset;
 263            error_setg(errp, "Invalid file size");
 264            goto fail;
 265        } else if (offset < HEADER_SIZE) {
 266            ret = -EINVAL;
 267            error_setg(errp, "File too small for a VHD header");
 268            goto fail;
 269        }
 270
 271        /* If a fixed disk, the footer is found only at the end of the file */
 272        ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf,
 273                         HEADER_SIZE);
 274        if (ret < 0) {
 275            goto fail;
 276        }
 277        if (strncmp(footer->creator, "conectix", 8)) {
 278            error_setg(errp, "invalid VPC image");
 279            ret = -EINVAL;
 280            goto fail;
 281        }
 282        disk_type = VHD_FIXED;
 283    }
 284
 285    checksum = be32_to_cpu(footer->checksum);
 286    footer->checksum = 0;
 287    if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum) {
 288        error_setg(errp, "Incorrect header checksum");
 289        ret = -EINVAL;
 290        goto fail;
 291    }
 292
 293    /* Write 'checksum' back to footer, or else will leave it with zero. */
 294    footer->checksum = cpu_to_be32(checksum);
 295
 296    /* The visible size of a image in Virtual PC depends on the geometry
 297       rather than on the size stored in the footer (the size in the footer
 298       is too large usually) */
 299    bs->total_sectors = (int64_t)
 300        be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
 301
 302    /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
 303     * VHD image sizes differently.  VPC will rely on CHS geometry,
 304     * while Hyper-V and disk2vhd use the size specified in the footer.
 305     *
 306     * We use a couple of approaches to try and determine the correct method:
 307     * look at the Creator App field, and look for images that have CHS
 308     * geometry that is the maximum value.
 309     *
 310     * If the CHS geometry is the maximum CHS geometry, then we assume that
 311     * the size is the footer->current_size to avoid truncation.  Otherwise,
 312     * we follow the table based on footer->creator_app:
 313     *
 314     *  Known creator apps:
 315     *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
 316     *      'qemu'  :  CHS              QEMU (uses disk geometry)
 317     *      'qem2'  :  current_size     QEMU (uses current_size)
 318     *      'win '  :  current_size     Hyper-V
 319     *      'd2v '  :  current_size     Disk2vhd
 320     *      'tap\0' :  current_size     XenServer
 321     *      'CTXS'  :  current_size     XenConverter
 322     *
 323     *  The user can override the table values via drive options, however
 324     *  even with an override we will still use current_size for images
 325     *  that have CHS geometry of the maximum size.
 326     */
 327    use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
 328               !!strncmp(footer->creator_app, "qem2", 4) &&
 329               !!strncmp(footer->creator_app, "d2v ", 4) &&
 330               !!strncmp(footer->creator_app, "CTXS", 4) &&
 331               !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
 332
 333    if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
 334        bs->total_sectors = be64_to_cpu(footer->current_size) /
 335                                        BDRV_SECTOR_SIZE;
 336    }
 337
 338    /* Allow a maximum disk size of 2040 GiB */
 339    if (bs->total_sectors > VHD_MAX_SECTORS) {
 340        ret = -EFBIG;
 341        goto fail;
 342    }
 343
 344    if (disk_type == VHD_DYNAMIC) {
 345        ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
 346                         HEADER_SIZE);
 347        if (ret < 0) {
 348            error_setg(errp, "Error reading dynamic VHD header");
 349            goto fail;
 350        }
 351
 352        dyndisk_header = (VHDDynDiskHeader *) buf;
 353
 354        if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
 355            error_setg(errp, "Invalid header magic");
 356            ret = -EINVAL;
 357            goto fail;
 358        }
 359
 360        s->block_size = be32_to_cpu(dyndisk_header->block_size);
 361        if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
 362            error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
 363            ret = -EINVAL;
 364            goto fail;
 365        }
 366        s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
 367
 368        s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
 369
 370        if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
 371            error_setg(errp, "Too many blocks");
 372            ret = -EINVAL;
 373            goto fail;
 374        }
 375
 376        computed_size = (uint64_t) s->max_table_entries * s->block_size;
 377        if (computed_size < bs->total_sectors * 512) {
 378            error_setg(errp, "Page table too small");
 379            ret = -EINVAL;
 380            goto fail;
 381        }
 382
 383        if (s->max_table_entries > SIZE_MAX / 4 ||
 384            s->max_table_entries > (int) INT_MAX / 4) {
 385            error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
 386                        s->max_table_entries);
 387            ret = -EINVAL;
 388            goto fail;
 389        }
 390
 391        pagetable_size = (uint64_t) s->max_table_entries * 4;
 392
 393        s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
 394        if (s->pagetable == NULL) {
 395            error_setg(errp, "Unable to allocate memory for page table");
 396            ret = -ENOMEM;
 397            goto fail;
 398        }
 399
 400        s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
 401
 402        ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable,
 403                         pagetable_size);
 404        if (ret < 0) {
 405            error_setg(errp, "Error reading pagetable");
 406            goto fail;
 407        }
 408
 409        s->free_data_block_offset =
 410            ROUND_UP(s->bat_offset + pagetable_size, 512);
 411
 412        for (i = 0; i < s->max_table_entries; i++) {
 413            be32_to_cpus(&s->pagetable[i]);
 414            if (s->pagetable[i] != 0xFFFFFFFF) {
 415                int64_t next = (512 * (int64_t) s->pagetable[i]) +
 416                    s->bitmap_size + s->block_size;
 417
 418                if (next > s->free_data_block_offset) {
 419                    s->free_data_block_offset = next;
 420                }
 421            }
 422        }
 423
 424        bs_size = bdrv_getlength(bs->file->bs);
 425        if (bs_size < 0) {
 426            error_setg_errno(errp, -bs_size, "Unable to learn image size");
 427            ret = bs_size;
 428            goto fail;
 429        }
 430        if (s->free_data_block_offset > bs_size) {
 431            error_setg(errp, "block-vpc: free_data_block_offset points after "
 432                             "the end of file. The image has been truncated.");
 433            ret = -EINVAL;
 434            goto fail;
 435        }
 436
 437        s->last_bitmap_offset = (int64_t) -1;
 438
 439#ifdef CACHE
 440        s->pageentry_u8 = g_malloc(512);
 441        s->pageentry_u32 = s->pageentry_u8;
 442        s->pageentry_u16 = s->pageentry_u8;
 443        s->last_pagetable = -1;
 444#endif
 445    }
 446
 447    /* Disable migration when VHD images are used */
 448    error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
 449               "does not support live migration",
 450               bdrv_get_device_or_node_name(bs));
 451    ret = migrate_add_blocker(s->migration_blocker, &local_err);
 452    if (local_err) {
 453        error_propagate(errp, local_err);
 454        error_free(s->migration_blocker);
 455        goto fail;
 456    }
 457
 458    qemu_co_mutex_init(&s->lock);
 459    qemu_opts_del(opts);
 460
 461    return 0;
 462
 463fail:
 464    qemu_opts_del(opts);
 465    qemu_vfree(s->pagetable);
 466#ifdef CACHE
 467    g_free(s->pageentry_u8);
 468#endif
 469    return ret;
 470}
 471
 472static int vpc_reopen_prepare(BDRVReopenState *state,
 473                              BlockReopenQueue *queue, Error **errp)
 474{
 475    return 0;
 476}
 477
 478/*
 479 * Returns the absolute byte offset of the given sector in the image file.
 480 * If the sector is not allocated, -1 is returned instead.
 481 * If an error occurred trying to write an updated block bitmap back to
 482 * the file, -2 is returned, and the error value is written to *err.
 483 * This can only happen for a write operation.
 484 *
 485 * The parameter write must be 1 if the offset will be used for a write
 486 * operation (the block bitmaps is updated then), 0 otherwise.
 487 * If write is true then err must not be NULL.
 488 */
 489static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
 490                                       bool write, int *err)
 491{
 492    BDRVVPCState *s = bs->opaque;
 493    uint64_t bitmap_offset, block_offset;
 494    uint32_t pagetable_index, offset_in_block;
 495
 496    assert(!(write && err == NULL));
 497
 498    pagetable_index = offset / s->block_size;
 499    offset_in_block = offset % s->block_size;
 500
 501    if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
 502        return -1; /* not allocated */
 503
 504    bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
 505    block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
 506
 507    /* We must ensure that we don't write to any sectors which are marked as
 508       unused in the bitmap. We get away with setting all bits in the block
 509       bitmap each time we write to a new block. This might cause Virtual PC to
 510       miss sparse read optimization, but it's not a problem in terms of
 511       correctness. */
 512    if (write && (s->last_bitmap_offset != bitmap_offset)) {
 513        uint8_t bitmap[s->bitmap_size];
 514        int r;
 515
 516        s->last_bitmap_offset = bitmap_offset;
 517        memset(bitmap, 0xff, s->bitmap_size);
 518        r = bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size);
 519        if (r < 0) {
 520            *err = r;
 521            return -2;
 522        }
 523    }
 524
 525    return block_offset;
 526}
 527
 528/*
 529 * Writes the footer to the end of the image file. This is needed when the
 530 * file grows as it overwrites the old footer
 531 *
 532 * Returns 0 on success and < 0 on error
 533 */
 534static int rewrite_footer(BlockDriverState* bs)
 535{
 536    int ret;
 537    BDRVVPCState *s = bs->opaque;
 538    int64_t offset = s->free_data_block_offset;
 539
 540    ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE);
 541    if (ret < 0)
 542        return ret;
 543
 544    return 0;
 545}
 546
 547/*
 548 * Allocates a new block. This involves writing a new footer and updating
 549 * the Block Allocation Table to use the space at the old end of the image
 550 * file (overwriting the old footer)
 551 *
 552 * Returns the sectors' offset in the image file on success and < 0 on error
 553 */
 554static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
 555{
 556    BDRVVPCState *s = bs->opaque;
 557    int64_t bat_offset;
 558    uint32_t index, bat_value;
 559    int ret;
 560    uint8_t bitmap[s->bitmap_size];
 561
 562    /* Check if sector_num is valid */
 563    if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
 564        return -EINVAL;
 565    }
 566
 567    /* Write entry into in-memory BAT */
 568    index = offset / s->block_size;
 569    assert(s->pagetable[index] == 0xFFFFFFFF);
 570    s->pagetable[index] = s->free_data_block_offset / 512;
 571
 572    /* Initialize the block's bitmap */
 573    memset(bitmap, 0xff, s->bitmap_size);
 574    ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap,
 575        s->bitmap_size);
 576    if (ret < 0) {
 577        return ret;
 578    }
 579
 580    /* Write new footer (the old one will be overwritten) */
 581    s->free_data_block_offset += s->block_size + s->bitmap_size;
 582    ret = rewrite_footer(bs);
 583    if (ret < 0)
 584        goto fail;
 585
 586    /* Write BAT entry to disk */
 587    bat_offset = s->bat_offset + (4 * index);
 588    bat_value = cpu_to_be32(s->pagetable[index]);
 589    ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
 590    if (ret < 0)
 591        goto fail;
 592
 593    return get_image_offset(bs, offset, false, NULL);
 594
 595fail:
 596    s->free_data_block_offset -= (s->block_size + s->bitmap_size);
 597    return ret;
 598}
 599
 600static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 601{
 602    BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
 603    VHDFooter *footer = (VHDFooter *) s->footer_buf;
 604
 605    if (be32_to_cpu(footer->type) != VHD_FIXED) {
 606        bdi->cluster_size = s->block_size;
 607    }
 608
 609    bdi->unallocated_blocks_are_zero = true;
 610    return 0;
 611}
 612
 613static int coroutine_fn
 614vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
 615              QEMUIOVector *qiov, int flags)
 616{
 617    BDRVVPCState *s = bs->opaque;
 618    int ret;
 619    int64_t image_offset;
 620    int64_t n_bytes;
 621    int64_t bytes_done = 0;
 622    VHDFooter *footer = (VHDFooter *) s->footer_buf;
 623    QEMUIOVector local_qiov;
 624
 625    if (be32_to_cpu(footer->type) == VHD_FIXED) {
 626        return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
 627    }
 628
 629    qemu_co_mutex_lock(&s->lock);
 630    qemu_iovec_init(&local_qiov, qiov->niov);
 631
 632    while (bytes > 0) {
 633        image_offset = get_image_offset(bs, offset, false, NULL);
 634        n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
 635
 636        if (image_offset == -1) {
 637            qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
 638        } else {
 639            qemu_iovec_reset(&local_qiov);
 640            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
 641
 642            qemu_co_mutex_unlock(&s->lock);
 643            ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
 644                                 &local_qiov, 0);
 645            qemu_co_mutex_lock(&s->lock);
 646            if (ret < 0) {
 647                goto fail;
 648            }
 649        }
 650
 651        bytes -= n_bytes;
 652        offset += n_bytes;
 653        bytes_done += n_bytes;
 654    }
 655
 656    ret = 0;
 657fail:
 658    qemu_iovec_destroy(&local_qiov);
 659    qemu_co_mutex_unlock(&s->lock);
 660
 661    return ret;
 662}
 663
 664static int coroutine_fn
 665vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
 666               QEMUIOVector *qiov, int flags)
 667{
 668    BDRVVPCState *s = bs->opaque;
 669    int64_t image_offset;
 670    int64_t n_bytes;
 671    int64_t bytes_done = 0;
 672    int ret = 0;
 673    VHDFooter *footer =  (VHDFooter *) s->footer_buf;
 674    QEMUIOVector local_qiov;
 675
 676    if (be32_to_cpu(footer->type) == VHD_FIXED) {
 677        return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
 678    }
 679
 680    qemu_co_mutex_lock(&s->lock);
 681    qemu_iovec_init(&local_qiov, qiov->niov);
 682
 683    while (bytes > 0) {
 684        image_offset = get_image_offset(bs, offset, true, &ret);
 685        if (image_offset == -2) {
 686            /* Failed to write block bitmap: can't proceed with write */
 687            goto fail;
 688        }
 689        n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
 690
 691        if (image_offset == -1) {
 692            image_offset = alloc_block(bs, offset);
 693            if (image_offset < 0) {
 694                ret = image_offset;
 695                goto fail;
 696            }
 697        }
 698
 699        qemu_iovec_reset(&local_qiov);
 700        qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
 701
 702        qemu_co_mutex_unlock(&s->lock);
 703        ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
 704                              &local_qiov, 0);
 705        qemu_co_mutex_lock(&s->lock);
 706        if (ret < 0) {
 707            goto fail;
 708        }
 709
 710        bytes -= n_bytes;
 711        offset += n_bytes;
 712        bytes_done += n_bytes;
 713    }
 714
 715    ret = 0;
 716fail:
 717    qemu_iovec_destroy(&local_qiov);
 718    qemu_co_mutex_unlock(&s->lock);
 719
 720    return ret;
 721}
 722
 723static int coroutine_fn vpc_co_block_status(BlockDriverState *bs,
 724                                            bool want_zero,
 725                                            int64_t offset, int64_t bytes,
 726                                            int64_t *pnum, int64_t *map,
 727                                            BlockDriverState **file)
 728{
 729    BDRVVPCState *s = bs->opaque;
 730    VHDFooter *footer = (VHDFooter*) s->footer_buf;
 731    int64_t image_offset;
 732    bool allocated;
 733    int ret;
 734    int64_t n;
 735
 736    if (be32_to_cpu(footer->type) == VHD_FIXED) {
 737        *pnum = bytes;
 738        *map = offset;
 739        *file = bs->file->bs;
 740        return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
 741    }
 742
 743    qemu_co_mutex_lock(&s->lock);
 744
 745    image_offset = get_image_offset(bs, offset, false, NULL);
 746    allocated = (image_offset != -1);
 747    *pnum = 0;
 748    ret = 0;
 749
 750    do {
 751        /* All sectors in a block are contiguous (without using the bitmap) */
 752        n = ROUND_UP(offset + 1, s->block_size) - offset;
 753        n = MIN(n, bytes);
 754
 755        *pnum += n;
 756        offset += n;
 757        bytes -= n;
 758        /* *pnum can't be greater than one block for allocated
 759         * sectors since there is always a bitmap in between. */
 760        if (allocated) {
 761            *file = bs->file->bs;
 762            *map = image_offset;
 763            ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
 764            break;
 765        }
 766        if (bytes == 0) {
 767            break;
 768        }
 769        image_offset = get_image_offset(bs, offset, false, NULL);
 770    } while (image_offset == -1);
 771
 772    qemu_co_mutex_unlock(&s->lock);
 773    return ret;
 774}
 775
 776/*
 777 * Calculates the number of cylinders, heads and sectors per cylinder
 778 * based on a given number of sectors. This is the algorithm described
 779 * in the VHD specification.
 780 *
 781 * Note that the geometry doesn't always exactly match total_sectors but
 782 * may round it down.
 783 *
 784 * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
 785 * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
 786 * and instead allow up to 255 heads.
 787 */
 788static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
 789    uint8_t* heads, uint8_t* secs_per_cyl)
 790{
 791    uint32_t cyls_times_heads;
 792
 793    total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
 794
 795    if (total_sectors >= 65535LL * 16 * 63) {
 796        *secs_per_cyl = 255;
 797        *heads = 16;
 798        cyls_times_heads = total_sectors / *secs_per_cyl;
 799    } else {
 800        *secs_per_cyl = 17;
 801        cyls_times_heads = total_sectors / *secs_per_cyl;
 802        *heads = DIV_ROUND_UP(cyls_times_heads, 1024);
 803
 804        if (*heads < 4) {
 805            *heads = 4;
 806        }
 807
 808        if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
 809            *secs_per_cyl = 31;
 810            *heads = 16;
 811            cyls_times_heads = total_sectors / *secs_per_cyl;
 812        }
 813
 814        if (cyls_times_heads >= (*heads * 1024)) {
 815            *secs_per_cyl = 63;
 816            *heads = 16;
 817            cyls_times_heads = total_sectors / *secs_per_cyl;
 818        }
 819    }
 820
 821    *cyls = cyls_times_heads / *heads;
 822
 823    return 0;
 824}
 825
 826static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
 827                               int64_t total_sectors)
 828{
 829    VHDDynDiskHeader *dyndisk_header =
 830        (VHDDynDiskHeader *) buf;
 831    size_t block_size, num_bat_entries;
 832    int i;
 833    int ret;
 834    int64_t offset = 0;
 835
 836    /* Write the footer (twice: at the beginning and at the end) */
 837    block_size = 0x200000;
 838    num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
 839
 840    ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
 841    if (ret < 0) {
 842        goto fail;
 843    }
 844
 845    offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
 846    ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
 847    if (ret < 0) {
 848        goto fail;
 849    }
 850
 851    /* Write the initial BAT */
 852    offset = 3 * 512;
 853
 854    memset(buf, 0xFF, 512);
 855    for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) {
 856        ret = blk_pwrite(blk, offset, buf, 512, 0);
 857        if (ret < 0) {
 858            goto fail;
 859        }
 860        offset += 512;
 861    }
 862
 863    /* Prepare the Dynamic Disk Header */
 864    memset(buf, 0, 1024);
 865
 866    memcpy(dyndisk_header->magic, "cxsparse", 8);
 867
 868    /*
 869     * Note: The spec is actually wrong here for data_offset, it says
 870     * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
 871     */
 872    dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
 873    dyndisk_header->table_offset = cpu_to_be64(3 * 512);
 874    dyndisk_header->version = cpu_to_be32(0x00010000);
 875    dyndisk_header->block_size = cpu_to_be32(block_size);
 876    dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);
 877
 878    dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));
 879
 880    /* Write the header */
 881    offset = 512;
 882
 883    ret = blk_pwrite(blk, offset, buf, 1024, 0);
 884    if (ret < 0) {
 885        goto fail;
 886    }
 887
 888    ret = 0;
 889 fail:
 890    return ret;
 891}
 892
 893static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
 894                             int64_t total_size, Error **errp)
 895{
 896    int ret;
 897
 898    /* Add footer to total size */
 899    total_size += HEADER_SIZE;
 900
 901    ret = blk_truncate(blk, total_size, PREALLOC_MODE_OFF, errp);
 902    if (ret < 0) {
 903        return ret;
 904    }
 905
 906    ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0);
 907    if (ret < 0) {
 908        error_setg_errno(errp, -ret, "Unable to write VHD header");
 909        return ret;
 910    }
 911
 912    return 0;
 913}
 914
 915static int calculate_rounded_image_size(BlockdevCreateOptionsVpc *vpc_opts,
 916                                        uint16_t *out_cyls,
 917                                        uint8_t *out_heads,
 918                                        uint8_t *out_secs_per_cyl,
 919                                        int64_t *out_total_sectors,
 920                                        Error **errp)
 921{
 922    int64_t total_size = vpc_opts->size;
 923    uint16_t cyls = 0;
 924    uint8_t heads = 0;
 925    uint8_t secs_per_cyl = 0;
 926    int64_t total_sectors;
 927    int i;
 928
 929    /*
 930     * Calculate matching total_size and geometry. Increase the number of
 931     * sectors requested until we get enough (or fail). This ensures that
 932     * qemu-img convert doesn't truncate images, but rather rounds up.
 933     *
 934     * If the image size can't be represented by a spec conformant CHS geometry,
 935     * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
 936     * the image size from the VHD footer to calculate total_sectors.
 937     */
 938    if (vpc_opts->force_size) {
 939        /* This will force the use of total_size for sector count, below */
 940        cyls         = VHD_CHS_MAX_C;
 941        heads        = VHD_CHS_MAX_H;
 942        secs_per_cyl = VHD_CHS_MAX_S;
 943    } else {
 944        total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
 945        for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
 946            calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
 947        }
 948    }
 949
 950    if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
 951        total_sectors = total_size / BDRV_SECTOR_SIZE;
 952        /* Allow a maximum disk size of 2040 GiB */
 953        if (total_sectors > VHD_MAX_SECTORS) {
 954            error_setg(errp, "Disk size is too large, max size is 2040 GiB");
 955            return -EFBIG;
 956        }
 957    } else {
 958        total_sectors = (int64_t) cyls * heads * secs_per_cyl;
 959    }
 960
 961    *out_total_sectors = total_sectors;
 962    if (out_cyls) {
 963        *out_cyls = cyls;
 964        *out_heads = heads;
 965        *out_secs_per_cyl = secs_per_cyl;
 966    }
 967
 968    return 0;
 969}
 970
 971static int coroutine_fn vpc_co_create(BlockdevCreateOptions *opts,
 972                                      Error **errp)
 973{
 974    BlockdevCreateOptionsVpc *vpc_opts;
 975    BlockBackend *blk = NULL;
 976    BlockDriverState *bs = NULL;
 977
 978    uint8_t buf[1024];
 979    VHDFooter *footer = (VHDFooter *) buf;
 980    uint16_t cyls = 0;
 981    uint8_t heads = 0;
 982    uint8_t secs_per_cyl = 0;
 983    int64_t total_sectors;
 984    int64_t total_size;
 985    int disk_type;
 986    int ret = -EIO;
 987    QemuUUID uuid;
 988
 989    assert(opts->driver == BLOCKDEV_DRIVER_VPC);
 990    vpc_opts = &opts->u.vpc;
 991
 992    /* Validate options and set default values */
 993    total_size = vpc_opts->size;
 994
 995    if (!vpc_opts->has_subformat) {
 996        vpc_opts->subformat = BLOCKDEV_VPC_SUBFORMAT_DYNAMIC;
 997    }
 998    switch (vpc_opts->subformat) {
 999    case BLOCKDEV_VPC_SUBFORMAT_DYNAMIC:
1000        disk_type = VHD_DYNAMIC;
1001        break;
1002    case BLOCKDEV_VPC_SUBFORMAT_FIXED:
1003        disk_type = VHD_FIXED;
1004        break;
1005    default:
1006        g_assert_not_reached();
1007    }
1008
1009    /* Create BlockBackend to write to the image */
1010    bs = bdrv_open_blockdev_ref(vpc_opts->file, errp);
1011    if (bs == NULL) {
1012        return -EIO;
1013    }
1014
1015    blk = blk_new(bdrv_get_aio_context(bs),
1016                  BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
1017    ret = blk_insert_bs(blk, bs, errp);
1018    if (ret < 0) {
1019        goto out;
1020    }
1021    blk_set_allow_write_beyond_eof(blk, true);
1022
1023    /* Get geometry and check that it matches the image size*/
1024    ret = calculate_rounded_image_size(vpc_opts, &cyls, &heads, &secs_per_cyl,
1025                                       &total_sectors, errp);
1026    if (ret < 0) {
1027        goto out;
1028    }
1029
1030    if (total_size != total_sectors * BDRV_SECTOR_SIZE) {
1031        error_setg(errp, "The requested image size cannot be represented in "
1032                         "CHS geometry");
1033        error_append_hint(errp, "Try size=%llu or force-size=on (the "
1034                                "latter makes the image incompatible with "
1035                                "Virtual PC)",
1036                          total_sectors * BDRV_SECTOR_SIZE);
1037        ret = -EINVAL;
1038        goto out;
1039    }
1040
1041    /* Prepare the Hard Disk Footer */
1042    memset(buf, 0, 1024);
1043
1044    memcpy(footer->creator, "conectix", 8);
1045    if (vpc_opts->force_size) {
1046        memcpy(footer->creator_app, "qem2", 4);
1047    } else {
1048        memcpy(footer->creator_app, "qemu", 4);
1049    }
1050    memcpy(footer->creator_os, "Wi2k", 4);
1051
1052    footer->features = cpu_to_be32(0x02);
1053    footer->version = cpu_to_be32(0x00010000);
1054    if (disk_type == VHD_DYNAMIC) {
1055        footer->data_offset = cpu_to_be64(HEADER_SIZE);
1056    } else {
1057        footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
1058    }
1059    footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
1060
1061    /* Version of Virtual PC 2007 */
1062    footer->major = cpu_to_be16(0x0005);
1063    footer->minor = cpu_to_be16(0x0003);
1064    footer->orig_size = cpu_to_be64(total_size);
1065    footer->current_size = cpu_to_be64(total_size);
1066    footer->cyls = cpu_to_be16(cyls);
1067    footer->heads = heads;
1068    footer->secs_per_cyl = secs_per_cyl;
1069
1070    footer->type = cpu_to_be32(disk_type);
1071
1072    qemu_uuid_generate(&uuid);
1073    footer->uuid = uuid;
1074
1075    footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));
1076
1077    if (disk_type == VHD_DYNAMIC) {
1078        ret = create_dynamic_disk(blk, buf, total_sectors);
1079        if (ret < 0) {
1080            error_setg(errp, "Unable to create or write VHD header");
1081        }
1082    } else {
1083        ret = create_fixed_disk(blk, buf, total_size, errp);
1084    }
1085
1086out:
1087    blk_unref(blk);
1088    bdrv_unref(bs);
1089    return ret;
1090}
1091
1092static int coroutine_fn vpc_co_create_opts(const char *filename,
1093                                           QemuOpts *opts, Error **errp)
1094{
1095    BlockdevCreateOptions *create_options = NULL;
1096    QDict *qdict;
1097    Visitor *v;
1098    BlockDriverState *bs = NULL;
1099    Error *local_err = NULL;
1100    int ret;
1101
1102    static const QDictRenames opt_renames[] = {
1103        { VPC_OPT_FORCE_SIZE,           "force-size" },
1104        { NULL, NULL },
1105    };
1106
1107    /* Parse options and convert legacy syntax */
1108    qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vpc_create_opts, true);
1109
1110    if (!qdict_rename_keys(qdict, opt_renames, errp)) {
1111        ret = -EINVAL;
1112        goto fail;
1113    }
1114
1115    /* Create and open the file (protocol layer) */
1116    ret = bdrv_create_file(filename, opts, &local_err);
1117    if (ret < 0) {
1118        error_propagate(errp, local_err);
1119        goto fail;
1120    }
1121
1122    bs = bdrv_open(filename, NULL, NULL,
1123                   BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
1124    if (bs == NULL) {
1125        ret = -EIO;
1126        goto fail;
1127    }
1128
1129    /* Now get the QAPI type BlockdevCreateOptions */
1130    qdict_put_str(qdict, "driver", "vpc");
1131    qdict_put_str(qdict, "file", bs->node_name);
1132
1133    v = qobject_input_visitor_new_flat_confused(qdict, errp);
1134    if (!v) {
1135        ret = -EINVAL;
1136        goto fail;
1137    }
1138
1139    visit_type_BlockdevCreateOptions(v, NULL, &create_options, &local_err);
1140    visit_free(v);
1141
1142    if (local_err) {
1143        error_propagate(errp, local_err);
1144        ret = -EINVAL;
1145        goto fail;
1146    }
1147
1148    /* Silently round up size */
1149    assert(create_options->driver == BLOCKDEV_DRIVER_VPC);
1150    create_options->u.vpc.size =
1151        ROUND_UP(create_options->u.vpc.size, BDRV_SECTOR_SIZE);
1152
1153    if (!create_options->u.vpc.force_size) {
1154        int64_t total_sectors;
1155        ret = calculate_rounded_image_size(&create_options->u.vpc, NULL, NULL,
1156                                           NULL, &total_sectors, errp);
1157        if (ret < 0) {
1158            goto fail;
1159        }
1160
1161        create_options->u.vpc.size = total_sectors * BDRV_SECTOR_SIZE;
1162    }
1163
1164
1165    /* Create the vpc image (format layer) */
1166    ret = vpc_co_create(create_options, errp);
1167
1168fail:
1169    qobject_unref(qdict);
1170    bdrv_unref(bs);
1171    qapi_free_BlockdevCreateOptions(create_options);
1172    return ret;
1173}
1174
1175
1176static int vpc_has_zero_init(BlockDriverState *bs)
1177{
1178    BDRVVPCState *s = bs->opaque;
1179    VHDFooter *footer =  (VHDFooter *) s->footer_buf;
1180
1181    if (be32_to_cpu(footer->type) == VHD_FIXED) {
1182        return bdrv_has_zero_init(bs->file->bs);
1183    } else {
1184        return 1;
1185    }
1186}
1187
1188static void vpc_close(BlockDriverState *bs)
1189{
1190    BDRVVPCState *s = bs->opaque;
1191    qemu_vfree(s->pagetable);
1192#ifdef CACHE
1193    g_free(s->pageentry_u8);
1194#endif
1195
1196    migrate_del_blocker(s->migration_blocker);
1197    error_free(s->migration_blocker);
1198}
1199
/*
 * Option list for image creation.  It is exported through
 * bdrv_vpc.create_opts and is also used by vpc_co_create_opts() to filter
 * the QemuOpts it receives.
 */
static QemuOptsList vpc_create_opts = {
    .name = "vpc-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
    .desc = {
        {
            /* Size of the virtual disk */
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            /* Image subformat: dynamic (the default) or fixed */
            .name = BLOCK_OPT_SUBFMT,
            .type = QEMU_OPT_STRING,
            .help =
                "Type of virtual hard disk format. Supported formats are "
                "{dynamic (default) | fixed} "
        },
        {
            /* Use the requested size as is instead of a CHS-derived one */
            .name = VPC_OPT_FORCE_SIZE,
            .type = QEMU_OPT_BOOL,
            .help = "Force disk size calculation to use the actual size "
                    "specified, rather than using the nearest CHS-based "
                    "calculation"
        },
        { /* end of list */ }
    }
};
1226
/*
 * NULL-terminated list of runtime option names, installed as
 * bdrv_vpc.strong_runtime_opts.
 * NOTE(review): presumably these are the options the block layer treats
 * as significant for identifying the image -- confirm against the
 * BlockDriver documentation.
 */
static const char *const vpc_strong_runtime_opts[] = {
    VPC_OPT_SIZE_CALC,

    NULL
};
1232
/*
 * Driver description for the "vpc" format: ties the callbacks implemented
 * in this file to the generic BlockDriver interface.
 */
static BlockDriver bdrv_vpc = {
    .format_name    = "vpc",
    .instance_size  = sizeof(BDRVVPCState),

    /* Probing, open/close/reopen, permissions and image creation */
    .bdrv_probe             = vpc_probe,
    .bdrv_open              = vpc_open,
    .bdrv_close             = vpc_close,
    .bdrv_reopen_prepare    = vpc_reopen_prepare,
    .bdrv_child_perm        = bdrv_format_default_perms,
    .bdrv_co_create         = vpc_co_create,
    .bdrv_co_create_opts    = vpc_co_create_opts,

    /* Data path: reads, writes and allocation status */
    .bdrv_co_preadv             = vpc_co_preadv,
    .bdrv_co_pwritev            = vpc_co_pwritev,
    .bdrv_co_block_status       = vpc_co_block_status,

    .bdrv_get_info          = vpc_get_info,

    /* Creation options, zero-init reporting and strong runtime options */
    .create_opts            = &vpc_create_opts,
    .bdrv_has_zero_init     = vpc_has_zero_init,
    .strong_runtime_opts    = vpc_strong_runtime_opts,
};
1255
/* Register the vpc driver description with the block layer. */
static void bdrv_vpc_init(void)
{
    bdrv_register(&bdrv_vpc);
}

/*
 * NOTE(review): block_init() is defined outside this file; presumably it
 * arranges for bdrv_vpc_init() to run when block modules are initialised.
 */
block_init(bdrv_vpc_init);
1262