qemu/block/vpc.c
<<
>>
Prefs
   1/*
   2 * Block driver for Connectix / Microsoft Virtual PC images
   3 *
   4 * Copyright (c) 2005 Alex Beregszaszi
   5 * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
   6 *
   7 * Permission is hereby granted, free of charge, to any person obtaining a copy
   8 * of this software and associated documentation files (the "Software"), to deal
   9 * in the Software without restriction, including without limitation the rights
  10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 * copies of the Software, and to permit persons to whom the Software is
  12 * furnished to do so, subject to the following conditions:
  13 *
  14 * The above copyright notice and this permission notice shall be included in
  15 * all copies or substantial portions of the Software.
  16 *
  17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23 * THE SOFTWARE.
  24 */
  25
  26#include "qemu/osdep.h"
  27#include "qapi/error.h"
  28#include "block/block_int.h"
  29#include "block/qdict.h"
  30#include "sysemu/block-backend.h"
  31#include "qemu/module.h"
  32#include "qemu/option.h"
  33#include "migration/blocker.h"
  34#include "qemu/bswap.h"
  35#include "qemu/uuid.h"
  36#include "qapi/qmp/qdict.h"
  37#include "qapi/qobject-input-visitor.h"
  38#include "qapi/qapi-visit-block-core.h"
  39
  40/**************************************************************/
  41
/*
 * Size in bytes of one VHD footer copy. It sizes BDRVVPCState.footer_buf
 * and is the length of every footer read and write in this driver;
 * vpc_open() also uses it as the length of the dynamic disk header read.
 */
#define HEADER_SIZE 512

/*
 * Build-time switch, disabled by default. When defined, BDRVVPCState gains
 * the pageentry_* / last_bitmap fields and vpc_open() allocates them.
 */
//#define CACHE
  45
/*
 * Disk type codes. The driver compares the (big-endian) footer "type" field
 * against VHD_FIXED to choose between pass-through I/O and BAT-based I/O.
 */
enum vhd_type {
    VHD_FIXED           = 2,    /* raw data followed by a trailing footer */
    VHD_DYNAMIC         = 3,    /* blocks located via the BAT */
    VHD_DIFFERENCING    = 4,    /* NOTE(review): presumably has a parent
                                   image; not referenced in this view */
};
  51
/*
 * Unix time (seconds since 1970-01-01) of Jan 1, 2000 0:00:00 (UTC), which
 * is the epoch that VHD footer timestamps are counted from.
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Largest values used for the cylinder/head/sector geometry fields */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

/* Upper bound on the virtual disk size, expressed in 512-byte sectors */
#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
/*
 * Sector count described by the maximum CHS geometry (65535 x 16 x 255).
 * A footer carrying exactly this geometry makes the driver fall back to the
 * footer's current_size instead of the CHS product.
 */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/*
 * Option name string.
 * NOTE(review): presumably the image-creation "force_size" option; its user
 * is outside the part of the file visible here.
 */
#define VPC_OPT_FORCE_SIZE "force_size"
  63
/*
 * VHD hard disk footer as laid out in the image file.
 *
 * All multi-byte integer fields are always big-endian; convert them with
 * be*_to_cpu() / cpu_to_be*() on access. The structure is packed so that it
 * can be overlaid directly on a raw buffer read from the file
 * (see BDRVVPCState.footer_buf).
 */
typedef struct vhd_footer {
    char        creator[8]; /* "conectix" */
    uint32_t    features;
    uint32_t    version;

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
    uint32_t    timestamp;

    /*
     * Four-character tag of the creating application. vpc_open() uses it to
     * decide whether the disk size comes from CHS geometry or current_size.
     */
    char        creator_app[4]; /*  e.g., "vpc " */
    uint16_t    major;
    uint16_t    minor;
    char        creator_os[4]; /* "Wi2k" */

    uint64_t    orig_size;
    /* Disk size in bytes (vpc_open() divides it by BDRV_SECTOR_SIZE) */
    uint64_t    current_size;

    /*
     * Disk geometry. The number of sectors implied by the geometry is
     * cyls * heads * secs_per_cyl.
     */
    uint16_t    cyls;
    uint8_t     heads;
    uint8_t     secs_per_cyl;

    /* One of enum vhd_type */
    uint32_t    type;

    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
       the bytes in the footer without the checksum field") */
    uint32_t    checksum;

    /* UUID used to identify a parent hard disk (backing file) */
    QemuUUID    uuid;

    uint8_t     in_saved_state;
} QEMU_PACKED VHDFooter;
  99
/*
 * Dynamic disk header as laid out in the image file. vpc_open() reads it
 * from the offset given by the footer's data_offset, and
 * create_dynamic_disk() writes it at byte offset 512.
 *
 * Multi-byte integer fields are big-endian (accessed through
 * be*_to_cpu() / cpu_to_be*()); the structure is packed so it can be
 * overlaid on a raw buffer.
 */
typedef struct vhd_dyndisk_header {
    char        magic[8]; /* "cxsparse" */

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Offset of the Block Allocation Table (BAT) */
    uint64_t    table_offset;

    uint32_t    version;
    /* Number of entries in the BAT */
    uint32_t    max_table_entries; /* 32bit/entry */

    /* 2 MB by default, must be a power of two */
    uint32_t    block_size;

    /* create_dynamic_disk() computes this with vpc_checksum() over 1024
       bytes while this field is still zero */
    uint32_t    checksum;
    uint8_t     parent_uuid[16];
    uint32_t    parent_timestamp;
    uint32_t    reserved;

    /* Backing file name (in UTF-16) */
    uint8_t     parent_name[512];

    /* NOTE(review): presumably locators for the parent image of a
       differencing disk; not used by the code visible here */
    struct {
        uint32_t    platform;
        uint32_t    data_space;
        uint32_t    data_length;
        uint32_t    reserved;
        uint64_t    data_offset;
    } parent_locator[8];
} QEMU_PACKED VHDDynDiskHeader;
 131
/* Per-image driver state, stored in BlockDriverState.opaque */
typedef struct BDRVVPCState {
    /* Taken around BAT-based read, write and block-status requests */
    CoMutex lock;
    /* Raw copy of the footer as read from the file (fields big-endian) */
    uint8_t footer_buf[HEADER_SIZE];
    /* File offset at which the next new block is placed; the trailing
       footer is rewritten at this offset after the file grows */
    uint64_t free_data_block_offset;
    /* Number of BAT entries, taken from the dynamic disk header */
    int max_table_entries;
    /* In-memory BAT in host byte order: 0xFFFFFFFF means unallocated,
       any other value is the block's position in 512-byte units */
    uint32_t *pagetable;
    /* File offset of the on-disk BAT */
    uint64_t bat_offset;
    /* File offset of the block bitmap most recently filled in by a write;
       lets get_image_offset() skip rewriting the same bitmap */
    uint64_t last_bitmap_offset;

    /* Size in bytes of the data part of one block */
    uint32_t block_size;
    /* Size in bytes of the sector bitmap preceding each block's data */
    uint32_t bitmap_size;
    /* Runtime option overrides for how the disk size is determined */
    bool force_use_chs;
    bool force_use_sz;

#ifdef CACHE
    uint8_t *pageentry_u8;
    uint32_t *pageentry_u32;
    uint16_t *pageentry_u16;

    uint64_t last_bitmap;
#endif

    /* Blocker registered by vpc_open() to forbid live migration */
    Error *migration_blocker;
} BDRVVPCState;
 156
/* Name of the runtime option parsed by vpc_parse_options() */
#define VPC_OPT_SIZE_CALC "force_size_calc"
/*
 * Runtime (open-time) options understood by this driver. vpc_open() absorbs
 * matching keys from the options QDict into a QemuOpts built from this list.
 */
static QemuOptsList vpc_runtime_opts = {
    .name = "vpc-runtime-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
    .desc = {
        {
            /* Accepted values are "chs" and "current_size" */
            .name = VPC_OPT_SIZE_CALC,
            .type = QEMU_OPT_STRING,
            .help = "Force disk size calculation to use either CHS geometry, "
                    "or use the disk current_size specified in the VHD footer. "
                    "{chs, current_size}"
        },
        { /* end of list */ }
    }
};

/* Tentative definition; the initialised definition is not in this part of
   the file */
static QemuOptsList vpc_create_opts;
 174
 175static uint32_t vpc_checksum(uint8_t* buf, size_t size)
 176{
 177    uint32_t res = 0;
 178    int i;
 179
 180    for (i = 0; i < size; i++)
 181        res += buf[i];
 182
 183    return ~res;
 184}
 185
 186
 187static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
 188{
 189    if (buf_size >= 8 && !strncmp((char *)buf, "conectix", 8))
 190        return 100;
 191    return 0;
 192}
 193
 194static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
 195                              Error **errp)
 196{
 197    BDRVVPCState *s = bs->opaque;
 198    const char *size_calc;
 199
 200    size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
 201
 202    if (!size_calc) {
 203       /* no override, use autodetect only */
 204    } else if (!strcmp(size_calc, "current_size")) {
 205        s->force_use_sz = true;
 206    } else if (!strcmp(size_calc, "chs")) {
 207        s->force_use_chs = true;
 208    } else {
 209        error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
 210    }
 211}
 212
 213static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
 214                    Error **errp)
 215{
 216    BDRVVPCState *s = bs->opaque;
 217    int i;
 218    VHDFooter *footer;
 219    VHDDynDiskHeader *dyndisk_header;
 220    QemuOpts *opts = NULL;
 221    Error *local_err = NULL;
 222    bool use_chs;
 223    uint8_t buf[HEADER_SIZE];
 224    uint32_t checksum;
 225    uint64_t computed_size;
 226    uint64_t pagetable_size;
 227    int disk_type = VHD_DYNAMIC;
 228    int ret;
 229    int64_t bs_size;
 230
 231    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
 232                               false, errp);
 233    if (!bs->file) {
 234        return -EINVAL;
 235    }
 236
 237    opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
 238    qemu_opts_absorb_qdict(opts, options, &local_err);
 239    if (local_err) {
 240        error_propagate(errp, local_err);
 241        ret = -EINVAL;
 242        goto fail;
 243    }
 244
 245    vpc_parse_options(bs, opts, &local_err);
 246    if (local_err) {
 247        error_propagate(errp, local_err);
 248        ret = -EINVAL;
 249        goto fail;
 250    }
 251
 252    ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE);
 253    if (ret < 0) {
 254        error_setg(errp, "Unable to read VHD header");
 255        goto fail;
 256    }
 257
 258    footer = (VHDFooter *) s->footer_buf;
 259    if (strncmp(footer->creator, "conectix", 8)) {
 260        int64_t offset = bdrv_getlength(bs->file->bs);
 261        if (offset < 0) {
 262            ret = offset;
 263            error_setg(errp, "Invalid file size");
 264            goto fail;
 265        } else if (offset < HEADER_SIZE) {
 266            ret = -EINVAL;
 267            error_setg(errp, "File too small for a VHD header");
 268            goto fail;
 269        }
 270
 271        /* If a fixed disk, the footer is found only at the end of the file */
 272        ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf,
 273                         HEADER_SIZE);
 274        if (ret < 0) {
 275            goto fail;
 276        }
 277        if (strncmp(footer->creator, "conectix", 8)) {
 278            error_setg(errp, "invalid VPC image");
 279            ret = -EINVAL;
 280            goto fail;
 281        }
 282        disk_type = VHD_FIXED;
 283    }
 284
 285    checksum = be32_to_cpu(footer->checksum);
 286    footer->checksum = 0;
 287    if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum)
 288        fprintf(stderr, "block-vpc: The header checksum of '%s' is "
 289            "incorrect.\n", bs->filename);
 290
 291    /* Write 'checksum' back to footer, or else will leave it with zero. */
 292    footer->checksum = cpu_to_be32(checksum);
 293
 294    /* The visible size of a image in Virtual PC depends on the geometry
 295       rather than on the size stored in the footer (the size in the footer
 296       is too large usually) */
 297    bs->total_sectors = (int64_t)
 298        be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
 299
 300    /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
 301     * VHD image sizes differently.  VPC will rely on CHS geometry,
 302     * while Hyper-V and disk2vhd use the size specified in the footer.
 303     *
 304     * We use a couple of approaches to try and determine the correct method:
 305     * look at the Creator App field, and look for images that have CHS
 306     * geometry that is the maximum value.
 307     *
 308     * If the CHS geometry is the maximum CHS geometry, then we assume that
 309     * the size is the footer->current_size to avoid truncation.  Otherwise,
 310     * we follow the table based on footer->creator_app:
 311     *
 312     *  Known creator apps:
 313     *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
 314     *      'qemu'  :  CHS              QEMU (uses disk geometry)
 315     *      'qem2'  :  current_size     QEMU (uses current_size)
 316     *      'win '  :  current_size     Hyper-V
 317     *      'd2v '  :  current_size     Disk2vhd
 318     *      'tap\0' :  current_size     XenServer
 319     *      'CTXS'  :  current_size     XenConverter
 320     *
 321     *  The user can override the table values via drive options, however
 322     *  even with an override we will still use current_size for images
 323     *  that have CHS geometry of the maximum size.
 324     */
 325    use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
 326               !!strncmp(footer->creator_app, "qem2", 4) &&
 327               !!strncmp(footer->creator_app, "d2v ", 4) &&
 328               !!strncmp(footer->creator_app, "CTXS", 4) &&
 329               !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
 330
 331    if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
 332        bs->total_sectors = be64_to_cpu(footer->current_size) /
 333                                        BDRV_SECTOR_SIZE;
 334    }
 335
 336    /* Allow a maximum disk size of 2040 GiB */
 337    if (bs->total_sectors > VHD_MAX_SECTORS) {
 338        ret = -EFBIG;
 339        goto fail;
 340    }
 341
 342    if (disk_type == VHD_DYNAMIC) {
 343        ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf,
 344                         HEADER_SIZE);
 345        if (ret < 0) {
 346            error_setg(errp, "Error reading dynamic VHD header");
 347            goto fail;
 348        }
 349
 350        dyndisk_header = (VHDDynDiskHeader *) buf;
 351
 352        if (strncmp(dyndisk_header->magic, "cxsparse", 8)) {
 353            error_setg(errp, "Invalid header magic");
 354            ret = -EINVAL;
 355            goto fail;
 356        }
 357
 358        s->block_size = be32_to_cpu(dyndisk_header->block_size);
 359        if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
 360            error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
 361            ret = -EINVAL;
 362            goto fail;
 363        }
 364        s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
 365
 366        s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
 367
 368        if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
 369            error_setg(errp, "Too many blocks");
 370            ret = -EINVAL;
 371            goto fail;
 372        }
 373
 374        computed_size = (uint64_t) s->max_table_entries * s->block_size;
 375        if (computed_size < bs->total_sectors * 512) {
 376            error_setg(errp, "Page table too small");
 377            ret = -EINVAL;
 378            goto fail;
 379        }
 380
 381        if (s->max_table_entries > SIZE_MAX / 4 ||
 382            s->max_table_entries > (int) INT_MAX / 4) {
 383            error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
 384                        s->max_table_entries);
 385            ret = -EINVAL;
 386            goto fail;
 387        }
 388
 389        pagetable_size = (uint64_t) s->max_table_entries * 4;
 390
 391        s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
 392        if (s->pagetable == NULL) {
 393            error_setg(errp, "Unable to allocate memory for page table");
 394            ret = -ENOMEM;
 395            goto fail;
 396        }
 397
 398        s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
 399
 400        ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable,
 401                         pagetable_size);
 402        if (ret < 0) {
 403            error_setg(errp, "Error reading pagetable");
 404            goto fail;
 405        }
 406
 407        s->free_data_block_offset =
 408            ROUND_UP(s->bat_offset + pagetable_size, 512);
 409
 410        for (i = 0; i < s->max_table_entries; i++) {
 411            be32_to_cpus(&s->pagetable[i]);
 412            if (s->pagetable[i] != 0xFFFFFFFF) {
 413                int64_t next = (512 * (int64_t) s->pagetable[i]) +
 414                    s->bitmap_size + s->block_size;
 415
 416                if (next > s->free_data_block_offset) {
 417                    s->free_data_block_offset = next;
 418                }
 419            }
 420        }
 421
 422        bs_size = bdrv_getlength(bs->file->bs);
 423        if (bs_size < 0) {
 424            error_setg_errno(errp, -bs_size, "Unable to learn image size");
 425            ret = bs_size;
 426            goto fail;
 427        }
 428        if (s->free_data_block_offset > bs_size) {
 429            error_setg(errp, "block-vpc: free_data_block_offset points after "
 430                             "the end of file. The image has been truncated.");
 431            ret = -EINVAL;
 432            goto fail;
 433        }
 434
 435        s->last_bitmap_offset = (int64_t) -1;
 436
 437#ifdef CACHE
 438        s->pageentry_u8 = g_malloc(512);
 439        s->pageentry_u32 = s->pageentry_u8;
 440        s->pageentry_u16 = s->pageentry_u8;
 441        s->last_pagetable = -1;
 442#endif
 443    }
 444
 445    /* Disable migration when VHD images are used */
 446    error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
 447               "does not support live migration",
 448               bdrv_get_device_or_node_name(bs));
 449    ret = migrate_add_blocker(s->migration_blocker, &local_err);
 450    if (local_err) {
 451        error_propagate(errp, local_err);
 452        error_free(s->migration_blocker);
 453        goto fail;
 454    }
 455
 456    qemu_co_mutex_init(&s->lock);
 457
 458    return 0;
 459
 460fail:
 461    qemu_vfree(s->pagetable);
 462#ifdef CACHE
 463    g_free(s->pageentry_u8);
 464#endif
 465    return ret;
 466}
 467
 468static int vpc_reopen_prepare(BDRVReopenState *state,
 469                              BlockReopenQueue *queue, Error **errp)
 470{
 471    return 0;
 472}
 473
 474/*
 475 * Returns the absolute byte offset of the given sector in the image file.
 476 * If the sector is not allocated, -1 is returned instead.
 477 * If an error occurred trying to write an updated block bitmap back to
 478 * the file, -2 is returned, and the error value is written to *err.
 479 * This can only happen for a write operation.
 480 *
 481 * The parameter write must be 1 if the offset will be used for a write
 482 * operation (the block bitmaps is updated then), 0 otherwise.
 483 * If write is true then err must not be NULL.
 484 */
 485static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
 486                                       bool write, int *err)
 487{
 488    BDRVVPCState *s = bs->opaque;
 489    uint64_t bitmap_offset, block_offset;
 490    uint32_t pagetable_index, offset_in_block;
 491
 492    assert(!(write && err == NULL));
 493
 494    pagetable_index = offset / s->block_size;
 495    offset_in_block = offset % s->block_size;
 496
 497    if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
 498        return -1; /* not allocated */
 499
 500    bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
 501    block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
 502
 503    /* We must ensure that we don't write to any sectors which are marked as
 504       unused in the bitmap. We get away with setting all bits in the block
 505       bitmap each time we write to a new block. This might cause Virtual PC to
 506       miss sparse read optimization, but it's not a problem in terms of
 507       correctness. */
 508    if (write && (s->last_bitmap_offset != bitmap_offset)) {
 509        uint8_t bitmap[s->bitmap_size];
 510        int r;
 511
 512        s->last_bitmap_offset = bitmap_offset;
 513        memset(bitmap, 0xff, s->bitmap_size);
 514        r = bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size);
 515        if (r < 0) {
 516            *err = r;
 517            return -2;
 518        }
 519    }
 520
 521    return block_offset;
 522}
 523
 524/*
 525 * Writes the footer to the end of the image file. This is needed when the
 526 * file grows as it overwrites the old footer
 527 *
 528 * Returns 0 on success and < 0 on error
 529 */
 530static int rewrite_footer(BlockDriverState* bs)
 531{
 532    int ret;
 533    BDRVVPCState *s = bs->opaque;
 534    int64_t offset = s->free_data_block_offset;
 535
 536    ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE);
 537    if (ret < 0)
 538        return ret;
 539
 540    return 0;
 541}
 542
 543/*
 544 * Allocates a new block. This involves writing a new footer and updating
 545 * the Block Allocation Table to use the space at the old end of the image
 546 * file (overwriting the old footer)
 547 *
 548 * Returns the sectors' offset in the image file on success and < 0 on error
 549 */
 550static int64_t alloc_block(BlockDriverState* bs, int64_t offset)
 551{
 552    BDRVVPCState *s = bs->opaque;
 553    int64_t bat_offset;
 554    uint32_t index, bat_value;
 555    int ret;
 556    uint8_t bitmap[s->bitmap_size];
 557
 558    /* Check if sector_num is valid */
 559    if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
 560        return -EINVAL;
 561    }
 562
 563    /* Write entry into in-memory BAT */
 564    index = offset / s->block_size;
 565    assert(s->pagetable[index] == 0xFFFFFFFF);
 566    s->pagetable[index] = s->free_data_block_offset / 512;
 567
 568    /* Initialize the block's bitmap */
 569    memset(bitmap, 0xff, s->bitmap_size);
 570    ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap,
 571        s->bitmap_size);
 572    if (ret < 0) {
 573        return ret;
 574    }
 575
 576    /* Write new footer (the old one will be overwritten) */
 577    s->free_data_block_offset += s->block_size + s->bitmap_size;
 578    ret = rewrite_footer(bs);
 579    if (ret < 0)
 580        goto fail;
 581
 582    /* Write BAT entry to disk */
 583    bat_offset = s->bat_offset + (4 * index);
 584    bat_value = cpu_to_be32(s->pagetable[index]);
 585    ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
 586    if (ret < 0)
 587        goto fail;
 588
 589    return get_image_offset(bs, offset, false, NULL);
 590
 591fail:
 592    s->free_data_block_offset -= (s->block_size + s->bitmap_size);
 593    return ret;
 594}
 595
 596static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 597{
 598    BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
 599    VHDFooter *footer = (VHDFooter *) s->footer_buf;
 600
 601    if (be32_to_cpu(footer->type) != VHD_FIXED) {
 602        bdi->cluster_size = s->block_size;
 603    }
 604
 605    bdi->unallocated_blocks_are_zero = true;
 606    return 0;
 607}
 608
 609static int coroutine_fn
 610vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
 611              QEMUIOVector *qiov, int flags)
 612{
 613    BDRVVPCState *s = bs->opaque;
 614    int ret;
 615    int64_t image_offset;
 616    int64_t n_bytes;
 617    int64_t bytes_done = 0;
 618    VHDFooter *footer = (VHDFooter *) s->footer_buf;
 619    QEMUIOVector local_qiov;
 620
 621    if (be32_to_cpu(footer->type) == VHD_FIXED) {
 622        return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
 623    }
 624
 625    qemu_co_mutex_lock(&s->lock);
 626    qemu_iovec_init(&local_qiov, qiov->niov);
 627
 628    while (bytes > 0) {
 629        image_offset = get_image_offset(bs, offset, false, NULL);
 630        n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
 631
 632        if (image_offset == -1) {
 633            qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
 634        } else {
 635            qemu_iovec_reset(&local_qiov);
 636            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
 637
 638            ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
 639                                 &local_qiov, 0);
 640            if (ret < 0) {
 641                goto fail;
 642            }
 643        }
 644
 645        bytes -= n_bytes;
 646        offset += n_bytes;
 647        bytes_done += n_bytes;
 648    }
 649
 650    ret = 0;
 651fail:
 652    qemu_iovec_destroy(&local_qiov);
 653    qemu_co_mutex_unlock(&s->lock);
 654
 655    return ret;
 656}
 657
 658static int coroutine_fn
 659vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
 660               QEMUIOVector *qiov, int flags)
 661{
 662    BDRVVPCState *s = bs->opaque;
 663    int64_t image_offset;
 664    int64_t n_bytes;
 665    int64_t bytes_done = 0;
 666    int ret = 0;
 667    VHDFooter *footer =  (VHDFooter *) s->footer_buf;
 668    QEMUIOVector local_qiov;
 669
 670    if (be32_to_cpu(footer->type) == VHD_FIXED) {
 671        return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
 672    }
 673
 674    qemu_co_mutex_lock(&s->lock);
 675    qemu_iovec_init(&local_qiov, qiov->niov);
 676
 677    while (bytes > 0) {
 678        image_offset = get_image_offset(bs, offset, true, &ret);
 679        if (image_offset == -2) {
 680            /* Failed to write block bitmap: can't proceed with write */
 681            goto fail;
 682        }
 683        n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
 684
 685        if (image_offset == -1) {
 686            image_offset = alloc_block(bs, offset);
 687            if (image_offset < 0) {
 688                ret = image_offset;
 689                goto fail;
 690            }
 691        }
 692
 693        qemu_iovec_reset(&local_qiov);
 694        qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
 695
 696        ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
 697                              &local_qiov, 0);
 698        if (ret < 0) {
 699            goto fail;
 700        }
 701
 702        bytes -= n_bytes;
 703        offset += n_bytes;
 704        bytes_done += n_bytes;
 705    }
 706
 707    ret = 0;
 708fail:
 709    qemu_iovec_destroy(&local_qiov);
 710    qemu_co_mutex_unlock(&s->lock);
 711
 712    return ret;
 713}
 714
 715static int coroutine_fn vpc_co_block_status(BlockDriverState *bs,
 716                                            bool want_zero,
 717                                            int64_t offset, int64_t bytes,
 718                                            int64_t *pnum, int64_t *map,
 719                                            BlockDriverState **file)
 720{
 721    BDRVVPCState *s = bs->opaque;
 722    VHDFooter *footer = (VHDFooter*) s->footer_buf;
 723    int64_t image_offset;
 724    bool allocated;
 725    int ret;
 726    int64_t n;
 727
 728    if (be32_to_cpu(footer->type) == VHD_FIXED) {
 729        *pnum = bytes;
 730        *map = offset;
 731        *file = bs->file->bs;
 732        return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
 733    }
 734
 735    qemu_co_mutex_lock(&s->lock);
 736
 737    image_offset = get_image_offset(bs, offset, false, NULL);
 738    allocated = (image_offset != -1);
 739    *pnum = 0;
 740    ret = 0;
 741
 742    do {
 743        /* All sectors in a block are contiguous (without using the bitmap) */
 744        n = ROUND_UP(offset + 1, s->block_size) - offset;
 745        n = MIN(n, bytes);
 746
 747        *pnum += n;
 748        offset += n;
 749        bytes -= n;
 750        /* *pnum can't be greater than one block for allocated
 751         * sectors since there is always a bitmap in between. */
 752        if (allocated) {
 753            *file = bs->file->bs;
 754            *map = image_offset;
 755            ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
 756            break;
 757        }
 758        if (bytes == 0) {
 759            break;
 760        }
 761        image_offset = get_image_offset(bs, offset, false, NULL);
 762    } while (image_offset == -1);
 763
 764    qemu_co_mutex_unlock(&s->lock);
 765    return ret;
 766}
 767
 768/*
 769 * Calculates the number of cylinders, heads and sectors per cylinder
 770 * based on a given number of sectors. This is the algorithm described
 771 * in the VHD specification.
 772 *
 773 * Note that the geometry doesn't always exactly match total_sectors but
 774 * may round it down.
 775 *
 776 * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
 777 * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
 778 * and instead allow up to 255 heads.
 779 */
 780static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
 781    uint8_t* heads, uint8_t* secs_per_cyl)
 782{
 783    uint32_t cyls_times_heads;
 784
 785    total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
 786
 787    if (total_sectors >= 65535LL * 16 * 63) {
 788        *secs_per_cyl = 255;
 789        *heads = 16;
 790        cyls_times_heads = total_sectors / *secs_per_cyl;
 791    } else {
 792        *secs_per_cyl = 17;
 793        cyls_times_heads = total_sectors / *secs_per_cyl;
 794        *heads = DIV_ROUND_UP(cyls_times_heads, 1024);
 795
 796        if (*heads < 4) {
 797            *heads = 4;
 798        }
 799
 800        if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
 801            *secs_per_cyl = 31;
 802            *heads = 16;
 803            cyls_times_heads = total_sectors / *secs_per_cyl;
 804        }
 805
 806        if (cyls_times_heads >= (*heads * 1024)) {
 807            *secs_per_cyl = 63;
 808            *heads = 16;
 809            cyls_times_heads = total_sectors / *secs_per_cyl;
 810        }
 811    }
 812
 813    *cyls = cyls_times_heads / *heads;
 814
 815    return 0;
 816}
 817
 818static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf,
 819                               int64_t total_sectors)
 820{
 821    VHDDynDiskHeader *dyndisk_header =
 822        (VHDDynDiskHeader *) buf;
 823    size_t block_size, num_bat_entries;
 824    int i;
 825    int ret;
 826    int64_t offset = 0;
 827
 828    /* Write the footer (twice: at the beginning and at the end) */
 829    block_size = 0x200000;
 830    num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
 831
 832    ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
 833    if (ret < 0) {
 834        goto fail;
 835    }
 836
 837    offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
 838    ret = blk_pwrite(blk, offset, buf, HEADER_SIZE, 0);
 839    if (ret < 0) {
 840        goto fail;
 841    }
 842
 843    /* Write the initial BAT */
 844    offset = 3 * 512;
 845
 846    memset(buf, 0xFF, 512);
 847    for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) {
 848        ret = blk_pwrite(blk, offset, buf, 512, 0);
 849        if (ret < 0) {
 850            goto fail;
 851        }
 852        offset += 512;
 853    }
 854
 855    /* Prepare the Dynamic Disk Header */
 856    memset(buf, 0, 1024);
 857
 858    memcpy(dyndisk_header->magic, "cxsparse", 8);
 859
 860    /*
 861     * Note: The spec is actually wrong here for data_offset, it says
 862     * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
 863     */
 864    dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
 865    dyndisk_header->table_offset = cpu_to_be64(3 * 512);
 866    dyndisk_header->version = cpu_to_be32(0x00010000);
 867    dyndisk_header->block_size = cpu_to_be32(block_size);
 868    dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);
 869
 870    dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));
 871
 872    /* Write the header */
 873    offset = 512;
 874
 875    ret = blk_pwrite(blk, offset, buf, 1024, 0);
 876    if (ret < 0) {
 877        goto fail;
 878    }
 879
 880 fail:
 881    return ret;
 882}
 883
 884static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
 885                             int64_t total_size, Error **errp)
 886{
 887    int ret;
 888
 889    /* Add footer to total size */
 890    total_size += HEADER_SIZE;
 891
 892    ret = blk_truncate(blk, total_size, PREALLOC_MODE_OFF, errp);
 893    if (ret < 0) {
 894        return ret;
 895    }
 896
 897    ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE, 0);
 898    if (ret < 0) {
 899        error_setg_errno(errp, -ret, "Unable to write VHD header");
 900        return ret;
 901    }
 902
 903    return ret;
 904}
 905
 906static int calculate_rounded_image_size(BlockdevCreateOptionsVpc *vpc_opts,
 907                                        uint16_t *out_cyls,
 908                                        uint8_t *out_heads,
 909                                        uint8_t *out_secs_per_cyl,
 910                                        int64_t *out_total_sectors,
 911                                        Error **errp)
 912{
 913    int64_t total_size = vpc_opts->size;
 914    uint16_t cyls = 0;
 915    uint8_t heads = 0;
 916    uint8_t secs_per_cyl = 0;
 917    int64_t total_sectors;
 918    int i;
 919
 920    /*
 921     * Calculate matching total_size and geometry. Increase the number of
 922     * sectors requested until we get enough (or fail). This ensures that
 923     * qemu-img convert doesn't truncate images, but rather rounds up.
 924     *
 925     * If the image size can't be represented by a spec conformant CHS geometry,
 926     * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
 927     * the image size from the VHD footer to calculate total_sectors.
 928     */
 929    if (vpc_opts->force_size) {
 930        /* This will force the use of total_size for sector count, below */
 931        cyls         = VHD_CHS_MAX_C;
 932        heads        = VHD_CHS_MAX_H;
 933        secs_per_cyl = VHD_CHS_MAX_S;
 934    } else {
 935        total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
 936        for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
 937            calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
 938        }
 939    }
 940
 941    if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
 942        total_sectors = total_size / BDRV_SECTOR_SIZE;
 943        /* Allow a maximum disk size of 2040 GiB */
 944        if (total_sectors > VHD_MAX_SECTORS) {
 945            error_setg(errp, "Disk size is too large, max size is 2040 GiB");
 946            return -EFBIG;
 947        }
 948    } else {
 949        total_sectors = (int64_t) cyls * heads * secs_per_cyl;
 950    }
 951
 952    *out_total_sectors = total_sectors;
 953    if (out_cyls) {
 954        *out_cyls = cyls;
 955        *out_heads = heads;
 956        *out_secs_per_cyl = secs_per_cyl;
 957    }
 958
 959    return 0;
 960}
 961
 962static int coroutine_fn vpc_co_create(BlockdevCreateOptions *opts,
 963                                      Error **errp)
 964{
 965    BlockdevCreateOptionsVpc *vpc_opts;
 966    BlockBackend *blk = NULL;
 967    BlockDriverState *bs = NULL;
 968
 969    uint8_t buf[1024];
 970    VHDFooter *footer = (VHDFooter *) buf;
 971    uint16_t cyls = 0;
 972    uint8_t heads = 0;
 973    uint8_t secs_per_cyl = 0;
 974    int64_t total_sectors;
 975    int64_t total_size;
 976    int disk_type;
 977    int ret = -EIO;
 978
 979    assert(opts->driver == BLOCKDEV_DRIVER_VPC);
 980    vpc_opts = &opts->u.vpc;
 981
 982    /* Validate options and set default values */
 983    total_size = vpc_opts->size;
 984
 985    if (!vpc_opts->has_subformat) {
 986        vpc_opts->subformat = BLOCKDEV_VPC_SUBFORMAT_DYNAMIC;
 987    }
 988    switch (vpc_opts->subformat) {
 989    case BLOCKDEV_VPC_SUBFORMAT_DYNAMIC:
 990        disk_type = VHD_DYNAMIC;
 991        break;
 992    case BLOCKDEV_VPC_SUBFORMAT_FIXED:
 993        disk_type = VHD_FIXED;
 994        break;
 995    default:
 996        g_assert_not_reached();
 997    }
 998
 999    /* Create BlockBackend to write to the image */
1000    bs = bdrv_open_blockdev_ref(vpc_opts->file, errp);
1001    if (bs == NULL) {
1002        return -EIO;
1003    }
1004
1005    blk = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
1006    ret = blk_insert_bs(blk, bs, errp);
1007    if (ret < 0) {
1008        goto out;
1009    }
1010    blk_set_allow_write_beyond_eof(blk, true);
1011
1012    /* Get geometry and check that it matches the image size*/
1013    ret = calculate_rounded_image_size(vpc_opts, &cyls, &heads, &secs_per_cyl,
1014                                       &total_sectors, errp);
1015    if (ret < 0) {
1016        goto out;
1017    }
1018
1019    if (total_size != total_sectors * BDRV_SECTOR_SIZE) {
1020        error_setg(errp, "The requested image size cannot be represented in "
1021                         "CHS geometry");
1022        error_append_hint(errp, "Try size=%llu or force-size=on (the "
1023                                "latter makes the image incompatible with "
1024                                "Virtual PC)",
1025                          total_sectors * BDRV_SECTOR_SIZE);
1026        ret = -EINVAL;
1027        goto out;
1028    }
1029
1030    /* Prepare the Hard Disk Footer */
1031    memset(buf, 0, 1024);
1032
1033    memcpy(footer->creator, "conectix", 8);
1034    if (vpc_opts->force_size) {
1035        memcpy(footer->creator_app, "qem2", 4);
1036    } else {
1037        memcpy(footer->creator_app, "qemu", 4);
1038    }
1039    memcpy(footer->creator_os, "Wi2k", 4);
1040
1041    footer->features = cpu_to_be32(0x02);
1042    footer->version = cpu_to_be32(0x00010000);
1043    if (disk_type == VHD_DYNAMIC) {
1044        footer->data_offset = cpu_to_be64(HEADER_SIZE);
1045    } else {
1046        footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
1047    }
1048    footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
1049
1050    /* Version of Virtual PC 2007 */
1051    footer->major = cpu_to_be16(0x0005);
1052    footer->minor = cpu_to_be16(0x0003);
1053    footer->orig_size = cpu_to_be64(total_size);
1054    footer->current_size = cpu_to_be64(total_size);
1055    footer->cyls = cpu_to_be16(cyls);
1056    footer->heads = heads;
1057    footer->secs_per_cyl = secs_per_cyl;
1058
1059    footer->type = cpu_to_be32(disk_type);
1060
1061    qemu_uuid_generate(&footer->uuid);
1062
1063    footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));
1064
1065    if (disk_type == VHD_DYNAMIC) {
1066        ret = create_dynamic_disk(blk, buf, total_sectors);
1067        if (ret < 0) {
1068            error_setg(errp, "Unable to create or write VHD header");
1069        }
1070    } else {
1071        ret = create_fixed_disk(blk, buf, total_size, errp);
1072    }
1073
1074out:
1075    blk_unref(blk);
1076    bdrv_unref(bs);
1077    return ret;
1078}
1079
1080static int coroutine_fn vpc_co_create_opts(const char *filename,
1081                                           QemuOpts *opts, Error **errp)
1082{
1083    BlockdevCreateOptions *create_options = NULL;
1084    QDict *qdict;
1085    Visitor *v;
1086    BlockDriverState *bs = NULL;
1087    Error *local_err = NULL;
1088    int ret;
1089
1090    static const QDictRenames opt_renames[] = {
1091        { VPC_OPT_FORCE_SIZE,           "force-size" },
1092        { NULL, NULL },
1093    };
1094
1095    /* Parse options and convert legacy syntax */
1096    qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vpc_create_opts, true);
1097
1098    if (!qdict_rename_keys(qdict, opt_renames, errp)) {
1099        ret = -EINVAL;
1100        goto fail;
1101    }
1102
1103    /* Create and open the file (protocol layer) */
1104    ret = bdrv_create_file(filename, opts, &local_err);
1105    if (ret < 0) {
1106        error_propagate(errp, local_err);
1107        goto fail;
1108    }
1109
1110    bs = bdrv_open(filename, NULL, NULL,
1111                   BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
1112    if (bs == NULL) {
1113        ret = -EIO;
1114        goto fail;
1115    }
1116
1117    /* Now get the QAPI type BlockdevCreateOptions */
1118    qdict_put_str(qdict, "driver", "vpc");
1119    qdict_put_str(qdict, "file", bs->node_name);
1120
1121    v = qobject_input_visitor_new_flat_confused(qdict, errp);
1122    if (!v) {
1123        ret = -EINVAL;
1124        goto fail;
1125    }
1126
1127    visit_type_BlockdevCreateOptions(v, NULL, &create_options, &local_err);
1128    visit_free(v);
1129
1130    if (local_err) {
1131        error_propagate(errp, local_err);
1132        ret = -EINVAL;
1133        goto fail;
1134    }
1135
1136    /* Silently round up size */
1137    assert(create_options->driver == BLOCKDEV_DRIVER_VPC);
1138    create_options->u.vpc.size =
1139        ROUND_UP(create_options->u.vpc.size, BDRV_SECTOR_SIZE);
1140
1141    if (!create_options->u.vpc.force_size) {
1142        int64_t total_sectors;
1143        ret = calculate_rounded_image_size(&create_options->u.vpc, NULL, NULL,
1144                                           NULL, &total_sectors, errp);
1145        if (ret < 0) {
1146            goto fail;
1147        }
1148
1149        create_options->u.vpc.size = total_sectors * BDRV_SECTOR_SIZE;
1150    }
1151
1152
1153    /* Create the vpc image (format layer) */
1154    ret = vpc_co_create(create_options, errp);
1155
1156fail:
1157    qobject_unref(qdict);
1158    bdrv_unref(bs);
1159    qapi_free_BlockdevCreateOptions(create_options);
1160    return ret;
1161}
1162
1163
1164static int vpc_has_zero_init(BlockDriverState *bs)
1165{
1166    BDRVVPCState *s = bs->opaque;
1167    VHDFooter *footer =  (VHDFooter *) s->footer_buf;
1168
1169    if (be32_to_cpu(footer->type) == VHD_FIXED) {
1170        return bdrv_has_zero_init(bs->file->bs);
1171    } else {
1172        return 1;
1173    }
1174}
1175
1176static void vpc_close(BlockDriverState *bs)
1177{
1178    BDRVVPCState *s = bs->opaque;
1179    qemu_vfree(s->pagetable);
1180#ifdef CACHE
1181    g_free(s->pageentry_u8);
1182#endif
1183
1184    migrate_del_blocker(s->migration_blocker);
1185    error_free(s->migration_blocker);
1186}
1187
1188static QemuOptsList vpc_create_opts = {
1189    .name = "vpc-create-opts",
1190    .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1191    .desc = {
1192        {
1193            .name = BLOCK_OPT_SIZE,
1194            .type = QEMU_OPT_SIZE,
1195            .help = "Virtual disk size"
1196        },
1197        {
1198            .name = BLOCK_OPT_SUBFMT,
1199            .type = QEMU_OPT_STRING,
1200            .help =
1201                "Type of virtual hard disk format. Supported formats are "
1202                "{dynamic (default) | fixed} "
1203        },
1204        {
1205            .name = VPC_OPT_FORCE_SIZE,
1206            .type = QEMU_OPT_BOOL,
1207            .help = "Force disk size calculation to use the actual size "
1208                    "specified, rather than using the nearest CHS-based "
1209                    "calculation"
1210        },
1211        { /* end of list */ }
1212    }
1213};
1214
1215static BlockDriver bdrv_vpc = {
1216    .format_name    = "vpc",
1217    .instance_size  = sizeof(BDRVVPCState),
1218
1219    .bdrv_probe             = vpc_probe,
1220    .bdrv_open              = vpc_open,
1221    .bdrv_close             = vpc_close,
1222    .bdrv_reopen_prepare    = vpc_reopen_prepare,
1223    .bdrv_child_perm        = bdrv_format_default_perms,
1224    .bdrv_co_create         = vpc_co_create,
1225    .bdrv_co_create_opts    = vpc_co_create_opts,
1226
1227    .bdrv_co_preadv             = vpc_co_preadv,
1228    .bdrv_co_pwritev            = vpc_co_pwritev,
1229    .bdrv_co_block_status       = vpc_co_block_status,
1230
1231    .bdrv_get_info          = vpc_get_info,
1232
1233    .create_opts            = &vpc_create_opts,
1234    .bdrv_has_zero_init     = vpc_has_zero_init,
1235};
1236
1237static void bdrv_vpc_init(void)
1238{
1239    bdrv_register(&bdrv_vpc);
1240}
1241
1242block_init(bdrv_vpc_init);
1243