qemu/block/vpc.c
<<
>>
Prefs
   1/*
   2 * Block driver for Connectix / Microsoft Virtual PC images
   3 *
   4 * Copyright (c) 2005 Alex Beregszaszi
   5 * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
   6 *
   7 * Permission is hereby granted, free of charge, to any person obtaining a copy
   8 * of this software and associated documentation files (the "Software"), to deal
   9 * in the Software without restriction, including without limitation the rights
  10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 * copies of the Software, and to permit persons to whom the Software is
  12 * furnished to do so, subject to the following conditions:
  13 *
  14 * The above copyright notice and this permission notice shall be included in
  15 * all copies or substantial portions of the Software.
  16 *
  17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23 * THE SOFTWARE.
  24 */
  25
  26#include "qemu/osdep.h"
  27#include "qapi/error.h"
  28#include "block/block_int.h"
  29#include "block/qdict.h"
  30#include "sysemu/block-backend.h"
  31#include "qemu/module.h"
  32#include "qemu/option.h"
  33#include "migration/blocker.h"
  34#include "qemu/bswap.h"
  35#include "qemu/uuid.h"
  36#include "qemu/memalign.h"
  37#include "qapi/qmp/qdict.h"
  38#include "qapi/qobject-input-visitor.h"
  39#include "qapi/qapi-visit-block-core.h"
  40
  41/**************************************************************/
  42
  43//#define CACHE
  44
  45enum vhd_type {
  46    VHD_FIXED           = 2,
  47    VHD_DYNAMIC         = 3,
  48    VHD_DIFFERENCING    = 4,
  49};
  50
  51/* Seconds since Jan 1, 2000 0:00:00 (UTC) */
  52#define VHD_TIMESTAMP_BASE 946684800
  53
  54#define VHD_CHS_MAX_C   65535LL
  55#define VHD_CHS_MAX_H   16
  56#define VHD_CHS_MAX_S   255
  57
  58#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
  59#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)
  60
  61#define VPC_OPT_FORCE_SIZE "force_size"
  62
  63/* always big-endian */
  64typedef struct vhd_footer {
  65    char        creator[8]; /* "conectix" */
  66    uint32_t    features;
  67    uint32_t    version;
  68
  69    /* Offset of next header structure, 0xFFFFFFFF if none */
  70    uint64_t    data_offset;
  71
  72    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
  73    uint32_t    timestamp;
  74
  75    char        creator_app[4]; /*  e.g., "vpc " */
  76    uint16_t    major;
  77    uint16_t    minor;
  78    char        creator_os[4]; /* "Wi2k" */
  79
  80    uint64_t    orig_size;
  81    uint64_t    current_size;
  82
  83    uint16_t    cyls;
  84    uint8_t     heads;
  85    uint8_t     secs_per_cyl;
  86
  87    uint32_t    type;
  88
  89    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
  90       the bytes in the footer without the checksum field") */
  91    uint32_t    checksum;
  92
  93    /* UUID used to identify a parent hard disk (backing file) */
  94    QemuUUID    uuid;
  95
  96    uint8_t     in_saved_state;
  97    uint8_t     reserved[427];
  98} QEMU_PACKED VHDFooter;
  99
 100QEMU_BUILD_BUG_ON(sizeof(VHDFooter) != 512);
 101
 102typedef struct vhd_dyndisk_header {
 103    char        magic[8]; /* "cxsparse" */
 104
 105    /* Offset of next header structure, 0xFFFFFFFF if none */
 106    uint64_t    data_offset;
 107
 108    /* Offset of the Block Allocation Table (BAT) */
 109    uint64_t    table_offset;
 110
 111    uint32_t    version;
 112    uint32_t    max_table_entries; /* 32bit/entry */
 113
 114    /* 2 MB by default, must be a power of two */
 115    uint32_t    block_size;
 116
 117    uint32_t    checksum;
 118    uint8_t     parent_uuid[16];
 119    uint32_t    parent_timestamp;
 120    uint32_t    reserved;
 121
 122    /* Backing file name (in UTF-16) */
 123    uint8_t     parent_name[512];
 124
 125    struct {
 126        uint32_t    platform;
 127        uint32_t    data_space;
 128        uint32_t    data_length;
 129        uint32_t    reserved;
 130        uint64_t    data_offset;
 131    } parent_locator[8];
 132    uint8_t     reserved2[256];
 133} QEMU_PACKED VHDDynDiskHeader;
 134
 135QEMU_BUILD_BUG_ON(sizeof(VHDDynDiskHeader) != 1024);
 136
 137typedef struct BDRVVPCState {
 138    CoMutex lock;
 139    VHDFooter footer;
 140    uint64_t free_data_block_offset;
 141    int max_table_entries;
 142    uint32_t *pagetable;
 143    uint64_t bat_offset;
 144    uint64_t last_bitmap_offset;
 145
 146    uint32_t block_size;
 147    uint32_t bitmap_size;
 148    bool force_use_chs;
 149    bool force_use_sz;
 150
 151#ifdef CACHE
 152    uint8_t *pageentry_u8;
 153    uint32_t *pageentry_u32;
 154    uint16_t *pageentry_u16;
 155
 156    uint64_t last_bitmap;
 157#endif
 158
 159    Error *migration_blocker;
 160} BDRVVPCState;
 161
 162#define VPC_OPT_SIZE_CALC "force_size_calc"
 163static QemuOptsList vpc_runtime_opts = {
 164    .name = "vpc-runtime-opts",
 165    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
 166    .desc = {
 167        {
 168            .name = VPC_OPT_SIZE_CALC,
 169            .type = QEMU_OPT_STRING,
 170            .help = "Force disk size calculation to use either CHS geometry, "
 171                    "or use the disk current_size specified in the VHD footer. "
 172                    "{chs, current_size}"
 173        },
 174        { /* end of list */ }
 175    }
 176};
 177
 178static QemuOptsList vpc_create_opts;
 179
 180static uint32_t vpc_checksum(void *p, size_t size)
 181{
 182    uint8_t *buf = p;
 183    uint32_t res = 0;
 184    int i;
 185
 186    for (i = 0; i < size; i++)
 187        res += buf[i];
 188
 189    return ~res;
 190}
 191
 192
 193static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
 194{
 195    if (buf_size >= 8 && !strncmp((char *)buf, "conectix", 8))
 196        return 100;
 197    return 0;
 198}
 199
 200static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
 201                              Error **errp)
 202{
 203    BDRVVPCState *s = bs->opaque;
 204    const char *size_calc;
 205
 206    size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
 207
 208    if (!size_calc) {
 209       /* no override, use autodetect only */
 210    } else if (!strcmp(size_calc, "current_size")) {
 211        s->force_use_sz = true;
 212    } else if (!strcmp(size_calc, "chs")) {
 213        s->force_use_chs = true;
 214    } else {
 215        error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
 216    }
 217}
 218
 219static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
 220                    Error **errp)
 221{
 222    BDRVVPCState *s = bs->opaque;
 223    int i;
 224    VHDFooter *footer;
 225    QemuOpts *opts = NULL;
 226    Error *local_err = NULL;
 227    bool use_chs;
 228    VHDDynDiskHeader dyndisk_header;
 229    uint32_t checksum;
 230    uint64_t computed_size;
 231    uint64_t pagetable_size;
 232    int disk_type = VHD_DYNAMIC;
 233    int ret;
 234    int64_t bs_size;
 235
 236    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 237                               BDRV_CHILD_IMAGE, false, errp);
 238    if (!bs->file) {
 239        return -EINVAL;
 240    }
 241
 242    opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
 243    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
 244        ret = -EINVAL;
 245        goto fail;
 246    }
 247
 248    vpc_parse_options(bs, opts, &local_err);
 249    if (local_err) {
 250        error_propagate(errp, local_err);
 251        ret = -EINVAL;
 252        goto fail;
 253    }
 254
 255    ret = bdrv_pread(bs->file, 0, &s->footer, sizeof(s->footer));
 256    if (ret < 0) {
 257        error_setg(errp, "Unable to read VHD header");
 258        goto fail;
 259    }
 260
 261    footer = &s->footer;
 262    if (strncmp(footer->creator, "conectix", 8)) {
 263        int64_t offset = bdrv_getlength(bs->file->bs);
 264        if (offset < 0) {
 265            ret = offset;
 266            error_setg(errp, "Invalid file size");
 267            goto fail;
 268        } else if (offset < sizeof(*footer)) {
 269            ret = -EINVAL;
 270            error_setg(errp, "File too small for a VHD header");
 271            goto fail;
 272        }
 273
 274        /* If a fixed disk, the footer is found only at the end of the file */
 275        ret = bdrv_pread(bs->file, offset - sizeof(*footer),
 276                         footer, sizeof(*footer));
 277        if (ret < 0) {
 278            goto fail;
 279        }
 280        if (strncmp(footer->creator, "conectix", 8) ||
 281            be32_to_cpu(footer->type) != VHD_FIXED) {
 282            error_setg(errp, "invalid VPC image");
 283            ret = -EINVAL;
 284            goto fail;
 285        }
 286        disk_type = VHD_FIXED;
 287    }
 288
 289    checksum = be32_to_cpu(footer->checksum);
 290    footer->checksum = 0;
 291    if (vpc_checksum(footer, sizeof(*footer)) != checksum) {
 292        error_setg(errp, "Incorrect header checksum");
 293        ret = -EINVAL;
 294        goto fail;
 295    }
 296
 297    /* Write 'checksum' back to footer, or else will leave it with zero. */
 298    footer->checksum = cpu_to_be32(checksum);
 299
 300    /* The visible size of a image in Virtual PC depends on the geometry
 301       rather than on the size stored in the footer (the size in the footer
 302       is too large usually) */
 303    bs->total_sectors = (int64_t)
 304        be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
 305
 306    /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
 307     * VHD image sizes differently.  VPC will rely on CHS geometry,
 308     * while Hyper-V and disk2vhd use the size specified in the footer.
 309     *
 310     * We use a couple of approaches to try and determine the correct method:
 311     * look at the Creator App field, and look for images that have CHS
 312     * geometry that is the maximum value.
 313     *
 314     * If the CHS geometry is the maximum CHS geometry, then we assume that
 315     * the size is the footer->current_size to avoid truncation.  Otherwise,
 316     * we follow the table based on footer->creator_app:
 317     *
 318     *  Known creator apps:
 319     *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
 320     *      'qemu'  :  CHS              QEMU (uses disk geometry)
 321     *      'qem2'  :  current_size     QEMU (uses current_size)
 322     *      'win '  :  current_size     Hyper-V
 323     *      'd2v '  :  current_size     Disk2vhd
 324     *      'tap\0' :  current_size     XenServer
 325     *      'CTXS'  :  current_size     XenConverter
 326     *
 327     *  The user can override the table values via drive options, however
 328     *  even with an override we will still use current_size for images
 329     *  that have CHS geometry of the maximum size.
 330     */
 331    use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
 332               !!strncmp(footer->creator_app, "qem2", 4) &&
 333               !!strncmp(footer->creator_app, "d2v ", 4) &&
 334               !!strncmp(footer->creator_app, "CTXS", 4) &&
 335               !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
 336
 337    if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
 338        bs->total_sectors = be64_to_cpu(footer->current_size) /
 339                                        BDRV_SECTOR_SIZE;
 340    }
 341
 342    /* Allow a maximum disk size of 2040 GiB */
 343    if (bs->total_sectors > VHD_MAX_SECTORS) {
 344        ret = -EFBIG;
 345        goto fail;
 346    }
 347
 348    if (disk_type == VHD_DYNAMIC) {
 349        ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset),
 350                         &dyndisk_header, sizeof(dyndisk_header));
 351        if (ret < 0) {
 352            error_setg(errp, "Error reading dynamic VHD header");
 353            goto fail;
 354        }
 355
 356        if (strncmp(dyndisk_header.magic, "cxsparse", 8)) {
 357            error_setg(errp, "Invalid header magic");
 358            ret = -EINVAL;
 359            goto fail;
 360        }
 361
 362        s->block_size = be32_to_cpu(dyndisk_header.block_size);
 363        if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
 364            error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
 365            ret = -EINVAL;
 366            goto fail;
 367        }
 368        s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
 369
 370        s->max_table_entries = be32_to_cpu(dyndisk_header.max_table_entries);
 371
 372        if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
 373            error_setg(errp, "Too many blocks");
 374            ret = -EINVAL;
 375            goto fail;
 376        }
 377
 378        computed_size = (uint64_t) s->max_table_entries * s->block_size;
 379        if (computed_size < bs->total_sectors * 512) {
 380            error_setg(errp, "Page table too small");
 381            ret = -EINVAL;
 382            goto fail;
 383        }
 384
 385        if (s->max_table_entries > SIZE_MAX / 4 ||
 386            s->max_table_entries > (int) INT_MAX / 4) {
 387            error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
 388                        s->max_table_entries);
 389            ret = -EINVAL;
 390            goto fail;
 391        }
 392
 393        pagetable_size = (uint64_t) s->max_table_entries * 4;
 394
 395        s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
 396        if (s->pagetable == NULL) {
 397            error_setg(errp, "Unable to allocate memory for page table");
 398            ret = -ENOMEM;
 399            goto fail;
 400        }
 401
 402        s->bat_offset = be64_to_cpu(dyndisk_header.table_offset);
 403
 404        ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable,
 405                         pagetable_size);
 406        if (ret < 0) {
 407            error_setg(errp, "Error reading pagetable");
 408            goto fail;
 409        }
 410
 411        s->free_data_block_offset =
 412            ROUND_UP(s->bat_offset + pagetable_size, 512);
 413
 414        for (i = 0; i < s->max_table_entries; i++) {
 415            be32_to_cpus(&s->pagetable[i]);
 416            if (s->pagetable[i] != 0xFFFFFFFF) {
 417                int64_t next = (512 * (int64_t) s->pagetable[i]) +
 418                    s->bitmap_size + s->block_size;
 419
 420                if (next > s->free_data_block_offset) {
 421                    s->free_data_block_offset = next;
 422                }
 423            }
 424        }
 425
 426        bs_size = bdrv_getlength(bs->file->bs);
 427        if (bs_size < 0) {
 428            error_setg_errno(errp, -bs_size, "Unable to learn image size");
 429            ret = bs_size;
 430            goto fail;
 431        }
 432        if (s->free_data_block_offset > bs_size) {
 433            error_setg(errp, "block-vpc: free_data_block_offset points after "
 434                             "the end of file. The image has been truncated.");
 435            ret = -EINVAL;
 436            goto fail;
 437        }
 438
 439        s->last_bitmap_offset = (int64_t) -1;
 440
 441#ifdef CACHE
 442        s->pageentry_u8 = g_malloc(512);
 443        s->pageentry_u32 = s->pageentry_u8;
 444        s->pageentry_u16 = s->pageentry_u8;
 445        s->last_pagetable = -1;
 446#endif
 447    }
 448
 449    /* Disable migration when VHD images are used */
 450    error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
 451               "does not support live migration",
 452               bdrv_get_device_or_node_name(bs));
 453    ret = migrate_add_blocker(s->migration_blocker, errp);
 454    if (ret < 0) {
 455        error_free(s->migration_blocker);
 456        goto fail;
 457    }
 458
 459    qemu_co_mutex_init(&s->lock);
 460    qemu_opts_del(opts);
 461
 462    return 0;
 463
 464fail:
 465    qemu_opts_del(opts);
 466    qemu_vfree(s->pagetable);
 467#ifdef CACHE
 468    g_free(s->pageentry_u8);
 469#endif
 470    return ret;
 471}
 472
 473static int vpc_reopen_prepare(BDRVReopenState *state,
 474                              BlockReopenQueue *queue, Error **errp)
 475{
 476    return 0;
 477}
 478
 479/*
 480 * Returns the absolute byte offset of the given sector in the image file.
 481 * If the sector is not allocated, -1 is returned instead.
 482 * If an error occurred trying to write an updated block bitmap back to
 483 * the file, -2 is returned, and the error value is written to *err.
 484 * This can only happen for a write operation.
 485 *
 486 * The parameter write must be 1 if the offset will be used for a write
 487 * operation (the block bitmaps is updated then), 0 otherwise.
 488 * If write is true then err must not be NULL.
 489 */
 490static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset,
 491                                       bool write, int *err)
 492{
 493    BDRVVPCState *s = bs->opaque;
 494    uint64_t bitmap_offset, block_offset;
 495    uint32_t pagetable_index, offset_in_block;
 496
 497    assert(!(write && err == NULL));
 498
 499    pagetable_index = offset / s->block_size;
 500    offset_in_block = offset % s->block_size;
 501
 502    if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
 503        return -1; /* not allocated */
 504
 505    bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
 506    block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
 507
 508    /* We must ensure that we don't write to any sectors which are marked as
 509       unused in the bitmap. We get away with setting all bits in the block
 510       bitmap each time we write to a new block. This might cause Virtual PC to
 511       miss sparse read optimization, but it's not a problem in terms of
 512       correctness. */
 513    if (write && (s->last_bitmap_offset != bitmap_offset)) {
 514        uint8_t bitmap[s->bitmap_size];
 515        int r;
 516
 517        s->last_bitmap_offset = bitmap_offset;
 518        memset(bitmap, 0xff, s->bitmap_size);
 519        r = bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size);
 520        if (r < 0) {
 521            *err = r;
 522            return -2;
 523        }
 524    }
 525
 526    return block_offset;
 527}
 528
 529/*
 530 * Writes the footer to the end of the image file. This is needed when the
 531 * file grows as it overwrites the old footer
 532 *
 533 * Returns 0 on success and < 0 on error
 534 */
 535static int rewrite_footer(BlockDriverState *bs)
 536{
 537    int ret;
 538    BDRVVPCState *s = bs->opaque;
 539    int64_t offset = s->free_data_block_offset;
 540
 541    ret = bdrv_pwrite_sync(bs->file, offset, &s->footer, sizeof(s->footer));
 542    if (ret < 0)
 543        return ret;
 544
 545    return 0;
 546}
 547
 548/*
 549 * Allocates a new block. This involves writing a new footer and updating
 550 * the Block Allocation Table to use the space at the old end of the image
 551 * file (overwriting the old footer)
 552 *
 553 * Returns the sectors' offset in the image file on success and < 0 on error
 554 */
 555static int64_t alloc_block(BlockDriverState *bs, int64_t offset)
 556{
 557    BDRVVPCState *s = bs->opaque;
 558    int64_t bat_offset;
 559    uint32_t index, bat_value;
 560    int ret;
 561    uint8_t bitmap[s->bitmap_size];
 562
 563    /* Check if sector_num is valid */
 564    if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
 565        return -EINVAL;
 566    }
 567
 568    /* Write entry into in-memory BAT */
 569    index = offset / s->block_size;
 570    assert(s->pagetable[index] == 0xFFFFFFFF);
 571    s->pagetable[index] = s->free_data_block_offset / 512;
 572
 573    /* Initialize the block's bitmap */
 574    memset(bitmap, 0xff, s->bitmap_size);
 575    ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap,
 576        s->bitmap_size);
 577    if (ret < 0) {
 578        return ret;
 579    }
 580
 581    /* Write new footer (the old one will be overwritten) */
 582    s->free_data_block_offset += s->block_size + s->bitmap_size;
 583    ret = rewrite_footer(bs);
 584    if (ret < 0)
 585        goto fail;
 586
 587    /* Write BAT entry to disk */
 588    bat_offset = s->bat_offset + (4 * index);
 589    bat_value = cpu_to_be32(s->pagetable[index]);
 590    ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4);
 591    if (ret < 0)
 592        goto fail;
 593
 594    return get_image_offset(bs, offset, false, NULL);
 595
 596fail:
 597    s->free_data_block_offset -= (s->block_size + s->bitmap_size);
 598    return ret;
 599}
 600
 601static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 602{
 603    BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
 604
 605    if (be32_to_cpu(s->footer.type) != VHD_FIXED) {
 606        bdi->cluster_size = s->block_size;
 607    }
 608
 609    return 0;
 610}
 611
 612static int coroutine_fn
 613vpc_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
 614              QEMUIOVector *qiov, BdrvRequestFlags flags)
 615{
 616    BDRVVPCState *s = bs->opaque;
 617    int ret;
 618    int64_t image_offset;
 619    int64_t n_bytes;
 620    int64_t bytes_done = 0;
 621    QEMUIOVector local_qiov;
 622
 623    if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
 624        return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
 625    }
 626
 627    qemu_co_mutex_lock(&s->lock);
 628    qemu_iovec_init(&local_qiov, qiov->niov);
 629
 630    while (bytes > 0) {
 631        image_offset = get_image_offset(bs, offset, false, NULL);
 632        n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
 633
 634        if (image_offset == -1) {
 635            qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
 636        } else {
 637            qemu_iovec_reset(&local_qiov);
 638            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
 639
 640            qemu_co_mutex_unlock(&s->lock);
 641            ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
 642                                 &local_qiov, 0);
 643            qemu_co_mutex_lock(&s->lock);
 644            if (ret < 0) {
 645                goto fail;
 646            }
 647        }
 648
 649        bytes -= n_bytes;
 650        offset += n_bytes;
 651        bytes_done += n_bytes;
 652    }
 653
 654    ret = 0;
 655fail:
 656    qemu_iovec_destroy(&local_qiov);
 657    qemu_co_mutex_unlock(&s->lock);
 658
 659    return ret;
 660}
 661
 662static int coroutine_fn
 663vpc_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
 664               QEMUIOVector *qiov, BdrvRequestFlags flags)
 665{
 666    BDRVVPCState *s = bs->opaque;
 667    int64_t image_offset;
 668    int64_t n_bytes;
 669    int64_t bytes_done = 0;
 670    int ret = 0;
 671    QEMUIOVector local_qiov;
 672
 673    if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
 674        return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
 675    }
 676
 677    qemu_co_mutex_lock(&s->lock);
 678    qemu_iovec_init(&local_qiov, qiov->niov);
 679
 680    while (bytes > 0) {
 681        image_offset = get_image_offset(bs, offset, true, &ret);
 682        if (image_offset == -2) {
 683            /* Failed to write block bitmap: can't proceed with write */
 684            goto fail;
 685        }
 686        n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
 687
 688        if (image_offset == -1) {
 689            image_offset = alloc_block(bs, offset);
 690            if (image_offset < 0) {
 691                ret = image_offset;
 692                goto fail;
 693            }
 694        }
 695
 696        qemu_iovec_reset(&local_qiov);
 697        qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
 698
 699        qemu_co_mutex_unlock(&s->lock);
 700        ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
 701                              &local_qiov, 0);
 702        qemu_co_mutex_lock(&s->lock);
 703        if (ret < 0) {
 704            goto fail;
 705        }
 706
 707        bytes -= n_bytes;
 708        offset += n_bytes;
 709        bytes_done += n_bytes;
 710    }
 711
 712    ret = 0;
 713fail:
 714    qemu_iovec_destroy(&local_qiov);
 715    qemu_co_mutex_unlock(&s->lock);
 716
 717    return ret;
 718}
 719
 720static int coroutine_fn vpc_co_block_status(BlockDriverState *bs,
 721                                            bool want_zero,
 722                                            int64_t offset, int64_t bytes,
 723                                            int64_t *pnum, int64_t *map,
 724                                            BlockDriverState **file)
 725{
 726    BDRVVPCState *s = bs->opaque;
 727    int64_t image_offset;
 728    bool allocated;
 729    int ret;
 730    int64_t n;
 731
 732    if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
 733        *pnum = bytes;
 734        *map = offset;
 735        *file = bs->file->bs;
 736        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_RECURSE;
 737    }
 738
 739    qemu_co_mutex_lock(&s->lock);
 740
 741    image_offset = get_image_offset(bs, offset, false, NULL);
 742    allocated = (image_offset != -1);
 743    *pnum = 0;
 744    ret = BDRV_BLOCK_ZERO;
 745
 746    do {
 747        /* All sectors in a block are contiguous (without using the bitmap) */
 748        n = ROUND_UP(offset + 1, s->block_size) - offset;
 749        n = MIN(n, bytes);
 750
 751        *pnum += n;
 752        offset += n;
 753        bytes -= n;
 754        /* *pnum can't be greater than one block for allocated
 755         * sectors since there is always a bitmap in between. */
 756        if (allocated) {
 757            *file = bs->file->bs;
 758            *map = image_offset;
 759            ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
 760            break;
 761        }
 762        if (bytes == 0) {
 763            break;
 764        }
 765        image_offset = get_image_offset(bs, offset, false, NULL);
 766    } while (image_offset == -1);
 767
 768    qemu_co_mutex_unlock(&s->lock);
 769    return ret;
 770}
 771
 772/*
 773 * Calculates the number of cylinders, heads and sectors per cylinder
 774 * based on a given number of sectors. This is the algorithm described
 775 * in the VHD specification.
 776 *
 777 * Note that the geometry doesn't always exactly match total_sectors but
 778 * may round it down.
 779 *
 780 * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
 781 * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
 782 * and instead allow up to 255 heads.
 783 */
 784static int calculate_geometry(int64_t total_sectors, uint16_t *cyls,
 785    uint8_t *heads, uint8_t *secs_per_cyl)
 786{
 787    uint32_t cyls_times_heads;
 788
 789    total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
 790
 791    if (total_sectors >= 65535LL * 16 * 63) {
 792        *secs_per_cyl = 255;
 793        *heads = 16;
 794        cyls_times_heads = total_sectors / *secs_per_cyl;
 795    } else {
 796        *secs_per_cyl = 17;
 797        cyls_times_heads = total_sectors / *secs_per_cyl;
 798        *heads = DIV_ROUND_UP(cyls_times_heads, 1024);
 799
 800        if (*heads < 4) {
 801            *heads = 4;
 802        }
 803
 804        if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
 805            *secs_per_cyl = 31;
 806            *heads = 16;
 807            cyls_times_heads = total_sectors / *secs_per_cyl;
 808        }
 809
 810        if (cyls_times_heads >= (*heads * 1024)) {
 811            *secs_per_cyl = 63;
 812            *heads = 16;
 813            cyls_times_heads = total_sectors / *secs_per_cyl;
 814        }
 815    }
 816
 817    *cyls = cyls_times_heads / *heads;
 818
 819    return 0;
 820}
 821
 822static int create_dynamic_disk(BlockBackend *blk, VHDFooter *footer,
 823                               int64_t total_sectors)
 824{
 825    VHDDynDiskHeader dyndisk_header;
 826    uint8_t bat_sector[512];
 827    size_t block_size, num_bat_entries;
 828    int i;
 829    int ret;
 830    int64_t offset = 0;
 831
 832    /* Write the footer (twice: at the beginning and at the end) */
 833    block_size = 0x200000;
 834    num_bat_entries = DIV_ROUND_UP(total_sectors, block_size / 512);
 835
 836    ret = blk_pwrite(blk, offset, footer, sizeof(*footer), 0);
 837    if (ret < 0) {
 838        goto fail;
 839    }
 840
 841    offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
 842    ret = blk_pwrite(blk, offset, footer, sizeof(*footer), 0);
 843    if (ret < 0) {
 844        goto fail;
 845    }
 846
 847    /* Write the initial BAT */
 848    offset = 3 * 512;
 849
 850    memset(bat_sector, 0xFF, 512);
 851    for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) {
 852        ret = blk_pwrite(blk, offset, bat_sector, 512, 0);
 853        if (ret < 0) {
 854            goto fail;
 855        }
 856        offset += 512;
 857    }
 858
 859    /* Prepare the Dynamic Disk Header */
 860    memset(&dyndisk_header, 0, sizeof(dyndisk_header));
 861
 862    memcpy(dyndisk_header.magic, "cxsparse", 8);
 863
 864    /*
 865     * Note: The spec is actually wrong here for data_offset, it says
 866     * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
 867     */
 868    dyndisk_header.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
 869    dyndisk_header.table_offset = cpu_to_be64(3 * 512);
 870    dyndisk_header.version = cpu_to_be32(0x00010000);
 871    dyndisk_header.block_size = cpu_to_be32(block_size);
 872    dyndisk_header.max_table_entries = cpu_to_be32(num_bat_entries);
 873
 874    dyndisk_header.checksum = cpu_to_be32(
 875        vpc_checksum(&dyndisk_header, sizeof(dyndisk_header)));
 876
 877    /* Write the header */
 878    offset = 512;
 879
 880    ret = blk_pwrite(blk, offset, &dyndisk_header, sizeof(dyndisk_header), 0);
 881    if (ret < 0) {
 882        goto fail;
 883    }
 884
 885    ret = 0;
 886 fail:
 887    return ret;
 888}
 889
 890static int create_fixed_disk(BlockBackend *blk, VHDFooter *footer,
 891                             int64_t total_size, Error **errp)
 892{
 893    int ret;
 894
 895    /* Add footer to total size */
 896    total_size += sizeof(*footer);
 897
 898    ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp);
 899    if (ret < 0) {
 900        return ret;
 901    }
 902
 903    ret = blk_pwrite(blk, total_size - sizeof(*footer),
 904                     footer, sizeof(*footer), 0);
 905    if (ret < 0) {
 906        error_setg_errno(errp, -ret, "Unable to write VHD header");
 907        return ret;
 908    }
 909
 910    return 0;
 911}
 912
 913static int calculate_rounded_image_size(BlockdevCreateOptionsVpc *vpc_opts,
 914                                        uint16_t *out_cyls,
 915                                        uint8_t *out_heads,
 916                                        uint8_t *out_secs_per_cyl,
 917                                        int64_t *out_total_sectors,
 918                                        Error **errp)
 919{
 920    int64_t total_size = vpc_opts->size;
 921    uint16_t cyls = 0;
 922    uint8_t heads = 0;
 923    uint8_t secs_per_cyl = 0;
 924    int64_t total_sectors;
 925    int i;
 926
 927    /*
 928     * Calculate matching total_size and geometry. Increase the number of
 929     * sectors requested until we get enough (or fail). This ensures that
 930     * qemu-img convert doesn't truncate images, but rather rounds up.
 931     *
 932     * If the image size can't be represented by a spec conformant CHS geometry,
 933     * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
 934     * the image size from the VHD footer to calculate total_sectors.
 935     */
 936    if (vpc_opts->force_size) {
 937        /* This will force the use of total_size for sector count, below */
 938        cyls         = VHD_CHS_MAX_C;
 939        heads        = VHD_CHS_MAX_H;
 940        secs_per_cyl = VHD_CHS_MAX_S;
 941    } else {
 942        total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
 943        for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
 944            calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
 945        }
 946    }
 947
 948    if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
 949        total_sectors = total_size / BDRV_SECTOR_SIZE;
 950        /* Allow a maximum disk size of 2040 GiB */
 951        if (total_sectors > VHD_MAX_SECTORS) {
 952            error_setg(errp, "Disk size is too large, max size is 2040 GiB");
 953            return -EFBIG;
 954        }
 955    } else {
 956        total_sectors = (int64_t) cyls * heads * secs_per_cyl;
 957    }
 958
 959    *out_total_sectors = total_sectors;
 960    if (out_cyls) {
 961        *out_cyls = cyls;
 962        *out_heads = heads;
 963        *out_secs_per_cyl = secs_per_cyl;
 964    }
 965
 966    return 0;
 967}
 968
 969static int coroutine_fn vpc_co_create(BlockdevCreateOptions *opts,
 970                                      Error **errp)
 971{
 972    BlockdevCreateOptionsVpc *vpc_opts;
 973    BlockBackend *blk = NULL;
 974    BlockDriverState *bs = NULL;
 975
 976    VHDFooter footer;
 977    uint16_t cyls = 0;
 978    uint8_t heads = 0;
 979    uint8_t secs_per_cyl = 0;
 980    int64_t total_sectors;
 981    int64_t total_size;
 982    int disk_type;
 983    int ret = -EIO;
 984    QemuUUID uuid;
 985
 986    assert(opts->driver == BLOCKDEV_DRIVER_VPC);
 987    vpc_opts = &opts->u.vpc;
 988
 989    /* Validate options and set default values */
 990    total_size = vpc_opts->size;
 991
 992    if (!vpc_opts->has_subformat) {
 993        vpc_opts->subformat = BLOCKDEV_VPC_SUBFORMAT_DYNAMIC;
 994    }
 995    switch (vpc_opts->subformat) {
 996    case BLOCKDEV_VPC_SUBFORMAT_DYNAMIC:
 997        disk_type = VHD_DYNAMIC;
 998        break;
 999    case BLOCKDEV_VPC_SUBFORMAT_FIXED:
1000        disk_type = VHD_FIXED;
1001        break;
1002    default:
1003        g_assert_not_reached();
1004    }
1005
1006    /* Create BlockBackend to write to the image */
1007    bs = bdrv_open_blockdev_ref(vpc_opts->file, errp);
1008    if (bs == NULL) {
1009        return -EIO;
1010    }
1011
1012    blk = blk_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
1013                          errp);
1014    if (!blk) {
1015        ret = -EPERM;
1016        goto out;
1017    }
1018    blk_set_allow_write_beyond_eof(blk, true);
1019
1020    /* Get geometry and check that it matches the image size*/
1021    ret = calculate_rounded_image_size(vpc_opts, &cyls, &heads, &secs_per_cyl,
1022                                       &total_sectors, errp);
1023    if (ret < 0) {
1024        goto out;
1025    }
1026
1027    if (total_size != total_sectors * BDRV_SECTOR_SIZE) {
1028        error_setg(errp, "The requested image size cannot be represented in "
1029                         "CHS geometry");
1030        error_append_hint(errp, "Try size=%llu or force-size=on (the "
1031                                "latter makes the image incompatible with "
1032                                "Virtual PC)",
1033                          total_sectors * BDRV_SECTOR_SIZE);
1034        ret = -EINVAL;
1035        goto out;
1036    }
1037
1038    /* Prepare the Hard Disk Footer */
1039    memset(&footer, 0, sizeof(footer));
1040
1041    memcpy(footer.creator, "conectix", 8);
1042    if (vpc_opts->force_size) {
1043        memcpy(footer.creator_app, "qem2", 4);
1044    } else {
1045        memcpy(footer.creator_app, "qemu", 4);
1046    }
1047    memcpy(footer.creator_os, "Wi2k", 4);
1048
1049    footer.features = cpu_to_be32(0x02);
1050    footer.version = cpu_to_be32(0x00010000);
1051    if (disk_type == VHD_DYNAMIC) {
1052        footer.data_offset = cpu_to_be64(sizeof(footer));
1053    } else {
1054        footer.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
1055    }
1056    footer.timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
1057
1058    /* Version of Virtual PC 2007 */
1059    footer.major = cpu_to_be16(0x0005);
1060    footer.minor = cpu_to_be16(0x0003);
1061    footer.orig_size = cpu_to_be64(total_size);
1062    footer.current_size = cpu_to_be64(total_size);
1063    footer.cyls = cpu_to_be16(cyls);
1064    footer.heads = heads;
1065    footer.secs_per_cyl = secs_per_cyl;
1066
1067    footer.type = cpu_to_be32(disk_type);
1068
1069    qemu_uuid_generate(&uuid);
1070    footer.uuid = uuid;
1071
1072    footer.checksum = cpu_to_be32(vpc_checksum(&footer, sizeof(footer)));
1073
1074    if (disk_type == VHD_DYNAMIC) {
1075        ret = create_dynamic_disk(blk, &footer, total_sectors);
1076        if (ret < 0) {
1077            error_setg(errp, "Unable to create or write VHD header");
1078        }
1079    } else {
1080        ret = create_fixed_disk(blk, &footer, total_size, errp);
1081    }
1082
1083out:
1084    blk_unref(blk);
1085    bdrv_unref(bs);
1086    return ret;
1087}
1088
1089static int coroutine_fn vpc_co_create_opts(BlockDriver *drv,
1090                                           const char *filename,
1091                                           QemuOpts *opts,
1092                                           Error **errp)
1093{
1094    BlockdevCreateOptions *create_options = NULL;
1095    QDict *qdict;
1096    Visitor *v;
1097    BlockDriverState *bs = NULL;
1098    int ret;
1099
1100    static const QDictRenames opt_renames[] = {
1101        { VPC_OPT_FORCE_SIZE,           "force-size" },
1102        { NULL, NULL },
1103    };
1104
1105    /* Parse options and convert legacy syntax */
1106    qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vpc_create_opts, true);
1107
1108    if (!qdict_rename_keys(qdict, opt_renames, errp)) {
1109        ret = -EINVAL;
1110        goto fail;
1111    }
1112
1113    /* Create and open the file (protocol layer) */
1114    ret = bdrv_create_file(filename, opts, errp);
1115    if (ret < 0) {
1116        goto fail;
1117    }
1118
1119    bs = bdrv_open(filename, NULL, NULL,
1120                   BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
1121    if (bs == NULL) {
1122        ret = -EIO;
1123        goto fail;
1124    }
1125
1126    /* Now get the QAPI type BlockdevCreateOptions */
1127    qdict_put_str(qdict, "driver", "vpc");
1128    qdict_put_str(qdict, "file", bs->node_name);
1129
1130    v = qobject_input_visitor_new_flat_confused(qdict, errp);
1131    if (!v) {
1132        ret = -EINVAL;
1133        goto fail;
1134    }
1135
1136    visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
1137    visit_free(v);
1138    if (!create_options) {
1139        ret = -EINVAL;
1140        goto fail;
1141    }
1142
1143    /* Silently round up size */
1144    assert(create_options->driver == BLOCKDEV_DRIVER_VPC);
1145    create_options->u.vpc.size =
1146        ROUND_UP(create_options->u.vpc.size, BDRV_SECTOR_SIZE);
1147
1148    if (!create_options->u.vpc.force_size) {
1149        int64_t total_sectors;
1150        ret = calculate_rounded_image_size(&create_options->u.vpc, NULL, NULL,
1151                                           NULL, &total_sectors, errp);
1152        if (ret < 0) {
1153            goto fail;
1154        }
1155
1156        create_options->u.vpc.size = total_sectors * BDRV_SECTOR_SIZE;
1157    }
1158
1159
1160    /* Create the vpc image (format layer) */
1161    ret = vpc_co_create(create_options, errp);
1162
1163fail:
1164    qobject_unref(qdict);
1165    bdrv_unref(bs);
1166    qapi_free_BlockdevCreateOptions(create_options);
1167    return ret;
1168}
1169
1170
1171static int vpc_has_zero_init(BlockDriverState *bs)
1172{
1173    BDRVVPCState *s = bs->opaque;
1174
1175    if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
1176        return bdrv_has_zero_init(bs->file->bs);
1177    } else {
1178        return 1;
1179    }
1180}
1181
1182static void vpc_close(BlockDriverState *bs)
1183{
1184    BDRVVPCState *s = bs->opaque;
1185    qemu_vfree(s->pagetable);
1186#ifdef CACHE
1187    g_free(s->pageentry_u8);
1188#endif
1189
1190    migrate_del_blocker(s->migration_blocker);
1191    error_free(s->migration_blocker);
1192}
1193
1194static QemuOptsList vpc_create_opts = {
1195    .name = "vpc-create-opts",
1196    .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1197    .desc = {
1198        {
1199            .name = BLOCK_OPT_SIZE,
1200            .type = QEMU_OPT_SIZE,
1201            .help = "Virtual disk size"
1202        },
1203        {
1204            .name = BLOCK_OPT_SUBFMT,
1205            .type = QEMU_OPT_STRING,
1206            .help =
1207                "Type of virtual hard disk format. Supported formats are "
1208                "{dynamic (default) | fixed} "
1209        },
1210        {
1211            .name = VPC_OPT_FORCE_SIZE,
1212            .type = QEMU_OPT_BOOL,
1213            .help = "Force disk size calculation to use the actual size "
1214                    "specified, rather than using the nearest CHS-based "
1215                    "calculation"
1216        },
1217        { /* end of list */ }
1218    }
1219};
1220
1221static const char *const vpc_strong_runtime_opts[] = {
1222    VPC_OPT_SIZE_CALC,
1223
1224    NULL
1225};
1226
1227static BlockDriver bdrv_vpc = {
1228    .format_name    = "vpc",
1229    .instance_size  = sizeof(BDRVVPCState),
1230
1231    .bdrv_probe             = vpc_probe,
1232    .bdrv_open              = vpc_open,
1233    .bdrv_close             = vpc_close,
1234    .bdrv_reopen_prepare    = vpc_reopen_prepare,
1235    .bdrv_child_perm        = bdrv_default_perms,
1236    .bdrv_co_create         = vpc_co_create,
1237    .bdrv_co_create_opts    = vpc_co_create_opts,
1238
1239    .bdrv_co_preadv             = vpc_co_preadv,
1240    .bdrv_co_pwritev            = vpc_co_pwritev,
1241    .bdrv_co_block_status       = vpc_co_block_status,
1242
1243    .bdrv_get_info          = vpc_get_info,
1244
1245    .is_format              = true,
1246    .create_opts            = &vpc_create_opts,
1247    .bdrv_has_zero_init     = vpc_has_zero_init,
1248    .strong_runtime_opts    = vpc_strong_runtime_opts,
1249};
1250
/* Register the vpc driver description with the block layer. */
static void bdrv_vpc_init(void)
{
    bdrv_register(&bdrv_vpc);
}

/* Hook bdrv_vpc_init() into the block_init() module initialisation macro. */
block_init(bdrv_vpc_init);
1257