qemu/block/vpc.c
<<
>>
Prefs
   1/*
   2 * Block driver for Connectix / Microsoft Virtual PC images
   3 *
   4 * Copyright (c) 2005 Alex Beregszaszi
   5 * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
   6 *
   7 * Permission is hereby granted, free of charge, to any person obtaining a copy
   8 * of this software and associated documentation files (the "Software"), to deal
   9 * in the Software without restriction, including without limitation the rights
  10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 * copies of the Software, and to permit persons to whom the Software is
  12 * furnished to do so, subject to the following conditions:
  13 *
  14 * The above copyright notice and this permission notice shall be included in
  15 * all copies or substantial portions of the Software.
  16 *
  17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23 * THE SOFTWARE.
  24 */
  25
  26#include "qemu/osdep.h"
  27#include "qapi/error.h"
  28#include "block/block_int.h"
  29#include "block/qdict.h"
  30#include "sysemu/block-backend.h"
  31#include "qemu/module.h"
  32#include "qemu/option.h"
  33#include "migration/blocker.h"
  34#include "qemu/bswap.h"
  35#include "qemu/uuid.h"
  36#include "qemu/memalign.h"
  37#include "qapi/qmp/qdict.h"
  38#include "qapi/qobject-input-visitor.h"
  39#include "qapi/qapi-visit-block-core.h"
  40
  41/**************************************************************/
  42
  43//#define CACHE
  44
  45enum vhd_type {
  46    VHD_FIXED           = 2,
  47    VHD_DYNAMIC         = 3,
  48    VHD_DIFFERENCING    = 4,
  49};
  50
  51/* Seconds since Jan 1, 2000 0:00:00 (UTC) */
  52#define VHD_TIMESTAMP_BASE 946684800
  53
  54#define VHD_CHS_MAX_C   65535LL
  55#define VHD_CHS_MAX_H   16
  56#define VHD_CHS_MAX_S   255
  57
  58#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
  59#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)
  60
  61#define VPC_OPT_FORCE_SIZE "force_size"
  62
  63/* always big-endian */
  64typedef struct vhd_footer {
  65    char        creator[8]; /* "conectix" */
  66    uint32_t    features;
  67    uint32_t    version;
  68
  69    /* Offset of next header structure, 0xFFFFFFFF if none */
  70    uint64_t    data_offset;
  71
  72    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
  73    uint32_t    timestamp;
  74
  75    char        creator_app[4]; /*  e.g., "vpc " */
  76    uint16_t    major;
  77    uint16_t    minor;
  78    char        creator_os[4]; /* "Wi2k" */
  79
  80    uint64_t    orig_size;
  81    uint64_t    current_size;
  82
  83    uint16_t    cyls;
  84    uint8_t     heads;
  85    uint8_t     secs_per_cyl;
  86
  87    uint32_t    type;
  88
  89    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
  90       the bytes in the footer without the checksum field") */
  91    uint32_t    checksum;
  92
  93    /* UUID used to identify a parent hard disk (backing file) */
  94    QemuUUID    uuid;
  95
  96    uint8_t     in_saved_state;
  97    uint8_t     reserved[427];
  98} QEMU_PACKED VHDFooter;
  99
 100QEMU_BUILD_BUG_ON(sizeof(VHDFooter) != 512);
 101
 102typedef struct vhd_dyndisk_header {
 103    char        magic[8]; /* "cxsparse" */
 104
 105    /* Offset of next header structure, 0xFFFFFFFF if none */
 106    uint64_t    data_offset;
 107
 108    /* Offset of the Block Allocation Table (BAT) */
 109    uint64_t    table_offset;
 110
 111    uint32_t    version;
 112    uint32_t    max_table_entries; /* 32bit/entry */
 113
 114    /* 2 MB by default, must be a power of two */
 115    uint32_t    block_size;
 116
 117    uint32_t    checksum;
 118    uint8_t     parent_uuid[16];
 119    uint32_t    parent_timestamp;
 120    uint32_t    reserved;
 121
 122    /* Backing file name (in UTF-16) */
 123    uint8_t     parent_name[512];
 124
 125    struct {
 126        uint32_t    platform;
 127        uint32_t    data_space;
 128        uint32_t    data_length;
 129        uint32_t    reserved;
 130        uint64_t    data_offset;
 131    } parent_locator[8];
 132    uint8_t     reserved2[256];
 133} QEMU_PACKED VHDDynDiskHeader;
 134
 135QEMU_BUILD_BUG_ON(sizeof(VHDDynDiskHeader) != 1024);
 136
 137typedef struct BDRVVPCState {
 138    CoMutex lock;
 139    VHDFooter footer;
 140    uint64_t free_data_block_offset;
 141    int max_table_entries;
 142    uint32_t *pagetable;
 143    uint64_t bat_offset;
 144    uint64_t last_bitmap_offset;
 145
 146    uint32_t block_size;
 147    uint32_t bitmap_size;
 148    bool force_use_chs;
 149    bool force_use_sz;
 150
 151#ifdef CACHE
 152    uint8_t *pageentry_u8;
 153    uint32_t *pageentry_u32;
 154    uint16_t *pageentry_u16;
 155
 156    uint64_t last_bitmap;
 157#endif
 158
 159    Error *migration_blocker;
 160} BDRVVPCState;
 161
 162#define VPC_OPT_SIZE_CALC "force_size_calc"
 163static QemuOptsList vpc_runtime_opts = {
 164    .name = "vpc-runtime-opts",
 165    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
 166    .desc = {
 167        {
 168            .name = VPC_OPT_SIZE_CALC,
 169            .type = QEMU_OPT_STRING,
 170            .help = "Force disk size calculation to use either CHS geometry, "
 171                    "or use the disk current_size specified in the VHD footer. "
 172                    "{chs, current_size}"
 173        },
 174        { /* end of list */ }
 175    }
 176};
 177
 178static QemuOptsList vpc_create_opts;
 179
 180static uint32_t vpc_checksum(void *p, size_t size)
 181{
 182    uint8_t *buf = p;
 183    uint32_t res = 0;
 184    int i;
 185
 186    for (i = 0; i < size; i++)
 187        res += buf[i];
 188
 189    return ~res;
 190}
 191
 192
 193static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
 194{
 195    if (buf_size >= 8 && !strncmp((char *)buf, "conectix", 8))
 196        return 100;
 197    return 0;
 198}
 199
 200static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
 201                              Error **errp)
 202{
 203    BDRVVPCState *s = bs->opaque;
 204    const char *size_calc;
 205
 206    size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
 207
 208    if (!size_calc) {
 209       /* no override, use autodetect only */
 210    } else if (!strcmp(size_calc, "current_size")) {
 211        s->force_use_sz = true;
 212    } else if (!strcmp(size_calc, "chs")) {
 213        s->force_use_chs = true;
 214    } else {
 215        error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
 216    }
 217}
 218
 219static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
 220                    Error **errp)
 221{
 222    BDRVVPCState *s = bs->opaque;
 223    int i;
 224    VHDFooter *footer;
 225    QemuOpts *opts = NULL;
 226    Error *local_err = NULL;
 227    bool use_chs;
 228    VHDDynDiskHeader dyndisk_header;
 229    uint32_t checksum;
 230    uint64_t computed_size;
 231    uint64_t pagetable_size;
 232    int disk_type = VHD_DYNAMIC;
 233    int ret;
 234    int64_t bs_size;
 235
 236    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 237    if (ret < 0) {
 238        return ret;
 239    }
 240
 241    opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
 242    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
 243        ret = -EINVAL;
 244        goto fail;
 245    }
 246
 247    vpc_parse_options(bs, opts, &local_err);
 248    if (local_err) {
 249        error_propagate(errp, local_err);
 250        ret = -EINVAL;
 251        goto fail;
 252    }
 253
 254    ret = bdrv_pread(bs->file, 0, sizeof(s->footer), &s->footer, 0);
 255    if (ret < 0) {
 256        error_setg(errp, "Unable to read VHD header");
 257        goto fail;
 258    }
 259
 260    footer = &s->footer;
 261    if (strncmp(footer->creator, "conectix", 8)) {
 262        int64_t offset = bdrv_getlength(bs->file->bs);
 263        if (offset < 0) {
 264            ret = offset;
 265            error_setg(errp, "Invalid file size");
 266            goto fail;
 267        } else if (offset < sizeof(*footer)) {
 268            ret = -EINVAL;
 269            error_setg(errp, "File too small for a VHD header");
 270            goto fail;
 271        }
 272
 273        /* If a fixed disk, the footer is found only at the end of the file */
 274        ret = bdrv_pread(bs->file, offset - sizeof(*footer), sizeof(*footer),
 275                         footer, 0);
 276        if (ret < 0) {
 277            goto fail;
 278        }
 279        if (strncmp(footer->creator, "conectix", 8) ||
 280            be32_to_cpu(footer->type) != VHD_FIXED) {
 281            error_setg(errp, "invalid VPC image");
 282            ret = -EINVAL;
 283            goto fail;
 284        }
 285        disk_type = VHD_FIXED;
 286    }
 287
 288    checksum = be32_to_cpu(footer->checksum);
 289    footer->checksum = 0;
 290    if (vpc_checksum(footer, sizeof(*footer)) != checksum) {
 291        error_setg(errp, "Incorrect header checksum");
 292        ret = -EINVAL;
 293        goto fail;
 294    }
 295
 296    /* Write 'checksum' back to footer, or else will leave it with zero. */
 297    footer->checksum = cpu_to_be32(checksum);
 298
 299    /* The visible size of a image in Virtual PC depends on the geometry
 300       rather than on the size stored in the footer (the size in the footer
 301       is too large usually) */
 302    bs->total_sectors = (int64_t)
 303        be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
 304
 305    /* Microsoft Virtual PC and Microsoft Hyper-V produce and read
 306     * VHD image sizes differently.  VPC will rely on CHS geometry,
 307     * while Hyper-V and disk2vhd use the size specified in the footer.
 308     *
 309     * We use a couple of approaches to try and determine the correct method:
 310     * look at the Creator App field, and look for images that have CHS
 311     * geometry that is the maximum value.
 312     *
 313     * If the CHS geometry is the maximum CHS geometry, then we assume that
 314     * the size is the footer->current_size to avoid truncation.  Otherwise,
 315     * we follow the table based on footer->creator_app:
 316     *
 317     *  Known creator apps:
 318     *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
 319     *      'qemu'  :  CHS              QEMU (uses disk geometry)
 320     *      'qem2'  :  current_size     QEMU (uses current_size)
 321     *      'win '  :  current_size     Hyper-V
 322     *      'd2v '  :  current_size     Disk2vhd
 323     *      'tap\0' :  current_size     XenServer
 324     *      'CTXS'  :  current_size     XenConverter
 325     *
 326     *  The user can override the table values via drive options, however
 327     *  even with an override we will still use current_size for images
 328     *  that have CHS geometry of the maximum size.
 329     */
 330    use_chs = (!!strncmp(footer->creator_app, "win ", 4) &&
 331               !!strncmp(footer->creator_app, "qem2", 4) &&
 332               !!strncmp(footer->creator_app, "d2v ", 4) &&
 333               !!strncmp(footer->creator_app, "CTXS", 4) &&
 334               !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs;
 335
 336    if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
 337        bs->total_sectors = be64_to_cpu(footer->current_size) /
 338                                        BDRV_SECTOR_SIZE;
 339    }
 340
 341    /* Allow a maximum disk size of 2040 GiB */
 342    if (bs->total_sectors > VHD_MAX_SECTORS) {
 343        ret = -EFBIG;
 344        goto fail;
 345    }
 346
 347    if (disk_type == VHD_DYNAMIC) {
 348        ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset),
 349                         sizeof(dyndisk_header), &dyndisk_header, 0);
 350        if (ret < 0) {
 351            error_setg(errp, "Error reading dynamic VHD header");
 352            goto fail;
 353        }
 354
 355        if (strncmp(dyndisk_header.magic, "cxsparse", 8)) {
 356            error_setg(errp, "Invalid header magic");
 357            ret = -EINVAL;
 358            goto fail;
 359        }
 360
 361        s->block_size = be32_to_cpu(dyndisk_header.block_size);
 362        if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
 363            error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
 364            ret = -EINVAL;
 365            goto fail;
 366        }
 367        s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
 368
 369        s->max_table_entries = be32_to_cpu(dyndisk_header.max_table_entries);
 370
 371        if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
 372            error_setg(errp, "Too many blocks");
 373            ret = -EINVAL;
 374            goto fail;
 375        }
 376
 377        computed_size = (uint64_t) s->max_table_entries * s->block_size;
 378        if (computed_size < bs->total_sectors * 512) {
 379            error_setg(errp, "Page table too small");
 380            ret = -EINVAL;
 381            goto fail;
 382        }
 383
 384        if (s->max_table_entries > SIZE_MAX / 4 ||
 385            s->max_table_entries > (int) INT_MAX / 4) {
 386            error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
 387                        s->max_table_entries);
 388            ret = -EINVAL;
 389            goto fail;
 390        }
 391
 392        pagetable_size = (uint64_t) s->max_table_entries * 4;
 393
 394        s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
 395        if (s->pagetable == NULL) {
 396            error_setg(errp, "Unable to allocate memory for page table");
 397            ret = -ENOMEM;
 398            goto fail;
 399        }
 400
 401        s->bat_offset = be64_to_cpu(dyndisk_header.table_offset);
 402
 403        ret = bdrv_pread(bs->file, s->bat_offset, pagetable_size,
 404                         s->pagetable, 0);
 405        if (ret < 0) {
 406            error_setg(errp, "Error reading pagetable");
 407            goto fail;
 408        }
 409
 410        s->free_data_block_offset =
 411            ROUND_UP(s->bat_offset + pagetable_size, 512);
 412
 413        for (i = 0; i < s->max_table_entries; i++) {
 414            be32_to_cpus(&s->pagetable[i]);
 415            if (s->pagetable[i] != 0xFFFFFFFF) {
 416                int64_t next = (512 * (int64_t) s->pagetable[i]) +
 417                    s->bitmap_size + s->block_size;
 418
 419                if (next > s->free_data_block_offset) {
 420                    s->free_data_block_offset = next;
 421                }
 422            }
 423        }
 424
 425        bs_size = bdrv_getlength(bs->file->bs);
 426        if (bs_size < 0) {
 427            error_setg_errno(errp, -bs_size, "Unable to learn image size");
 428            ret = bs_size;
 429            goto fail;
 430        }
 431        if (s->free_data_block_offset > bs_size) {
 432            error_setg(errp, "block-vpc: free_data_block_offset points after "
 433                             "the end of file. The image has been truncated.");
 434            ret = -EINVAL;
 435            goto fail;
 436        }
 437
 438        s->last_bitmap_offset = (int64_t) -1;
 439
 440#ifdef CACHE
 441        s->pageentry_u8 = g_malloc(512);
 442        s->pageentry_u32 = s->pageentry_u8;
 443        s->pageentry_u16 = s->pageentry_u8;
 444        s->last_pagetable = -1;
 445#endif
 446    }
 447
 448    /* Disable migration when VHD images are used */
 449    error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
 450               "does not support live migration",
 451               bdrv_get_device_or_node_name(bs));
 452    ret = migrate_add_blocker(s->migration_blocker, errp);
 453    if (ret < 0) {
 454        error_free(s->migration_blocker);
 455        goto fail;
 456    }
 457
 458    qemu_co_mutex_init(&s->lock);
 459    qemu_opts_del(opts);
 460
 461    return 0;
 462
 463fail:
 464    qemu_opts_del(opts);
 465    qemu_vfree(s->pagetable);
 466#ifdef CACHE
 467    g_free(s->pageentry_u8);
 468#endif
 469    return ret;
 470}
 471
 472static int vpc_reopen_prepare(BDRVReopenState *state,
 473                              BlockReopenQueue *queue, Error **errp)
 474{
 475    return 0;
 476}
 477
 478/*
 479 * Returns the absolute byte offset of the given sector in the image file.
 480 * If the sector is not allocated, -1 is returned instead.
 481 * If an error occurred trying to write an updated block bitmap back to
 482 * the file, -2 is returned, and the error value is written to *err.
 483 * This can only happen for a write operation.
 484 *
 485 * The parameter write must be 1 if the offset will be used for a write
 486 * operation (the block bitmaps is updated then), 0 otherwise.
 487 * If write is true then err must not be NULL.
 488 */
 489static int64_t coroutine_fn GRAPH_RDLOCK
 490get_image_offset(BlockDriverState *bs, uint64_t offset, bool write, int *err)
 491{
 492    BDRVVPCState *s = bs->opaque;
 493    uint64_t bitmap_offset, block_offset;
 494    uint32_t pagetable_index, offset_in_block;
 495
 496    assert(!(write && err == NULL));
 497
 498    pagetable_index = offset / s->block_size;
 499    offset_in_block = offset % s->block_size;
 500
 501    if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
 502        return -1; /* not allocated */
 503
 504    bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
 505    block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
 506
 507    /* We must ensure that we don't write to any sectors which are marked as
 508       unused in the bitmap. We get away with setting all bits in the block
 509       bitmap each time we write to a new block. This might cause Virtual PC to
 510       miss sparse read optimization, but it's not a problem in terms of
 511       correctness. */
 512    if (write && (s->last_bitmap_offset != bitmap_offset)) {
 513        uint8_t bitmap[s->bitmap_size];
 514        int r;
 515
 516        s->last_bitmap_offset = bitmap_offset;
 517        memset(bitmap, 0xff, s->bitmap_size);
 518        r = bdrv_co_pwrite_sync(bs->file, bitmap_offset, s->bitmap_size, bitmap, 0);
 519        if (r < 0) {
 520            *err = r;
 521            return -2;
 522        }
 523    }
 524
 525    return block_offset;
 526}
 527
 528/*
 529 * Writes the footer to the end of the image file. This is needed when the
 530 * file grows as it overwrites the old footer
 531 *
 532 * Returns 0 on success and < 0 on error
 533 */
 534static int coroutine_fn GRAPH_RDLOCK rewrite_footer(BlockDriverState *bs)
 535{
 536    int ret;
 537    BDRVVPCState *s = bs->opaque;
 538    int64_t offset = s->free_data_block_offset;
 539
 540    ret = bdrv_co_pwrite_sync(bs->file, offset, sizeof(s->footer), &s->footer, 0);
 541    if (ret < 0)
 542        return ret;
 543
 544    return 0;
 545}
 546
 547/*
 548 * Allocates a new block. This involves writing a new footer and updating
 549 * the Block Allocation Table to use the space at the old end of the image
 550 * file (overwriting the old footer)
 551 *
 552 * Returns the sectors' offset in the image file on success and < 0 on error
 553 */
 554static int64_t coroutine_fn GRAPH_RDLOCK
 555alloc_block(BlockDriverState *bs, int64_t offset)
 556{
 557    BDRVVPCState *s = bs->opaque;
 558    int64_t bat_offset;
 559    uint32_t index, bat_value;
 560    int ret;
 561    uint8_t bitmap[s->bitmap_size];
 562
 563    /* Check if sector_num is valid */
 564    if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
 565        return -EINVAL;
 566    }
 567
 568    /* Write entry into in-memory BAT */
 569    index = offset / s->block_size;
 570    assert(s->pagetable[index] == 0xFFFFFFFF);
 571    s->pagetable[index] = s->free_data_block_offset / 512;
 572
 573    /* Initialize the block's bitmap */
 574    memset(bitmap, 0xff, s->bitmap_size);
 575    ret = bdrv_co_pwrite_sync(bs->file, s->free_data_block_offset,
 576                              s->bitmap_size, bitmap, 0);
 577    if (ret < 0) {
 578        return ret;
 579    }
 580
 581    /* Write new footer (the old one will be overwritten) */
 582    s->free_data_block_offset += s->block_size + s->bitmap_size;
 583    ret = rewrite_footer(bs);
 584    if (ret < 0)
 585        goto fail;
 586
 587    /* Write BAT entry to disk */
 588    bat_offset = s->bat_offset + (4 * index);
 589    bat_value = cpu_to_be32(s->pagetable[index]);
 590    ret = bdrv_co_pwrite_sync(bs->file, bat_offset, 4, &bat_value, 0);
 591    if (ret < 0)
 592        goto fail;
 593
 594    return get_image_offset(bs, offset, false, NULL);
 595
 596fail:
 597    s->free_data_block_offset -= (s->block_size + s->bitmap_size);
 598    return ret;
 599}
 600
 601static int coroutine_fn
 602vpc_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 603{
 604    BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
 605
 606    if (be32_to_cpu(s->footer.type) != VHD_FIXED) {
 607        bdi->cluster_size = s->block_size;
 608    }
 609
 610    return 0;
 611}
 612
 613static int coroutine_fn GRAPH_RDLOCK
 614vpc_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
 615              QEMUIOVector *qiov, BdrvRequestFlags flags)
 616{
 617    BDRVVPCState *s = bs->opaque;
 618    int ret;
 619    int64_t image_offset;
 620    int64_t n_bytes;
 621    int64_t bytes_done = 0;
 622    QEMUIOVector local_qiov;
 623
 624    if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
 625        return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
 626    }
 627
 628    qemu_co_mutex_lock(&s->lock);
 629    qemu_iovec_init(&local_qiov, qiov->niov);
 630
 631    while (bytes > 0) {
 632        image_offset = get_image_offset(bs, offset, false, NULL);
 633        n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
 634
 635        if (image_offset == -1) {
 636            qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
 637        } else {
 638            qemu_iovec_reset(&local_qiov);
 639            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
 640
 641            qemu_co_mutex_unlock(&s->lock);
 642            ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
 643                                 &local_qiov, 0);
 644            qemu_co_mutex_lock(&s->lock);
 645            if (ret < 0) {
 646                goto fail;
 647            }
 648        }
 649
 650        bytes -= n_bytes;
 651        offset += n_bytes;
 652        bytes_done += n_bytes;
 653    }
 654
 655    ret = 0;
 656fail:
 657    qemu_iovec_destroy(&local_qiov);
 658    qemu_co_mutex_unlock(&s->lock);
 659
 660    return ret;
 661}
 662
 663static int coroutine_fn GRAPH_RDLOCK
 664vpc_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
 665               QEMUIOVector *qiov, BdrvRequestFlags flags)
 666{
 667    BDRVVPCState *s = bs->opaque;
 668    int64_t image_offset;
 669    int64_t n_bytes;
 670    int64_t bytes_done = 0;
 671    int ret = 0;
 672    QEMUIOVector local_qiov;
 673
 674    if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
 675        return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
 676    }
 677
 678    qemu_co_mutex_lock(&s->lock);
 679    qemu_iovec_init(&local_qiov, qiov->niov);
 680
 681    while (bytes > 0) {
 682        image_offset = get_image_offset(bs, offset, true, &ret);
 683        if (image_offset == -2) {
 684            /* Failed to write block bitmap: can't proceed with write */
 685            goto fail;
 686        }
 687        n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
 688
 689        if (image_offset == -1) {
 690            image_offset = alloc_block(bs, offset);
 691            if (image_offset < 0) {
 692                ret = image_offset;
 693                goto fail;
 694            }
 695        }
 696
 697        qemu_iovec_reset(&local_qiov);
 698        qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
 699
 700        qemu_co_mutex_unlock(&s->lock);
 701        ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
 702                              &local_qiov, 0);
 703        qemu_co_mutex_lock(&s->lock);
 704        if (ret < 0) {
 705            goto fail;
 706        }
 707
 708        bytes -= n_bytes;
 709        offset += n_bytes;
 710        bytes_done += n_bytes;
 711    }
 712
 713    ret = 0;
 714fail:
 715    qemu_iovec_destroy(&local_qiov);
 716    qemu_co_mutex_unlock(&s->lock);
 717
 718    return ret;
 719}
 720
 721static int coroutine_fn GRAPH_RDLOCK
 722vpc_co_block_status(BlockDriverState *bs, bool want_zero,
 723                    int64_t offset, int64_t bytes,
 724                    int64_t *pnum, int64_t *map,
 725                    BlockDriverState **file)
 726{
 727    BDRVVPCState *s = bs->opaque;
 728    int64_t image_offset;
 729    bool allocated;
 730    int ret;
 731    int64_t n;
 732
 733    if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
 734        *pnum = bytes;
 735        *map = offset;
 736        *file = bs->file->bs;
 737        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_RECURSE;
 738    }
 739
 740    qemu_co_mutex_lock(&s->lock);
 741
 742    image_offset = get_image_offset(bs, offset, false, NULL);
 743    allocated = (image_offset != -1);
 744    *pnum = 0;
 745    ret = BDRV_BLOCK_ZERO;
 746
 747    do {
 748        /* All sectors in a block are contiguous (without using the bitmap) */
 749        n = ROUND_UP(offset + 1, s->block_size) - offset;
 750        n = MIN(n, bytes);
 751
 752        *pnum += n;
 753        offset += n;
 754        bytes -= n;
 755        /* *pnum can't be greater than one block for allocated
 756         * sectors since there is always a bitmap in between. */
 757        if (allocated) {
 758            *file = bs->file->bs;
 759            *map = image_offset;
 760            ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
 761            break;
 762        }
 763        if (bytes == 0) {
 764            break;
 765        }
 766        image_offset = get_image_offset(bs, offset, false, NULL);
 767    } while (image_offset == -1);
 768
 769    qemu_co_mutex_unlock(&s->lock);
 770    return ret;
 771}
 772
 773/*
 774 * Calculates the number of cylinders, heads and sectors per cylinder
 775 * based on a given number of sectors. This is the algorithm described
 776 * in the VHD specification.
 777 *
 778 * Note that the geometry doesn't always exactly match total_sectors but
 779 * may round it down.
 780 *
 781 * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
 782 * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
 783 * and instead allow up to 255 heads.
 784 */
 785static int calculate_geometry(int64_t total_sectors, uint16_t *cyls,
 786    uint8_t *heads, uint8_t *secs_per_cyl)
 787{
 788    uint32_t cyls_times_heads;
 789
 790    total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
 791
 792    if (total_sectors >= 65535LL * 16 * 63) {
 793        *secs_per_cyl = 255;
 794        *heads = 16;
 795        cyls_times_heads = total_sectors / *secs_per_cyl;
 796    } else {
 797        *secs_per_cyl = 17;
 798        cyls_times_heads = total_sectors / *secs_per_cyl;
 799        *heads = DIV_ROUND_UP(cyls_times_heads, 1024);
 800
 801        if (*heads < 4) {
 802            *heads = 4;
 803        }
 804
 805        if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
 806            *secs_per_cyl = 31;
 807            *heads = 16;
 808            cyls_times_heads = total_sectors / *secs_per_cyl;
 809        }
 810
 811        if (cyls_times_heads >= (*heads * 1024)) {
 812            *secs_per_cyl = 63;
 813            *heads = 16;
 814            cyls_times_heads = total_sectors / *secs_per_cyl;
 815        }
 816    }
 817
 818    *cyls = cyls_times_heads / *heads;
 819
 820    return 0;
 821}
 822
 823static int coroutine_fn create_dynamic_disk(BlockBackend *blk, VHDFooter *footer,
 824                                            int64_t total_sectors)
 825{
 826    VHDDynDiskHeader dyndisk_header;
 827    uint8_t bat_sector[512];
 828    size_t block_size, num_bat_entries;
 829    int i;
 830    int ret;
 831    int64_t offset = 0;
 832
 833    /* Write the footer (twice: at the beginning and at the end) */
 834    block_size = 0x200000;
 835    num_bat_entries = DIV_ROUND_UP(total_sectors, block_size / 512);
 836
 837    ret = blk_co_pwrite(blk, offset, sizeof(*footer), footer, 0);
 838    if (ret < 0) {
 839        goto fail;
 840    }
 841
 842    offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
 843    ret = blk_co_pwrite(blk, offset, sizeof(*footer), footer, 0);
 844    if (ret < 0) {
 845        goto fail;
 846    }
 847
 848    /* Write the initial BAT */
 849    offset = 3 * 512;
 850
 851    memset(bat_sector, 0xFF, 512);
 852    for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) {
 853        ret = blk_co_pwrite(blk, offset, 512, bat_sector, 0);
 854        if (ret < 0) {
 855            goto fail;
 856        }
 857        offset += 512;
 858    }
 859
 860    /* Prepare the Dynamic Disk Header */
 861    memset(&dyndisk_header, 0, sizeof(dyndisk_header));
 862
 863    memcpy(dyndisk_header.magic, "cxsparse", 8);
 864
 865    /*
 866     * Note: The spec is actually wrong here for data_offset, it says
 867     * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
 868     */
 869    dyndisk_header.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
 870    dyndisk_header.table_offset = cpu_to_be64(3 * 512);
 871    dyndisk_header.version = cpu_to_be32(0x00010000);
 872    dyndisk_header.block_size = cpu_to_be32(block_size);
 873    dyndisk_header.max_table_entries = cpu_to_be32(num_bat_entries);
 874
 875    dyndisk_header.checksum = cpu_to_be32(
 876        vpc_checksum(&dyndisk_header, sizeof(dyndisk_header)));
 877
 878    /* Write the header */
 879    offset = 512;
 880
 881    ret = blk_co_pwrite(blk, offset, sizeof(dyndisk_header), &dyndisk_header, 0);
 882    if (ret < 0) {
 883        goto fail;
 884    }
 885
 886    ret = 0;
 887 fail:
 888    return ret;
 889}
 890
 891static int coroutine_fn create_fixed_disk(BlockBackend *blk, VHDFooter *footer,
 892                                          int64_t total_size, Error **errp)
 893{
 894    int ret;
 895
 896    /* Add footer to total size */
 897    total_size += sizeof(*footer);
 898
 899    ret = blk_co_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp);
 900    if (ret < 0) {
 901        return ret;
 902    }
 903
 904    ret = blk_co_pwrite(blk, total_size - sizeof(*footer), sizeof(*footer),
 905                        footer, 0);
 906    if (ret < 0) {
 907        error_setg_errno(errp, -ret, "Unable to write VHD header");
 908        return ret;
 909    }
 910
 911    return 0;
 912}
 913
 914static int calculate_rounded_image_size(BlockdevCreateOptionsVpc *vpc_opts,
 915                                        uint16_t *out_cyls,
 916                                        uint8_t *out_heads,
 917                                        uint8_t *out_secs_per_cyl,
 918                                        int64_t *out_total_sectors,
 919                                        Error **errp)
 920{
 921    int64_t total_size = vpc_opts->size;
 922    uint16_t cyls = 0;
 923    uint8_t heads = 0;
 924    uint8_t secs_per_cyl = 0;
 925    int64_t total_sectors;
 926    int i;
 927
 928    /*
 929     * Calculate matching total_size and geometry. Increase the number of
 930     * sectors requested until we get enough (or fail). This ensures that
 931     * qemu-img convert doesn't truncate images, but rather rounds up.
 932     *
 933     * If the image size can't be represented by a spec conformant CHS geometry,
 934     * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
 935     * the image size from the VHD footer to calculate total_sectors.
 936     */
 937    if (vpc_opts->force_size) {
 938        /* This will force the use of total_size for sector count, below */
 939        cyls         = VHD_CHS_MAX_C;
 940        heads        = VHD_CHS_MAX_H;
 941        secs_per_cyl = VHD_CHS_MAX_S;
 942    } else {
 943        total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
 944        for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
 945            calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
 946        }
 947    }
 948
 949    if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
 950        total_sectors = total_size / BDRV_SECTOR_SIZE;
 951        /* Allow a maximum disk size of 2040 GiB */
 952        if (total_sectors > VHD_MAX_SECTORS) {
 953            error_setg(errp, "Disk size is too large, max size is 2040 GiB");
 954            return -EFBIG;
 955        }
 956    } else {
 957        total_sectors = (int64_t) cyls * heads * secs_per_cyl;
 958    }
 959
 960    *out_total_sectors = total_sectors;
 961    if (out_cyls) {
 962        *out_cyls = cyls;
 963        *out_heads = heads;
 964        *out_secs_per_cyl = secs_per_cyl;
 965    }
 966
 967    return 0;
 968}
 969
 970static int coroutine_fn GRAPH_UNLOCKED
 971vpc_co_create(BlockdevCreateOptions *opts, Error **errp)
 972{
 973    BlockdevCreateOptionsVpc *vpc_opts;
 974    BlockBackend *blk = NULL;
 975    BlockDriverState *bs = NULL;
 976
 977    VHDFooter footer;
 978    uint16_t cyls = 0;
 979    uint8_t heads = 0;
 980    uint8_t secs_per_cyl = 0;
 981    int64_t total_sectors;
 982    int64_t total_size;
 983    int disk_type;
 984    int ret = -EIO;
 985    QemuUUID uuid;
 986
 987    assert(opts->driver == BLOCKDEV_DRIVER_VPC);
 988    vpc_opts = &opts->u.vpc;
 989
 990    /* Validate options and set default values */
 991    total_size = vpc_opts->size;
 992
 993    if (!vpc_opts->has_subformat) {
 994        vpc_opts->subformat = BLOCKDEV_VPC_SUBFORMAT_DYNAMIC;
 995    }
 996    switch (vpc_opts->subformat) {
 997    case BLOCKDEV_VPC_SUBFORMAT_DYNAMIC:
 998        disk_type = VHD_DYNAMIC;
 999        break;
1000    case BLOCKDEV_VPC_SUBFORMAT_FIXED:
1001        disk_type = VHD_FIXED;
1002        break;
1003    default:
1004        g_assert_not_reached();
1005    }
1006
1007    /* Create BlockBackend to write to the image */
1008    bs = bdrv_co_open_blockdev_ref(vpc_opts->file, errp);
1009    if (bs == NULL) {
1010        return -EIO;
1011    }
1012
1013    blk = blk_co_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
1014                             errp);
1015    if (!blk) {
1016        ret = -EPERM;
1017        goto out;
1018    }
1019    blk_set_allow_write_beyond_eof(blk, true);
1020
1021    /* Get geometry and check that it matches the image size*/
1022    ret = calculate_rounded_image_size(vpc_opts, &cyls, &heads, &secs_per_cyl,
1023                                       &total_sectors, errp);
1024    if (ret < 0) {
1025        goto out;
1026    }
1027
1028    if (total_size != total_sectors * BDRV_SECTOR_SIZE) {
1029        error_setg(errp, "The requested image size cannot be represented in "
1030                         "CHS geometry");
1031        error_append_hint(errp, "Try size=%llu or force-size=on (the "
1032                                "latter makes the image incompatible with "
1033                                "Virtual PC)",
1034                          total_sectors * BDRV_SECTOR_SIZE);
1035        ret = -EINVAL;
1036        goto out;
1037    }
1038
1039    /* Prepare the Hard Disk Footer */
1040    memset(&footer, 0, sizeof(footer));
1041
1042    memcpy(footer.creator, "conectix", 8);
1043    if (vpc_opts->force_size) {
1044        memcpy(footer.creator_app, "qem2", 4);
1045    } else {
1046        memcpy(footer.creator_app, "qemu", 4);
1047    }
1048    memcpy(footer.creator_os, "Wi2k", 4);
1049
1050    footer.features = cpu_to_be32(0x02);
1051    footer.version = cpu_to_be32(0x00010000);
1052    if (disk_type == VHD_DYNAMIC) {
1053        footer.data_offset = cpu_to_be64(sizeof(footer));
1054    } else {
1055        footer.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
1056    }
1057    footer.timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
1058
1059    /* Version of Virtual PC 2007 */
1060    footer.major = cpu_to_be16(0x0005);
1061    footer.minor = cpu_to_be16(0x0003);
1062    footer.orig_size = cpu_to_be64(total_size);
1063    footer.current_size = cpu_to_be64(total_size);
1064    footer.cyls = cpu_to_be16(cyls);
1065    footer.heads = heads;
1066    footer.secs_per_cyl = secs_per_cyl;
1067
1068    footer.type = cpu_to_be32(disk_type);
1069
1070    qemu_uuid_generate(&uuid);
1071    footer.uuid = uuid;
1072
1073    footer.checksum = cpu_to_be32(vpc_checksum(&footer, sizeof(footer)));
1074
1075    if (disk_type == VHD_DYNAMIC) {
1076        ret = create_dynamic_disk(blk, &footer, total_sectors);
1077        if (ret < 0) {
1078            error_setg(errp, "Unable to create or write VHD header");
1079        }
1080    } else {
1081        ret = create_fixed_disk(blk, &footer, total_size, errp);
1082    }
1083
1084out:
1085    blk_co_unref(blk);
1086    bdrv_co_unref(bs);
1087    return ret;
1088}
1089
1090static int coroutine_fn GRAPH_UNLOCKED
1091vpc_co_create_opts(BlockDriver *drv, const char *filename,
1092                   QemuOpts *opts, Error **errp)
1093{
1094    BlockdevCreateOptions *create_options = NULL;
1095    QDict *qdict;
1096    Visitor *v;
1097    BlockDriverState *bs = NULL;
1098    int ret;
1099
1100    static const QDictRenames opt_renames[] = {
1101        { VPC_OPT_FORCE_SIZE,           "force-size" },
1102        { NULL, NULL },
1103    };
1104
1105    /* Parse options and convert legacy syntax */
1106    qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vpc_create_opts, true);
1107
1108    if (!qdict_rename_keys(qdict, opt_renames, errp)) {
1109        ret = -EINVAL;
1110        goto fail;
1111    }
1112
1113    /* Create and open the file (protocol layer) */
1114    ret = bdrv_co_create_file(filename, opts, errp);
1115    if (ret < 0) {
1116        goto fail;
1117    }
1118
1119    bs = bdrv_co_open(filename, NULL, NULL,
1120                      BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
1121    if (bs == NULL) {
1122        ret = -EIO;
1123        goto fail;
1124    }
1125
1126    /* Now get the QAPI type BlockdevCreateOptions */
1127    qdict_put_str(qdict, "driver", "vpc");
1128    qdict_put_str(qdict, "file", bs->node_name);
1129
1130    v = qobject_input_visitor_new_flat_confused(qdict, errp);
1131    if (!v) {
1132        ret = -EINVAL;
1133        goto fail;
1134    }
1135
1136    visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
1137    visit_free(v);
1138    if (!create_options) {
1139        ret = -EINVAL;
1140        goto fail;
1141    }
1142
1143    /* Silently round up size */
1144    assert(create_options->driver == BLOCKDEV_DRIVER_VPC);
1145    create_options->u.vpc.size =
1146        ROUND_UP(create_options->u.vpc.size, BDRV_SECTOR_SIZE);
1147
1148    if (!create_options->u.vpc.force_size) {
1149        int64_t total_sectors;
1150        ret = calculate_rounded_image_size(&create_options->u.vpc, NULL, NULL,
1151                                           NULL, &total_sectors, errp);
1152        if (ret < 0) {
1153            goto fail;
1154        }
1155
1156        create_options->u.vpc.size = total_sectors * BDRV_SECTOR_SIZE;
1157    }
1158
1159
1160    /* Create the vpc image (format layer) */
1161    ret = vpc_co_create(create_options, errp);
1162
1163fail:
1164    qobject_unref(qdict);
1165    bdrv_co_unref(bs);
1166    qapi_free_BlockdevCreateOptions(create_options);
1167    return ret;
1168}
1169
1170
1171static int vpc_has_zero_init(BlockDriverState *bs)
1172{
1173    BDRVVPCState *s = bs->opaque;
1174
1175    if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
1176        return bdrv_has_zero_init(bs->file->bs);
1177    } else {
1178        return 1;
1179    }
1180}
1181
1182static void vpc_close(BlockDriverState *bs)
1183{
1184    BDRVVPCState *s = bs->opaque;
1185    qemu_vfree(s->pagetable);
1186#ifdef CACHE
1187    g_free(s->pageentry_u8);
1188#endif
1189
1190    migrate_del_blocker(s->migration_blocker);
1191    error_free(s->migration_blocker);
1192}
1193
1194static QemuOptsList vpc_create_opts = {
1195    .name = "vpc-create-opts",
1196    .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
1197    .desc = {
1198        {
1199            .name = BLOCK_OPT_SIZE,
1200            .type = QEMU_OPT_SIZE,
1201            .help = "Virtual disk size"
1202        },
1203        {
1204            .name = BLOCK_OPT_SUBFMT,
1205            .type = QEMU_OPT_STRING,
1206            .help =
1207                "Type of virtual hard disk format. Supported formats are "
1208                "{dynamic (default) | fixed} "
1209        },
1210        {
1211            .name = VPC_OPT_FORCE_SIZE,
1212            .type = QEMU_OPT_BOOL,
1213            .help = "Force disk size calculation to use the actual size "
1214                    "specified, rather than using the nearest CHS-based "
1215                    "calculation"
1216        },
1217        { /* end of list */ }
1218    }
1219};
1220
1221static const char *const vpc_strong_runtime_opts[] = {
1222    VPC_OPT_SIZE_CALC,
1223
1224    NULL
1225};
1226
1227static BlockDriver bdrv_vpc = {
1228    .format_name    = "vpc",
1229    .instance_size  = sizeof(BDRVVPCState),
1230
1231    .bdrv_probe             = vpc_probe,
1232    .bdrv_open              = vpc_open,
1233    .bdrv_close             = vpc_close,
1234    .bdrv_reopen_prepare    = vpc_reopen_prepare,
1235    .bdrv_child_perm        = bdrv_default_perms,
1236    .bdrv_co_create         = vpc_co_create,
1237    .bdrv_co_create_opts    = vpc_co_create_opts,
1238
1239    .bdrv_co_preadv             = vpc_co_preadv,
1240    .bdrv_co_pwritev            = vpc_co_pwritev,
1241    .bdrv_co_block_status       = vpc_co_block_status,
1242
1243    .bdrv_co_get_info       = vpc_co_get_info,
1244
1245    .is_format              = true,
1246    .create_opts            = &vpc_create_opts,
1247    .bdrv_has_zero_init     = vpc_has_zero_init,
1248    .strong_runtime_opts    = vpc_strong_runtime_opts,
1249};
1250
1251static void bdrv_vpc_init(void)
1252{
1253    bdrv_register(&bdrv_vpc);
1254}
1255
1256block_init(bdrv_vpc_init);
1257