qemu/block/vmdk.c
<<
>>
Prefs
   1/*
   2 * Block driver for the VMDK format
   3 *
   4 * Copyright (c) 2004 Fabrice Bellard
   5 * Copyright (c) 2005 Filip Navara
   6 *
   7 * Permission is hereby granted, free of charge, to any person obtaining a copy
   8 * of this software and associated documentation files (the "Software"), to deal
   9 * in the Software without restriction, including without limitation the rights
  10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 * copies of the Software, and to permit persons to whom the Software is
  12 * furnished to do so, subject to the following conditions:
  13 *
  14 * The above copyright notice and this permission notice shall be included in
  15 * all copies or substantial portions of the Software.
  16 *
  17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23 * THE SOFTWARE.
  24 */
  25
  26#include "qemu-common.h"
  27#include "block/block_int.h"
  28#include "qemu/module.h"
  29#include "migration/migration.h"
  30#include <zlib.h>
  31
  32#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
  33#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
  34#define VMDK4_COMPRESSION_DEFLATE 1
  35#define VMDK4_FLAG_NL_DETECT (1 << 0)
  36#define VMDK4_FLAG_RGD (1 << 1)
  37/* Zeroed-grain enable bit */
  38#define VMDK4_FLAG_ZERO_GRAIN   (1 << 2)
  39#define VMDK4_FLAG_COMPRESS (1 << 16)
  40#define VMDK4_FLAG_MARKER (1 << 17)
  41#define VMDK4_GD_AT_END 0xffffffffffffffffULL
  42
  43#define VMDK_GTE_ZEROED 0x1
  44
  45/* VMDK internal error codes */
  46#define VMDK_OK      0
  47#define VMDK_ERROR   (-1)
  48/* Cluster not allocated */
  49#define VMDK_UNALLOC (-2)
  50#define VMDK_ZEROED  (-3)
  51
  52#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain"
  53
  54typedef struct {
  55    uint32_t version;
  56    uint32_t flags;
  57    uint32_t disk_sectors;
  58    uint32_t granularity;
  59    uint32_t l1dir_offset;
  60    uint32_t l1dir_size;
  61    uint32_t file_sectors;
  62    uint32_t cylinders;
  63    uint32_t heads;
  64    uint32_t sectors_per_track;
  65} VMDK3Header;
  66
  67typedef struct {
  68    uint32_t version;
  69    uint32_t flags;
  70    int64_t capacity;
  71    int64_t granularity;
  72    int64_t desc_offset;
  73    int64_t desc_size;
  74    int32_t num_gtes_per_gte;
  75    int64_t rgd_offset;
  76    int64_t gd_offset;
  77    int64_t grain_offset;
  78    char filler[1];
  79    char check_bytes[4];
  80    uint16_t compressAlgorithm;
  81} QEMU_PACKED VMDK4Header;
  82
  83#define L2_CACHE_SIZE 16
  84
  85typedef struct VmdkExtent {
  86    BlockDriverState *file;
  87    bool flat;
  88    bool compressed;
  89    bool has_marker;
  90    bool has_zero_grain;
  91    int version;
  92    int64_t sectors;
  93    int64_t end_sector;
  94    int64_t flat_start_offset;
  95    int64_t l1_table_offset;
  96    int64_t l1_backup_table_offset;
  97    uint32_t *l1_table;
  98    uint32_t *l1_backup_table;
  99    unsigned int l1_size;
 100    uint32_t l1_entry_sectors;
 101
 102    unsigned int l2_size;
 103    uint32_t *l2_cache;
 104    uint32_t l2_cache_offsets[L2_CACHE_SIZE];
 105    uint32_t l2_cache_counts[L2_CACHE_SIZE];
 106
 107    unsigned int cluster_sectors;
 108} VmdkExtent;
 109
 110typedef struct BDRVVmdkState {
 111    CoMutex lock;
 112    int desc_offset;
 113    bool cid_updated;
 114    uint32_t parent_cid;
 115    int num_extents;
 116    /* Extent array with num_extents entries, ascend ordered by address */
 117    VmdkExtent *extents;
 118    Error *migration_blocker;
 119} BDRVVmdkState;
 120
 121typedef struct VmdkMetaData {
 122    uint32_t offset;
 123    unsigned int l1_index;
 124    unsigned int l2_index;
 125    unsigned int l2_offset;
 126    int valid;
 127    uint32_t *l2_cache_entry;
 128} VmdkMetaData;
 129
 130typedef struct VmdkGrainMarker {
 131    uint64_t lba;
 132    uint32_t size;
 133    uint8_t  data[0];
 134} VmdkGrainMarker;
 135
 136enum {
 137    MARKER_END_OF_STREAM    = 0,
 138    MARKER_GRAIN_TABLE      = 1,
 139    MARKER_GRAIN_DIRECTORY  = 2,
 140    MARKER_FOOTER           = 3,
 141};
 142
 143static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
 144{
 145    uint32_t magic;
 146
 147    if (buf_size < 4) {
 148        return 0;
 149    }
 150    magic = be32_to_cpu(*(uint32_t *)buf);
 151    if (magic == VMDK3_MAGIC ||
 152        magic == VMDK4_MAGIC) {
 153        return 100;
 154    } else {
 155        const char *p = (const char *)buf;
 156        const char *end = p + buf_size;
 157        while (p < end) {
 158            if (*p == '#') {
 159                /* skip comment line */
 160                while (p < end && *p != '\n') {
 161                    p++;
 162                }
 163                p++;
 164                continue;
 165            }
 166            if (*p == ' ') {
 167                while (p < end && *p == ' ') {
 168                    p++;
 169                }
 170                /* skip '\r' if windows line endings used. */
 171                if (p < end && *p == '\r') {
 172                    p++;
 173                }
 174                /* only accept blank lines before 'version=' line */
 175                if (p == end || *p != '\n') {
 176                    return 0;
 177                }
 178                p++;
 179                continue;
 180            }
 181            if (end - p >= strlen("version=X\n")) {
 182                if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 ||
 183                    strncmp("version=2\n", p, strlen("version=2\n")) == 0) {
 184                    return 100;
 185                }
 186            }
 187            if (end - p >= strlen("version=X\r\n")) {
 188                if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 ||
 189                    strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) {
 190                    return 100;
 191                }
 192            }
 193            return 0;
 194        }
 195        return 0;
 196    }
 197}
 198
 199#define CHECK_CID 1
 200
 201#define SECTOR_SIZE 512
 202#define DESC_SIZE (20 * SECTOR_SIZE)    /* 20 sectors of 512 bytes each */
 203#define BUF_SIZE 4096
 204#define HEADER_SIZE 512                 /* first sector of 512 bytes */
 205
 206static void vmdk_free_extents(BlockDriverState *bs)
 207{
 208    int i;
 209    BDRVVmdkState *s = bs->opaque;
 210    VmdkExtent *e;
 211
 212    for (i = 0; i < s->num_extents; i++) {
 213        e = &s->extents[i];
 214        g_free(e->l1_table);
 215        g_free(e->l2_cache);
 216        g_free(e->l1_backup_table);
 217        if (e->file != bs->file) {
 218            bdrv_delete(e->file);
 219        }
 220    }
 221    g_free(s->extents);
 222}
 223
 224static void vmdk_free_last_extent(BlockDriverState *bs)
 225{
 226    BDRVVmdkState *s = bs->opaque;
 227
 228    if (s->num_extents == 0) {
 229        return;
 230    }
 231    s->num_extents--;
 232    s->extents = g_realloc(s->extents, s->num_extents * sizeof(VmdkExtent));
 233}
 234
 235static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
 236{
 237    char desc[DESC_SIZE];
 238    uint32_t cid = 0xffffffff;
 239    const char *p_name, *cid_str;
 240    size_t cid_str_size;
 241    BDRVVmdkState *s = bs->opaque;
 242    int ret;
 243
 244    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
 245    if (ret < 0) {
 246        return 0;
 247    }
 248
 249    if (parent) {
 250        cid_str = "parentCID";
 251        cid_str_size = sizeof("parentCID");
 252    } else {
 253        cid_str = "CID";
 254        cid_str_size = sizeof("CID");
 255    }
 256
 257    desc[DESC_SIZE - 1] = '\0';
 258    p_name = strstr(desc, cid_str);
 259    if (p_name != NULL) {
 260        p_name += cid_str_size;
 261        sscanf(p_name, "%x", &cid);
 262    }
 263
 264    return cid;
 265}
 266
 267static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
 268{
 269    char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
 270    char *p_name, *tmp_str;
 271    BDRVVmdkState *s = bs->opaque;
 272    int ret;
 273
 274    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
 275    if (ret < 0) {
 276        return ret;
 277    }
 278
 279    desc[DESC_SIZE - 1] = '\0';
 280    tmp_str = strstr(desc, "parentCID");
 281    if (tmp_str == NULL) {
 282        return -EINVAL;
 283    }
 284
 285    pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
 286    p_name = strstr(desc, "CID");
 287    if (p_name != NULL) {
 288        p_name += sizeof("CID");
 289        snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid);
 290        pstrcat(desc, sizeof(desc), tmp_desc);
 291    }
 292
 293    ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE);
 294    if (ret < 0) {
 295        return ret;
 296    }
 297
 298    return 0;
 299}
 300
 301static int vmdk_is_cid_valid(BlockDriverState *bs)
 302{
 303#ifdef CHECK_CID
 304    BDRVVmdkState *s = bs->opaque;
 305    BlockDriverState *p_bs = bs->backing_hd;
 306    uint32_t cur_pcid;
 307
 308    if (p_bs) {
 309        cur_pcid = vmdk_read_cid(p_bs, 0);
 310        if (s->parent_cid != cur_pcid) {
 311            /* CID not valid */
 312            return 0;
 313        }
 314    }
 315#endif
 316    /* CID valid */
 317    return 1;
 318}
 319
 320/* Queue extents, if any, for reopen() */
 321static int vmdk_reopen_prepare(BDRVReopenState *state,
 322                               BlockReopenQueue *queue, Error **errp)
 323{
 324    BDRVVmdkState *s;
 325    int ret = -1;
 326    int i;
 327    VmdkExtent *e;
 328
 329    assert(state != NULL);
 330    assert(state->bs != NULL);
 331
 332    if (queue == NULL) {
 333        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
 334                 "No reopen queue for VMDK extents");
 335        goto exit;
 336    }
 337
 338    s = state->bs->opaque;
 339
 340    assert(s != NULL);
 341
 342    for (i = 0; i < s->num_extents; i++) {
 343        e = &s->extents[i];
 344        if (e->file != state->bs->file) {
 345            bdrv_reopen_queue(queue, e->file, state->flags);
 346        }
 347    }
 348    ret = 0;
 349
 350exit:
 351    return ret;
 352}
 353
 354static int vmdk_parent_open(BlockDriverState *bs)
 355{
 356    char *p_name;
 357    char desc[DESC_SIZE + 1];
 358    BDRVVmdkState *s = bs->opaque;
 359    int ret;
 360
 361    desc[DESC_SIZE] = '\0';
 362    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
 363    if (ret < 0) {
 364        return ret;
 365    }
 366
 367    p_name = strstr(desc, "parentFileNameHint");
 368    if (p_name != NULL) {
 369        char *end_name;
 370
 371        p_name += sizeof("parentFileNameHint") + 1;
 372        end_name = strchr(p_name, '\"');
 373        if (end_name == NULL) {
 374            return -EINVAL;
 375        }
 376        if ((end_name - p_name) > sizeof(bs->backing_file) - 1) {
 377            return -EINVAL;
 378        }
 379
 380        pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
 381    }
 382
 383    return 0;
 384}
 385
 386/* Create and append extent to the extent array. Return the added VmdkExtent
 387 * address. return NULL if allocation failed. */
 388static VmdkExtent *vmdk_add_extent(BlockDriverState *bs,
 389                           BlockDriverState *file, bool flat, int64_t sectors,
 390                           int64_t l1_offset, int64_t l1_backup_offset,
 391                           uint32_t l1_size,
 392                           int l2_size, unsigned int cluster_sectors)
 393{
 394    VmdkExtent *extent;
 395    BDRVVmdkState *s = bs->opaque;
 396
 397    s->extents = g_realloc(s->extents,
 398                              (s->num_extents + 1) * sizeof(VmdkExtent));
 399    extent = &s->extents[s->num_extents];
 400    s->num_extents++;
 401
 402    memset(extent, 0, sizeof(VmdkExtent));
 403    extent->file = file;
 404    extent->flat = flat;
 405    extent->sectors = sectors;
 406    extent->l1_table_offset = l1_offset;
 407    extent->l1_backup_table_offset = l1_backup_offset;
 408    extent->l1_size = l1_size;
 409    extent->l1_entry_sectors = l2_size * cluster_sectors;
 410    extent->l2_size = l2_size;
 411    extent->cluster_sectors = cluster_sectors;
 412
 413    if (s->num_extents > 1) {
 414        extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
 415    } else {
 416        extent->end_sector = extent->sectors;
 417    }
 418    bs->total_sectors = extent->end_sector;
 419    return extent;
 420}
 421
 422static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent)
 423{
 424    int ret;
 425    int l1_size, i;
 426
 427    /* read the L1 table */
 428    l1_size = extent->l1_size * sizeof(uint32_t);
 429    extent->l1_table = g_malloc(l1_size);
 430    ret = bdrv_pread(extent->file,
 431                    extent->l1_table_offset,
 432                    extent->l1_table,
 433                    l1_size);
 434    if (ret < 0) {
 435        goto fail_l1;
 436    }
 437    for (i = 0; i < extent->l1_size; i++) {
 438        le32_to_cpus(&extent->l1_table[i]);
 439    }
 440
 441    if (extent->l1_backup_table_offset) {
 442        extent->l1_backup_table = g_malloc(l1_size);
 443        ret = bdrv_pread(extent->file,
 444                        extent->l1_backup_table_offset,
 445                        extent->l1_backup_table,
 446                        l1_size);
 447        if (ret < 0) {
 448            goto fail_l1b;
 449        }
 450        for (i = 0; i < extent->l1_size; i++) {
 451            le32_to_cpus(&extent->l1_backup_table[i]);
 452        }
 453    }
 454
 455    extent->l2_cache =
 456        g_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
 457    return 0;
 458 fail_l1b:
 459    g_free(extent->l1_backup_table);
 460 fail_l1:
 461    g_free(extent->l1_table);
 462    return ret;
 463}
 464
 465static int vmdk_open_vmdk3(BlockDriverState *bs,
 466                           BlockDriverState *file,
 467                           int flags)
 468{
 469    int ret;
 470    uint32_t magic;
 471    VMDK3Header header;
 472    VmdkExtent *extent;
 473
 474    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
 475    if (ret < 0) {
 476        return ret;
 477    }
 478    extent = vmdk_add_extent(bs,
 479                             bs->file, false,
 480                             le32_to_cpu(header.disk_sectors),
 481                             le32_to_cpu(header.l1dir_offset) << 9,
 482                             0, 1 << 6, 1 << 9,
 483                             le32_to_cpu(header.granularity));
 484    ret = vmdk_init_tables(bs, extent);
 485    if (ret) {
 486        /* free extent allocated by vmdk_add_extent */
 487        vmdk_free_last_extent(bs);
 488    }
 489    return ret;
 490}
 491
 492static int vmdk_open_desc_file(BlockDriverState *bs, int flags,
 493                               int64_t desc_offset);
 494
 495static int vmdk_open_vmdk4(BlockDriverState *bs,
 496                           BlockDriverState *file,
 497                           int flags)
 498{
 499    int ret;
 500    uint32_t magic;
 501    uint32_t l1_size, l1_entry_sectors;
 502    VMDK4Header header;
 503    VmdkExtent *extent;
 504    int64_t l1_backup_offset = 0;
 505
 506    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
 507    if (ret < 0) {
 508        return ret;
 509    }
 510    if (header.capacity == 0 && header.desc_offset) {
 511        return vmdk_open_desc_file(bs, flags, header.desc_offset << 9);
 512    }
 513
 514    if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
 515        /*
 516         * The footer takes precedence over the header, so read it in. The
 517         * footer starts at offset -1024 from the end: One sector for the
 518         * footer, and another one for the end-of-stream marker.
 519         */
 520        struct {
 521            struct {
 522                uint64_t val;
 523                uint32_t size;
 524                uint32_t type;
 525                uint8_t pad[512 - 16];
 526            } QEMU_PACKED footer_marker;
 527
 528            uint32_t magic;
 529            VMDK4Header header;
 530            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
 531
 532            struct {
 533                uint64_t val;
 534                uint32_t size;
 535                uint32_t type;
 536                uint8_t pad[512 - 16];
 537            } QEMU_PACKED eos_marker;
 538        } QEMU_PACKED footer;
 539
 540        ret = bdrv_pread(file,
 541            bs->file->total_sectors * 512 - 1536,
 542            &footer, sizeof(footer));
 543        if (ret < 0) {
 544            return ret;
 545        }
 546
 547        /* Some sanity checks for the footer */
 548        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
 549            le32_to_cpu(footer.footer_marker.size) != 0  ||
 550            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
 551            le64_to_cpu(footer.eos_marker.val) != 0  ||
 552            le32_to_cpu(footer.eos_marker.size) != 0  ||
 553            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
 554        {
 555            return -EINVAL;
 556        }
 557
 558        header = footer.header;
 559    }
 560
 561    l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gte)
 562                        * le64_to_cpu(header.granularity);
 563    if (l1_entry_sectors == 0) {
 564        return -EINVAL;
 565    }
 566    l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
 567                / l1_entry_sectors;
 568    if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
 569        l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
 570    }
 571    extent = vmdk_add_extent(bs, file, false,
 572                          le64_to_cpu(header.capacity),
 573                          le64_to_cpu(header.gd_offset) << 9,
 574                          l1_backup_offset,
 575                          l1_size,
 576                          le32_to_cpu(header.num_gtes_per_gte),
 577                          le64_to_cpu(header.granularity));
 578    extent->compressed =
 579        le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
 580    extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
 581    extent->version = le32_to_cpu(header.version);
 582    extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN;
 583    ret = vmdk_init_tables(bs, extent);
 584    if (ret) {
 585        /* free extent allocated by vmdk_add_extent */
 586        vmdk_free_last_extent(bs);
 587    }
 588    return ret;
 589}
 590
 591/* find an option value out of descriptor file */
 592static int vmdk_parse_description(const char *desc, const char *opt_name,
 593        char *buf, int buf_size)
 594{
 595    char *opt_pos, *opt_end;
 596    const char *end = desc + strlen(desc);
 597
 598    opt_pos = strstr(desc, opt_name);
 599    if (!opt_pos) {
 600        return VMDK_ERROR;
 601    }
 602    /* Skip "=\"" following opt_name */
 603    opt_pos += strlen(opt_name) + 2;
 604    if (opt_pos >= end) {
 605        return VMDK_ERROR;
 606    }
 607    opt_end = opt_pos;
 608    while (opt_end < end && *opt_end != '"') {
 609        opt_end++;
 610    }
 611    if (opt_end == end || buf_size < opt_end - opt_pos + 1) {
 612        return VMDK_ERROR;
 613    }
 614    pstrcpy(buf, opt_end - opt_pos + 1, opt_pos);
 615    return VMDK_OK;
 616}
 617
 618/* Open an extent file and append to bs array */
 619static int vmdk_open_sparse(BlockDriverState *bs,
 620                            BlockDriverState *file,
 621                            int flags)
 622{
 623    uint32_t magic;
 624
 625    if (bdrv_pread(file, 0, &magic, sizeof(magic)) != sizeof(magic)) {
 626        return -EIO;
 627    }
 628
 629    magic = be32_to_cpu(magic);
 630    switch (magic) {
 631        case VMDK3_MAGIC:
 632            return vmdk_open_vmdk3(bs, file, flags);
 633            break;
 634        case VMDK4_MAGIC:
 635            return vmdk_open_vmdk4(bs, file, flags);
 636            break;
 637        default:
 638            return -EMEDIUMTYPE;
 639            break;
 640    }
 641}
 642
 643static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
 644        const char *desc_file_path)
 645{
 646    int ret;
 647    char access[11];
 648    char type[11];
 649    char fname[512];
 650    const char *p = desc;
 651    int64_t sectors = 0;
 652    int64_t flat_offset;
 653    char extent_path[PATH_MAX];
 654    BlockDriverState *extent_file;
 655
 656    while (*p) {
 657        /* parse extent line:
 658         * RW [size in sectors] FLAT "file-name.vmdk" OFFSET
 659         * or
 660         * RW [size in sectors] SPARSE "file-name.vmdk"
 661         */
 662        flat_offset = -1;
 663        ret = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64,
 664                access, &sectors, type, fname, &flat_offset);
 665        if (ret < 4 || strcmp(access, "RW")) {
 666            goto next_line;
 667        } else if (!strcmp(type, "FLAT")) {
 668            if (ret != 5 || flat_offset < 0) {
 669                return -EINVAL;
 670            }
 671        } else if (ret != 4) {
 672            return -EINVAL;
 673        }
 674
 675        if (sectors <= 0 ||
 676            (strcmp(type, "FLAT") && strcmp(type, "SPARSE")) ||
 677            (strcmp(access, "RW"))) {
 678            goto next_line;
 679        }
 680
 681        path_combine(extent_path, sizeof(extent_path),
 682                desc_file_path, fname);
 683        ret = bdrv_file_open(&extent_file, extent_path, NULL, bs->open_flags);
 684        if (ret) {
 685            return ret;
 686        }
 687
 688        /* save to extents array */
 689        if (!strcmp(type, "FLAT")) {
 690            /* FLAT extent */
 691            VmdkExtent *extent;
 692
 693            extent = vmdk_add_extent(bs, extent_file, true, sectors,
 694                            0, 0, 0, 0, sectors);
 695            extent->flat_start_offset = flat_offset << 9;
 696        } else if (!strcmp(type, "SPARSE")) {
 697            /* SPARSE extent */
 698            ret = vmdk_open_sparse(bs, extent_file, bs->open_flags);
 699            if (ret) {
 700                bdrv_delete(extent_file);
 701                return ret;
 702            }
 703        } else {
 704            fprintf(stderr,
 705                "VMDK: Not supported extent type \"%s\""".\n", type);
 706            return -ENOTSUP;
 707        }
 708next_line:
 709        /* move to next line */
 710        while (*p && *p != '\n') {
 711            p++;
 712        }
 713        p++;
 714    }
 715    return 0;
 716}
 717
 718static int vmdk_open_desc_file(BlockDriverState *bs, int flags,
 719                               int64_t desc_offset)
 720{
 721    int ret;
 722    char buf[2048];
 723    char ct[128];
 724    BDRVVmdkState *s = bs->opaque;
 725
 726    ret = bdrv_pread(bs->file, desc_offset, buf, sizeof(buf));
 727    if (ret < 0) {
 728        return ret;
 729    }
 730    buf[2047] = '\0';
 731    if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
 732        return -EMEDIUMTYPE;
 733    }
 734    if (strcmp(ct, "monolithicFlat") &&
 735        strcmp(ct, "twoGbMaxExtentSparse") &&
 736        strcmp(ct, "twoGbMaxExtentFlat")) {
 737        fprintf(stderr,
 738                "VMDK: Not supported image type \"%s\""".\n", ct);
 739        return -ENOTSUP;
 740    }
 741    s->desc_offset = 0;
 742    return vmdk_parse_extents(buf, bs, bs->file->filename);
 743}
 744
 745static int vmdk_open(BlockDriverState *bs, QDict *options, int flags)
 746{
 747    int ret;
 748    BDRVVmdkState *s = bs->opaque;
 749
 750    if (vmdk_open_sparse(bs, bs->file, flags) == 0) {
 751        s->desc_offset = 0x200;
 752    } else {
 753        ret = vmdk_open_desc_file(bs, flags, 0);
 754        if (ret) {
 755            goto fail;
 756        }
 757    }
 758    /* try to open parent images, if exist */
 759    ret = vmdk_parent_open(bs);
 760    if (ret) {
 761        goto fail;
 762    }
 763    s->parent_cid = vmdk_read_cid(bs, 1);
 764    qemu_co_mutex_init(&s->lock);
 765
 766    /* Disable migration when VMDK images are used */
 767    error_set(&s->migration_blocker,
 768              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
 769              "vmdk", bs->device_name, "live migration");
 770    migrate_add_blocker(s->migration_blocker);
 771
 772    return 0;
 773
 774fail:
 775    vmdk_free_extents(bs);
 776    return ret;
 777}
 778
 779static int get_whole_cluster(BlockDriverState *bs,
 780                VmdkExtent *extent,
 781                uint64_t cluster_offset,
 782                uint64_t offset,
 783                bool allocate)
 784{
 785    /* 128 sectors * 512 bytes each = grain size 64KB */
 786    uint8_t  whole_grain[extent->cluster_sectors * 512];
 787
 788    /* we will be here if it's first write on non-exist grain(cluster).
 789     * try to read from parent image, if exist */
 790    if (bs->backing_hd) {
 791        int ret;
 792
 793        if (!vmdk_is_cid_valid(bs)) {
 794            return VMDK_ERROR;
 795        }
 796
 797        /* floor offset to cluster */
 798        offset -= offset % (extent->cluster_sectors * 512);
 799        ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
 800                extent->cluster_sectors);
 801        if (ret < 0) {
 802            return VMDK_ERROR;
 803        }
 804
 805        /* Write grain only into the active image */
 806        ret = bdrv_write(extent->file, cluster_offset, whole_grain,
 807                extent->cluster_sectors);
 808        if (ret < 0) {
 809            return VMDK_ERROR;
 810        }
 811    }
 812    return VMDK_OK;
 813}
 814
 815static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
 816{
 817    uint32_t offset;
 818    QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
 819    offset = cpu_to_le32(m_data->offset);
 820    /* update L2 table */
 821    if (bdrv_pwrite_sync(
 822                extent->file,
 823                ((int64_t)m_data->l2_offset * 512)
 824                    + (m_data->l2_index * sizeof(m_data->offset)),
 825                &offset, sizeof(offset)) < 0) {
 826        return VMDK_ERROR;
 827    }
 828    /* update backup L2 table */
 829    if (extent->l1_backup_table_offset != 0) {
 830        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
 831        if (bdrv_pwrite_sync(
 832                    extent->file,
 833                    ((int64_t)m_data->l2_offset * 512)
 834                        + (m_data->l2_index * sizeof(m_data->offset)),
 835                    &offset, sizeof(offset)) < 0) {
 836            return VMDK_ERROR;
 837        }
 838    }
 839    if (m_data->l2_cache_entry) {
 840        *m_data->l2_cache_entry = offset;
 841    }
 842
 843    return VMDK_OK;
 844}
 845
 846static int get_cluster_offset(BlockDriverState *bs,
 847                                    VmdkExtent *extent,
 848                                    VmdkMetaData *m_data,
 849                                    uint64_t offset,
 850                                    int allocate,
 851                                    uint64_t *cluster_offset)
 852{
 853    unsigned int l1_index, l2_offset, l2_index;
 854    int min_index, i, j;
 855    uint32_t min_count, *l2_table;
 856    bool zeroed = false;
 857
 858    if (m_data) {
 859        m_data->valid = 0;
 860    }
 861    if (extent->flat) {
 862        *cluster_offset = extent->flat_start_offset;
 863        return VMDK_OK;
 864    }
 865
 866    offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
 867    l1_index = (offset >> 9) / extent->l1_entry_sectors;
 868    if (l1_index >= extent->l1_size) {
 869        return VMDK_ERROR;
 870    }
 871    l2_offset = extent->l1_table[l1_index];
 872    if (!l2_offset) {
 873        return VMDK_UNALLOC;
 874    }
 875    for (i = 0; i < L2_CACHE_SIZE; i++) {
 876        if (l2_offset == extent->l2_cache_offsets[i]) {
 877            /* increment the hit count */
 878            if (++extent->l2_cache_counts[i] == 0xffffffff) {
 879                for (j = 0; j < L2_CACHE_SIZE; j++) {
 880                    extent->l2_cache_counts[j] >>= 1;
 881                }
 882            }
 883            l2_table = extent->l2_cache + (i * extent->l2_size);
 884            goto found;
 885        }
 886    }
 887    /* not found: load a new entry in the least used one */
 888    min_index = 0;
 889    min_count = 0xffffffff;
 890    for (i = 0; i < L2_CACHE_SIZE; i++) {
 891        if (extent->l2_cache_counts[i] < min_count) {
 892            min_count = extent->l2_cache_counts[i];
 893            min_index = i;
 894        }
 895    }
 896    l2_table = extent->l2_cache + (min_index * extent->l2_size);
 897    if (bdrv_pread(
 898                extent->file,
 899                (int64_t)l2_offset * 512,
 900                l2_table,
 901                extent->l2_size * sizeof(uint32_t)
 902            ) != extent->l2_size * sizeof(uint32_t)) {
 903        return VMDK_ERROR;
 904    }
 905
 906    extent->l2_cache_offsets[min_index] = l2_offset;
 907    extent->l2_cache_counts[min_index] = 1;
 908 found:
 909    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
 910    *cluster_offset = le32_to_cpu(l2_table[l2_index]);
 911
 912    if (m_data) {
 913        m_data->valid = 1;
 914        m_data->l1_index = l1_index;
 915        m_data->l2_index = l2_index;
 916        m_data->offset = *cluster_offset;
 917        m_data->l2_offset = l2_offset;
 918        m_data->l2_cache_entry = &l2_table[l2_index];
 919    }
 920    if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) {
 921        zeroed = true;
 922    }
 923
 924    if (!*cluster_offset || zeroed) {
 925        if (!allocate) {
 926            return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
 927        }
 928
 929        /* Avoid the L2 tables update for the images that have snapshots. */
 930        *cluster_offset = bdrv_getlength(extent->file);
 931        if (!extent->compressed) {
 932            bdrv_truncate(
 933                extent->file,
 934                *cluster_offset + (extent->cluster_sectors << 9)
 935            );
 936        }
 937
 938        *cluster_offset >>= 9;
 939        l2_table[l2_index] = cpu_to_le32(*cluster_offset);
 940
 941        /* First of all we write grain itself, to avoid race condition
 942         * that may to corrupt the image.
 943         * This problem may occur because of insufficient space on host disk
 944         * or inappropriate VM shutdown.
 945         */
 946        if (get_whole_cluster(
 947                bs, extent, *cluster_offset, offset, allocate) == -1) {
 948            return VMDK_ERROR;
 949        }
 950
 951        if (m_data) {
 952            m_data->offset = *cluster_offset;
 953        }
 954    }
 955    *cluster_offset <<= 9;
 956    return VMDK_OK;
 957}
 958
 959static VmdkExtent *find_extent(BDRVVmdkState *s,
 960                                int64_t sector_num, VmdkExtent *start_hint)
 961{
 962    VmdkExtent *extent = start_hint;
 963
 964    if (!extent) {
 965        extent = &s->extents[0];
 966    }
 967    while (extent < &s->extents[s->num_extents]) {
 968        if (sector_num < extent->end_sector) {
 969            return extent;
 970        }
 971        extent++;
 972    }
 973    return NULL;
 974}
 975
 976static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs,
 977        int64_t sector_num, int nb_sectors, int *pnum)
 978{
 979    BDRVVmdkState *s = bs->opaque;
 980    int64_t index_in_cluster, n, ret;
 981    uint64_t offset;
 982    VmdkExtent *extent;
 983
 984    extent = find_extent(s, sector_num, NULL);
 985    if (!extent) {
 986        return 0;
 987    }
 988    qemu_co_mutex_lock(&s->lock);
 989    ret = get_cluster_offset(bs, extent, NULL,
 990                            sector_num * 512, 0, &offset);
 991    qemu_co_mutex_unlock(&s->lock);
 992
 993    ret = (ret == VMDK_OK || ret == VMDK_ZEROED);
 994
 995    index_in_cluster = sector_num % extent->cluster_sectors;
 996    n = extent->cluster_sectors - index_in_cluster;
 997    if (n > nb_sectors) {
 998        n = nb_sectors;
 999    }
1000    *pnum = n;
1001    return ret;
1002}
1003
1004static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
1005                            int64_t offset_in_cluster, const uint8_t *buf,
1006                            int nb_sectors, int64_t sector_num)
1007{
1008    int ret;
1009    VmdkGrainMarker *data = NULL;
1010    uLongf buf_len;
1011    const uint8_t *write_buf = buf;
1012    int write_len = nb_sectors * 512;
1013
1014    if (extent->compressed) {
1015        if (!extent->has_marker) {
1016            ret = -EINVAL;
1017            goto out;
1018        }
1019        buf_len = (extent->cluster_sectors << 9) * 2;
1020        data = g_malloc(buf_len + sizeof(VmdkGrainMarker));
1021        if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK ||
1022                buf_len == 0) {
1023            ret = -EINVAL;
1024            goto out;
1025        }
1026        data->lba = sector_num;
1027        data->size = buf_len;
1028        write_buf = (uint8_t *)data;
1029        write_len = buf_len + sizeof(VmdkGrainMarker);
1030    }
1031    ret = bdrv_pwrite(extent->file,
1032                        cluster_offset + offset_in_cluster,
1033                        write_buf,
1034                        write_len);
1035    if (ret != write_len) {
1036        ret = ret < 0 ? ret : -EIO;
1037        goto out;
1038    }
1039    ret = 0;
1040 out:
1041    g_free(data);
1042    return ret;
1043}
1044
1045static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
1046                            int64_t offset_in_cluster, uint8_t *buf,
1047                            int nb_sectors)
1048{
1049    int ret;
1050    int cluster_bytes, buf_bytes;
1051    uint8_t *cluster_buf, *compressed_data;
1052    uint8_t *uncomp_buf;
1053    uint32_t data_len;
1054    VmdkGrainMarker *marker;
1055    uLongf buf_len;
1056
1057
1058    if (!extent->compressed) {
1059        ret = bdrv_pread(extent->file,
1060                          cluster_offset + offset_in_cluster,
1061                          buf, nb_sectors * 512);
1062        if (ret == nb_sectors * 512) {
1063            return 0;
1064        } else {
1065            return -EIO;
1066        }
1067    }
1068    cluster_bytes = extent->cluster_sectors * 512;
1069    /* Read two clusters in case GrainMarker + compressed data > one cluster */
1070    buf_bytes = cluster_bytes * 2;
1071    cluster_buf = g_malloc(buf_bytes);
1072    uncomp_buf = g_malloc(cluster_bytes);
1073    ret = bdrv_pread(extent->file,
1074                cluster_offset,
1075                cluster_buf, buf_bytes);
1076    if (ret < 0) {
1077        goto out;
1078    }
1079    compressed_data = cluster_buf;
1080    buf_len = cluster_bytes;
1081    data_len = cluster_bytes;
1082    if (extent->has_marker) {
1083        marker = (VmdkGrainMarker *)cluster_buf;
1084        compressed_data = marker->data;
1085        data_len = le32_to_cpu(marker->size);
1086    }
1087    if (!data_len || data_len > buf_bytes) {
1088        ret = -EINVAL;
1089        goto out;
1090    }
1091    ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len);
1092    if (ret != Z_OK) {
1093        ret = -EINVAL;
1094        goto out;
1095
1096    }
1097    if (offset_in_cluster < 0 ||
1098            offset_in_cluster + nb_sectors * 512 > buf_len) {
1099        ret = -EINVAL;
1100        goto out;
1101    }
1102    memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512);
1103    ret = 0;
1104
1105 out:
1106    g_free(uncomp_buf);
1107    g_free(cluster_buf);
1108    return ret;
1109}
1110
1111static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
1112                    uint8_t *buf, int nb_sectors)
1113{
1114    BDRVVmdkState *s = bs->opaque;
1115    int ret;
1116    uint64_t n, index_in_cluster;
1117    uint64_t extent_begin_sector, extent_relative_sector_num;
1118    VmdkExtent *extent = NULL;
1119    uint64_t cluster_offset;
1120
1121    while (nb_sectors > 0) {
1122        extent = find_extent(s, sector_num, extent);
1123        if (!extent) {
1124            return -EIO;
1125        }
1126        ret = get_cluster_offset(
1127                            bs, extent, NULL,
1128                            sector_num << 9, 0, &cluster_offset);
1129        extent_begin_sector = extent->end_sector - extent->sectors;
1130        extent_relative_sector_num = sector_num - extent_begin_sector;
1131        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
1132        n = extent->cluster_sectors - index_in_cluster;
1133        if (n > nb_sectors) {
1134            n = nb_sectors;
1135        }
1136        if (ret != VMDK_OK) {
1137            /* if not allocated, try to read from parent image, if exist */
1138            if (bs->backing_hd && ret != VMDK_ZEROED) {
1139                if (!vmdk_is_cid_valid(bs)) {
1140                    return -EINVAL;
1141                }
1142                ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
1143                if (ret < 0) {
1144                    return ret;
1145                }
1146            } else {
1147                memset(buf, 0, 512 * n);
1148            }
1149        } else {
1150            ret = vmdk_read_extent(extent,
1151                            cluster_offset, index_in_cluster * 512,
1152                            buf, n);
1153            if (ret) {
1154                return ret;
1155            }
1156        }
1157        nb_sectors -= n;
1158        sector_num += n;
1159        buf += n * 512;
1160    }
1161    return 0;
1162}
1163
1164static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
1165                                     uint8_t *buf, int nb_sectors)
1166{
1167    int ret;
1168    BDRVVmdkState *s = bs->opaque;
1169    qemu_co_mutex_lock(&s->lock);
1170    ret = vmdk_read(bs, sector_num, buf, nb_sectors);
1171    qemu_co_mutex_unlock(&s->lock);
1172    return ret;
1173}
1174
1175/**
1176 * vmdk_write:
1177 * @zeroed:       buf is ignored (data is zero), use zeroed_grain GTE feature
1178 * if possible, otherwise return -ENOTSUP.
1179 * @zero_dry_run: used for zeroed == true only, don't update L2 table, just
1180 *
1181 * Returns: error code with 0 for success.
1182 */
1183static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
1184                      const uint8_t *buf, int nb_sectors,
1185                      bool zeroed, bool zero_dry_run)
1186{
1187    BDRVVmdkState *s = bs->opaque;
1188    VmdkExtent *extent = NULL;
1189    int n, ret;
1190    int64_t index_in_cluster;
1191    uint64_t extent_begin_sector, extent_relative_sector_num;
1192    uint64_t cluster_offset;
1193    VmdkMetaData m_data;
1194
1195    if (sector_num > bs->total_sectors) {
1196        fprintf(stderr,
1197                "(VMDK) Wrong offset: sector_num=0x%" PRIx64
1198                " total_sectors=0x%" PRIx64 "\n",
1199                sector_num, bs->total_sectors);
1200        return -EIO;
1201    }
1202
1203    while (nb_sectors > 0) {
1204        extent = find_extent(s, sector_num, extent);
1205        if (!extent) {
1206            return -EIO;
1207        }
1208        ret = get_cluster_offset(
1209                                bs,
1210                                extent,
1211                                &m_data,
1212                                sector_num << 9, !extent->compressed,
1213                                &cluster_offset);
1214        if (extent->compressed) {
1215            if (ret == VMDK_OK) {
1216                /* Refuse write to allocated cluster for streamOptimized */
1217                fprintf(stderr,
1218                        "VMDK: can't write to allocated cluster"
1219                        " for streamOptimized\n");
1220                return -EIO;
1221            } else {
1222                /* allocate */
1223                ret = get_cluster_offset(
1224                                        bs,
1225                                        extent,
1226                                        &m_data,
1227                                        sector_num << 9, 1,
1228                                        &cluster_offset);
1229            }
1230        }
1231        if (ret == VMDK_ERROR) {
1232            return -EINVAL;
1233        }
1234        extent_begin_sector = extent->end_sector - extent->sectors;
1235        extent_relative_sector_num = sector_num - extent_begin_sector;
1236        index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
1237        n = extent->cluster_sectors - index_in_cluster;
1238        if (n > nb_sectors) {
1239            n = nb_sectors;
1240        }
1241        if (zeroed) {
1242            /* Do zeroed write, buf is ignored */
1243            if (extent->has_zero_grain &&
1244                    index_in_cluster == 0 &&
1245                    n >= extent->cluster_sectors) {
1246                n = extent->cluster_sectors;
1247                if (!zero_dry_run) {
1248                    m_data.offset = VMDK_GTE_ZEROED;
1249                    /* update L2 tables */
1250                    if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
1251                        return -EIO;
1252                    }
1253                }
1254            } else {
1255                return -ENOTSUP;
1256            }
1257        } else {
1258            ret = vmdk_write_extent(extent,
1259                            cluster_offset, index_in_cluster * 512,
1260                            buf, n, sector_num);
1261            if (ret) {
1262                return ret;
1263            }
1264            if (m_data.valid) {
1265                /* update L2 tables */
1266                if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
1267                    return -EIO;
1268                }
1269            }
1270        }
1271        nb_sectors -= n;
1272        sector_num += n;
1273        buf += n * 512;
1274
1275        /* update CID on the first write every time the virtual disk is
1276         * opened */
1277        if (!s->cid_updated) {
1278            ret = vmdk_write_cid(bs, time(NULL));
1279            if (ret < 0) {
1280                return ret;
1281            }
1282            s->cid_updated = true;
1283        }
1284    }
1285    return 0;
1286}
1287
1288static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num,
1289                                      const uint8_t *buf, int nb_sectors)
1290{
1291    int ret;
1292    BDRVVmdkState *s = bs->opaque;
1293    qemu_co_mutex_lock(&s->lock);
1294    ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false);
1295    qemu_co_mutex_unlock(&s->lock);
1296    return ret;
1297}
1298
1299static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs,
1300                                             int64_t sector_num,
1301                                             int nb_sectors)
1302{
1303    int ret;
1304    BDRVVmdkState *s = bs->opaque;
1305    qemu_co_mutex_lock(&s->lock);
1306    ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true);
1307    if (!ret) {
1308        ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false);
1309    }
1310    qemu_co_mutex_unlock(&s->lock);
1311    return ret;
1312}
1313
1314
1315static int vmdk_create_extent(const char *filename, int64_t filesize,
1316                              bool flat, bool compress, bool zeroed_grain)
1317{
1318    int ret, i;
1319    int fd = 0;
1320    VMDK4Header header;
1321    uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
1322
1323    fd = qemu_open(filename,
1324                   O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
1325                   0644);
1326    if (fd < 0) {
1327        return -errno;
1328    }
1329    if (flat) {
1330        ret = ftruncate(fd, filesize);
1331        if (ret < 0) {
1332            ret = -errno;
1333        }
1334        goto exit;
1335    }
1336    magic = cpu_to_be32(VMDK4_MAGIC);
1337    memset(&header, 0, sizeof(header));
1338    header.version = zeroed_grain ? 2 : 1;
1339    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
1340                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
1341                   | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
1342    header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
1343    header.capacity = filesize / 512;
1344    header.granularity = 128;
1345    header.num_gtes_per_gte = 512;
1346
1347    grains = (filesize / 512 + header.granularity - 1) / header.granularity;
1348    gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
1349    gt_count =
1350        (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
1351    gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
1352
1353    header.desc_offset = 1;
1354    header.desc_size = 20;
1355    header.rgd_offset = header.desc_offset + header.desc_size;
1356    header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count);
1357    header.grain_offset =
1358       ((header.gd_offset + gd_size + (gt_size * gt_count) +
1359         header.granularity - 1) / header.granularity) *
1360        header.granularity;
1361    /* swap endianness for all header fields */
1362    header.version = cpu_to_le32(header.version);
1363    header.flags = cpu_to_le32(header.flags);
1364    header.capacity = cpu_to_le64(header.capacity);
1365    header.granularity = cpu_to_le64(header.granularity);
1366    header.num_gtes_per_gte = cpu_to_le32(header.num_gtes_per_gte);
1367    header.desc_offset = cpu_to_le64(header.desc_offset);
1368    header.desc_size = cpu_to_le64(header.desc_size);
1369    header.rgd_offset = cpu_to_le64(header.rgd_offset);
1370    header.gd_offset = cpu_to_le64(header.gd_offset);
1371    header.grain_offset = cpu_to_le64(header.grain_offset);
1372    header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm);
1373
1374    header.check_bytes[0] = 0xa;
1375    header.check_bytes[1] = 0x20;
1376    header.check_bytes[2] = 0xd;
1377    header.check_bytes[3] = 0xa;
1378
1379    /* write all the data */
1380    ret = qemu_write_full(fd, &magic, sizeof(magic));
1381    if (ret != sizeof(magic)) {
1382        ret = -errno;
1383        goto exit;
1384    }
1385    ret = qemu_write_full(fd, &header, sizeof(header));
1386    if (ret != sizeof(header)) {
1387        ret = -errno;
1388        goto exit;
1389    }
1390
1391    ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9);
1392    if (ret < 0) {
1393        ret = -errno;
1394        goto exit;
1395    }
1396
1397    /* write grain directory */
1398    lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET);
1399    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size;
1400         i < gt_count; i++, tmp += gt_size) {
1401        ret = qemu_write_full(fd, &tmp, sizeof(tmp));
1402        if (ret != sizeof(tmp)) {
1403            ret = -errno;
1404            goto exit;
1405        }
1406    }
1407
1408    /* write backup grain directory */
1409    lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET);
1410    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size;
1411         i < gt_count; i++, tmp += gt_size) {
1412        ret = qemu_write_full(fd, &tmp, sizeof(tmp));
1413        if (ret != sizeof(tmp)) {
1414            ret = -errno;
1415            goto exit;
1416        }
1417    }
1418
1419    ret = 0;
1420 exit:
1421    qemu_close(fd);
1422    return ret;
1423}
1424
1425static int filename_decompose(const char *filename, char *path, char *prefix,
1426        char *postfix, size_t buf_len)
1427{
1428    const char *p, *q;
1429
1430    if (filename == NULL || !strlen(filename)) {
1431        fprintf(stderr, "Vmdk: no filename provided.\n");
1432        return VMDK_ERROR;
1433    }
1434    p = strrchr(filename, '/');
1435    if (p == NULL) {
1436        p = strrchr(filename, '\\');
1437    }
1438    if (p == NULL) {
1439        p = strrchr(filename, ':');
1440    }
1441    if (p != NULL) {
1442        p++;
1443        if (p - filename >= buf_len) {
1444            return VMDK_ERROR;
1445        }
1446        pstrcpy(path, p - filename + 1, filename);
1447    } else {
1448        p = filename;
1449        path[0] = '\0';
1450    }
1451    q = strrchr(p, '.');
1452    if (q == NULL) {
1453        pstrcpy(prefix, buf_len, p);
1454        postfix[0] = '\0';
1455    } else {
1456        if (q - p >= buf_len) {
1457            return VMDK_ERROR;
1458        }
1459        pstrcpy(prefix, q - p + 1, p);
1460        pstrcpy(postfix, buf_len, q);
1461    }
1462    return VMDK_OK;
1463}
1464
1465static int relative_path(char *dest, int dest_size,
1466        const char *base, const char *target)
1467{
1468    int i = 0;
1469    int n = 0;
1470    const char *p, *q;
1471#ifdef _WIN32
1472    const char *sep = "\\";
1473#else
1474    const char *sep = "/";
1475#endif
1476
1477    if (!(dest && base && target)) {
1478        return VMDK_ERROR;
1479    }
1480    if (path_is_absolute(target)) {
1481        pstrcpy(dest, dest_size, target);
1482        return VMDK_OK;
1483    }
1484    while (base[i] == target[i]) {
1485        i++;
1486    }
1487    p = &base[i];
1488    q = &target[i];
1489    while (*p) {
1490        if (*p == *sep) {
1491            n++;
1492        }
1493        p++;
1494    }
1495    dest[0] = '\0';
1496    for (; n; n--) {
1497        pstrcat(dest, dest_size, "..");
1498        pstrcat(dest, dest_size, sep);
1499    }
1500    pstrcat(dest, dest_size, q);
1501    return VMDK_OK;
1502}
1503
1504static int vmdk_create(const char *filename, QEMUOptionParameter *options)
1505{
1506    int fd, idx = 0;
1507    char desc[BUF_SIZE];
1508    int64_t total_size = 0, filesize;
1509    const char *adapter_type = NULL;
1510    const char *backing_file = NULL;
1511    const char *fmt = NULL;
1512    int flags = 0;
1513    int ret = 0;
1514    bool flat, split, compress;
1515    char ext_desc_lines[BUF_SIZE] = "";
1516    char path[PATH_MAX], prefix[PATH_MAX], postfix[PATH_MAX];
1517    const int64_t split_size = 0x80000000;  /* VMDK has constant split size */
1518    const char *desc_extent_line;
1519    char parent_desc_line[BUF_SIZE] = "";
1520    uint32_t parent_cid = 0xffffffff;
1521    uint32_t number_heads = 16;
1522    bool zeroed_grain = false;
1523    const char desc_template[] =
1524        "# Disk DescriptorFile\n"
1525        "version=1\n"
1526        "CID=%x\n"
1527        "parentCID=%x\n"
1528        "createType=\"%s\"\n"
1529        "%s"
1530        "\n"
1531        "# Extent description\n"
1532        "%s"
1533        "\n"
1534        "# The Disk Data Base\n"
1535        "#DDB\n"
1536        "\n"
1537        "ddb.virtualHWVersion = \"%d\"\n"
1538        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
1539        "ddb.geometry.heads = \"%d\"\n"
1540        "ddb.geometry.sectors = \"63\"\n"
1541        "ddb.adapterType = \"%s\"\n";
1542
1543    if (filename_decompose(filename, path, prefix, postfix, PATH_MAX)) {
1544        return -EINVAL;
1545    }
1546    /* Read out options */
1547    while (options && options->name) {
1548        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1549            total_size = options->value.n;
1550        } else if (!strcmp(options->name, BLOCK_OPT_ADAPTER_TYPE)) {
1551            adapter_type = options->value.s;
1552        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1553            backing_file = options->value.s;
1554        } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
1555            flags |= options->value.n ? BLOCK_FLAG_COMPAT6 : 0;
1556        } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) {
1557            fmt = options->value.s;
1558        } else if (!strcmp(options->name, BLOCK_OPT_ZEROED_GRAIN)) {
1559            zeroed_grain |= options->value.n;
1560        }
1561        options++;
1562    }
1563    if (!adapter_type) {
1564        adapter_type = "ide";
1565    } else if (strcmp(adapter_type, "ide") &&
1566               strcmp(adapter_type, "buslogic") &&
1567               strcmp(adapter_type, "lsilogic") &&
1568               strcmp(adapter_type, "legacyESX")) {
1569        fprintf(stderr, "VMDK: Unknown adapter type: '%s'.\n", adapter_type);
1570        return -EINVAL;
1571    }
1572    if (strcmp(adapter_type, "ide") != 0) {
1573        /* that's the number of heads with which vmware operates when
1574           creating, exporting, etc. vmdk files with a non-ide adapter type */
1575        number_heads = 255;
1576    }
1577    if (!fmt) {
1578        /* Default format to monolithicSparse */
1579        fmt = "monolithicSparse";
1580    } else if (strcmp(fmt, "monolithicFlat") &&
1581               strcmp(fmt, "monolithicSparse") &&
1582               strcmp(fmt, "twoGbMaxExtentSparse") &&
1583               strcmp(fmt, "twoGbMaxExtentFlat") &&
1584               strcmp(fmt, "streamOptimized")) {
1585        fprintf(stderr, "VMDK: Unknown subformat: %s\n", fmt);
1586        return -EINVAL;
1587    }
1588    split = !(strcmp(fmt, "twoGbMaxExtentFlat") &&
1589              strcmp(fmt, "twoGbMaxExtentSparse"));
1590    flat = !(strcmp(fmt, "monolithicFlat") &&
1591             strcmp(fmt, "twoGbMaxExtentFlat"));
1592    compress = !strcmp(fmt, "streamOptimized");
1593    if (flat) {
1594        desc_extent_line = "RW %lld FLAT \"%s\" 0\n";
1595    } else {
1596        desc_extent_line = "RW %lld SPARSE \"%s\"\n";
1597    }
1598    if (flat && backing_file) {
1599        /* not supporting backing file for flat image */
1600        return -ENOTSUP;
1601    }
1602    if (backing_file) {
1603        char parent_filename[PATH_MAX];
1604        BlockDriverState *bs = bdrv_new("");
1605        ret = bdrv_open(bs, backing_file, NULL, 0, NULL);
1606        if (ret != 0) {
1607            bdrv_delete(bs);
1608            return ret;
1609        }
1610        if (strcmp(bs->drv->format_name, "vmdk")) {
1611            bdrv_delete(bs);
1612            return -EINVAL;
1613        }
1614        parent_cid = vmdk_read_cid(bs, 0);
1615        bdrv_delete(bs);
1616        relative_path(parent_filename, sizeof(parent_filename),
1617                      filename, backing_file);
1618        snprintf(parent_desc_line, sizeof(parent_desc_line),
1619                "parentFileNameHint=\"%s\"", parent_filename);
1620    }
1621
1622    /* Create extents */
1623    filesize = total_size;
1624    while (filesize > 0) {
1625        char desc_line[BUF_SIZE];
1626        char ext_filename[PATH_MAX];
1627        char desc_filename[PATH_MAX];
1628        int64_t size = filesize;
1629
1630        if (split && size > split_size) {
1631            size = split_size;
1632        }
1633        if (split) {
1634            snprintf(desc_filename, sizeof(desc_filename), "%s-%c%03d%s",
1635                    prefix, flat ? 'f' : 's', ++idx, postfix);
1636        } else if (flat) {
1637            snprintf(desc_filename, sizeof(desc_filename), "%s-flat%s",
1638                    prefix, postfix);
1639        } else {
1640            snprintf(desc_filename, sizeof(desc_filename), "%s%s",
1641                    prefix, postfix);
1642        }
1643        snprintf(ext_filename, sizeof(ext_filename), "%s%s",
1644                path, desc_filename);
1645
1646        if (vmdk_create_extent(ext_filename, size,
1647                               flat, compress, zeroed_grain)) {
1648            return -EINVAL;
1649        }
1650        filesize -= size;
1651
1652        /* Format description line */
1653        snprintf(desc_line, sizeof(desc_line),
1654                    desc_extent_line, size / 512, desc_filename);
1655        pstrcat(ext_desc_lines, sizeof(ext_desc_lines), desc_line);
1656    }
1657    /* generate descriptor file */
1658    snprintf(desc, sizeof(desc), desc_template,
1659            (unsigned int)time(NULL),
1660            parent_cid,
1661            fmt,
1662            parent_desc_line,
1663            ext_desc_lines,
1664            (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
1665            total_size / (int64_t)(63 * number_heads * 512), number_heads,
1666                adapter_type);
1667    if (split || flat) {
1668        fd = qemu_open(filename,
1669                       O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
1670                       0644);
1671    } else {
1672        fd = qemu_open(filename,
1673                       O_WRONLY | O_BINARY | O_LARGEFILE,
1674                       0644);
1675    }
1676    if (fd < 0) {
1677        return -errno;
1678    }
1679    /* the descriptor offset = 0x200 */
1680    if (!split && !flat && 0x200 != lseek(fd, 0x200, SEEK_SET)) {
1681        ret = -errno;
1682        goto exit;
1683    }
1684    ret = qemu_write_full(fd, desc, strlen(desc));
1685    if (ret != strlen(desc)) {
1686        ret = -errno;
1687        goto exit;
1688    }
1689    ret = 0;
1690exit:
1691    qemu_close(fd);
1692    return ret;
1693}
1694
1695static void vmdk_close(BlockDriverState *bs)
1696{
1697    BDRVVmdkState *s = bs->opaque;
1698
1699    vmdk_free_extents(bs);
1700
1701    migrate_del_blocker(s->migration_blocker);
1702    error_free(s->migration_blocker);
1703}
1704
1705static coroutine_fn int vmdk_co_flush(BlockDriverState *bs)
1706{
1707    BDRVVmdkState *s = bs->opaque;
1708    int i, err;
1709    int ret = 0;
1710
1711    for (i = 0; i < s->num_extents; i++) {
1712        err = bdrv_co_flush(s->extents[i].file);
1713        if (err < 0) {
1714            ret = err;
1715        }
1716    }
1717    return ret;
1718}
1719
1720static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs)
1721{
1722    int i;
1723    int64_t ret = 0;
1724    int64_t r;
1725    BDRVVmdkState *s = bs->opaque;
1726
1727    ret = bdrv_get_allocated_file_size(bs->file);
1728    if (ret < 0) {
1729        return ret;
1730    }
1731    for (i = 0; i < s->num_extents; i++) {
1732        if (s->extents[i].file == bs->file) {
1733            continue;
1734        }
1735        r = bdrv_get_allocated_file_size(s->extents[i].file);
1736        if (r < 0) {
1737            return r;
1738        }
1739        ret += r;
1740    }
1741    return ret;
1742}
1743
1744static QEMUOptionParameter vmdk_create_options[] = {
1745    {
1746        .name = BLOCK_OPT_SIZE,
1747        .type = OPT_SIZE,
1748        .help = "Virtual disk size"
1749    },
1750    {
1751        .name = BLOCK_OPT_ADAPTER_TYPE,
1752        .type = OPT_STRING,
1753        .help = "Virtual adapter type, can be one of "
1754                "ide (default), lsilogic, buslogic or legacyESX"
1755    },
1756    {
1757        .name = BLOCK_OPT_BACKING_FILE,
1758        .type = OPT_STRING,
1759        .help = "File name of a base image"
1760    },
1761    {
1762        .name = BLOCK_OPT_COMPAT6,
1763        .type = OPT_FLAG,
1764        .help = "VMDK version 6 image"
1765    },
1766    {
1767        .name = BLOCK_OPT_SUBFMT,
1768        .type = OPT_STRING,
1769        .help =
1770            "VMDK flat extent format, can be one of "
1771            "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} "
1772    },
1773    {
1774        .name = BLOCK_OPT_ZEROED_GRAIN,
1775        .type = OPT_FLAG,
1776        .help = "Enable efficient zero writes using the zeroed-grain GTE feature"
1777    },
1778    { NULL }
1779};
1780
1781static BlockDriver bdrv_vmdk = {
1782    .format_name    = "vmdk",
1783    .instance_size  = sizeof(BDRVVmdkState),
1784    .bdrv_probe     = vmdk_probe,
1785    .bdrv_open      = vmdk_open,
1786    .bdrv_reopen_prepare = vmdk_reopen_prepare,
1787    .bdrv_read      = vmdk_co_read,
1788    .bdrv_write     = vmdk_co_write,
1789    .bdrv_co_write_zeroes = vmdk_co_write_zeroes,
1790    .bdrv_close     = vmdk_close,
1791    .bdrv_create    = vmdk_create,
1792    .bdrv_co_flush_to_disk  = vmdk_co_flush,
1793    .bdrv_co_is_allocated   = vmdk_co_is_allocated,
1794    .bdrv_get_allocated_file_size  = vmdk_get_allocated_file_size,
1795
1796    .create_options = vmdk_create_options,
1797};
1798
1799static void bdrv_vmdk_init(void)
1800{
1801    bdrv_register(&bdrv_vmdk);
1802}
1803
1804block_init(bdrv_vmdk_init);
1805