qemu/block/vmdk.c
   1/*
   2 * Block driver for the VMDK format
   3 *
   4 * Copyright (c) 2004 Fabrice Bellard
   5 * Copyright (c) 2005 Filip Navara
   6 *
   7 * Permission is hereby granted, free of charge, to any person obtaining a copy
   8 * of this software and associated documentation files (the "Software"), to deal
   9 * in the Software without restriction, including without limitation the rights
  10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 * copies of the Software, and to permit persons to whom the Software is
  12 * furnished to do so, subject to the following conditions:
  13 *
  14 * The above copyright notice and this permission notice shall be included in
  15 * all copies or substantial portions of the Software.
  16 *
  17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23 * THE SOFTWARE.
  24 */
  25
  26#include "qemu/osdep.h"
  27#include "qapi/error.h"
  28#include "block/block_int.h"
  29#include "sysemu/block-backend.h"
  30#include "qapi/qmp/qdict.h"
  31#include "qapi/qmp/qerror.h"
  32#include "qemu/error-report.h"
  33#include "qemu/module.h"
  34#include "qemu/option.h"
  35#include "qemu/bswap.h"
  36#include "migration/blocker.h"
  37#include "qemu/cutils.h"
  38#include <zlib.h>
  39
  40#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
  41#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
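/*
 * For illustration: read as big-endian 32-bit values, these magics match the
 * four ASCII signature bytes at offset 0 of an extent file:
 *
 *     "COWD" (VMDK3/VMFS sparse):  0x43 0x4F 0x57 0x44
 *     "KDMV" (VMDK4 sparse):       0x4B 0x44 0x4D 0x56
 *
 * vmdk_probe() below reads the first four bytes with be32_to_cpu() and
 * compares them against these constants.
 */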
  42#define VMDK4_COMPRESSION_DEFLATE 1
  43#define VMDK4_FLAG_NL_DETECT (1 << 0)
  44#define VMDK4_FLAG_RGD (1 << 1)
  45/* Zeroed-grain enable bit */
  46#define VMDK4_FLAG_ZERO_GRAIN   (1 << 2)
  47#define VMDK4_FLAG_COMPRESS (1 << 16)
  48#define VMDK4_FLAG_MARKER (1 << 17)
  49#define VMDK4_GD_AT_END 0xffffffffffffffffULL
  50
  51#define VMDK_EXTENT_MAX_SECTORS (1ULL << 32)
  52
  53#define VMDK_GTE_ZEROED 0x1
  54
  55/* VMDK internal error codes */
  56#define VMDK_OK      0
  57#define VMDK_ERROR   (-1)
  58/* Cluster not allocated */
  59#define VMDK_UNALLOC (-2)
  60#define VMDK_ZEROED  (-3)
  61
  62#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain"
  63#define BLOCK_OPT_TOOLSVERSION "toolsversion"
  64
  65typedef struct {
  66    uint32_t version;
  67    uint32_t flags;
  68    uint32_t disk_sectors;
  69    uint32_t granularity;
  70    uint32_t l1dir_offset;
  71    uint32_t l1dir_size;
  72    uint32_t file_sectors;
  73    uint32_t cylinders;
  74    uint32_t heads;
  75    uint32_t sectors_per_track;
  76} QEMU_PACKED VMDK3Header;
  77
  78typedef struct {
  79    uint32_t version;
  80    uint32_t flags;
  81    uint64_t capacity;
  82    uint64_t granularity;
  83    uint64_t desc_offset;
  84    uint64_t desc_size;
  85    /* Number of GrainTableEntries per GrainTable */
  86    uint32_t num_gtes_per_gt;
  87    uint64_t rgd_offset;
  88    uint64_t gd_offset;
  89    uint64_t grain_offset;
  90    char filler[1];
  91    char check_bytes[4];
  92    uint16_t compressAlgorithm;
  93} QEMU_PACKED VMDK4Header;
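/*
 * Illustrative note on the sparse header above: it immediately follows the
 * 4-byte magic and its multi-byte fields are stored little-endian. capacity
 * and granularity are expressed in 512-byte sectors, so e.g. the common
 * default of granularity = 128 means 64 KiB grains, and with
 * num_gtes_per_gt = 512 one grain table covers 512 * 64 KiB = 32 MiB of
 * guest data.
 */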
  94
  95typedef struct VMDKSESparseConstHeader {
  96    uint64_t magic;
  97    uint64_t version;
  98    uint64_t capacity;
  99    uint64_t grain_size;
 100    uint64_t grain_table_size;
 101    uint64_t flags;
 102    uint64_t reserved1;
 103    uint64_t reserved2;
 104    uint64_t reserved3;
 105    uint64_t reserved4;
 106    uint64_t volatile_header_offset;
 107    uint64_t volatile_header_size;
 108    uint64_t journal_header_offset;
 109    uint64_t journal_header_size;
 110    uint64_t journal_offset;
 111    uint64_t journal_size;
 112    uint64_t grain_dir_offset;
 113    uint64_t grain_dir_size;
 114    uint64_t grain_tables_offset;
 115    uint64_t grain_tables_size;
 116    uint64_t free_bitmap_offset;
 117    uint64_t free_bitmap_size;
 118    uint64_t backmap_offset;
 119    uint64_t backmap_size;
 120    uint64_t grains_offset;
 121    uint64_t grains_size;
 122    uint8_t pad[304];
 123} QEMU_PACKED VMDKSESparseConstHeader;
 124
 125typedef struct VMDKSESparseVolatileHeader {
 126    uint64_t magic;
 127    uint64_t free_gt_number;
 128    uint64_t next_txn_seq_number;
 129    uint64_t replay_journal;
 130    uint8_t pad[480];
 131} QEMU_PACKED VMDKSESparseVolatileHeader;
 132
 133#define L2_CACHE_SIZE 16
 134
 135typedef struct VmdkExtent {
 136    BdrvChild *file;
 137    bool flat;
 138    bool compressed;
 139    bool has_marker;
 140    bool has_zero_grain;
 141    bool sesparse;
 142    uint64_t sesparse_l2_tables_offset;
 143    uint64_t sesparse_clusters_offset;
 144    int32_t entry_size;
 145    int version;
 146    int64_t sectors;
 147    int64_t end_sector;
 148    int64_t flat_start_offset;
 149    int64_t l1_table_offset;
 150    int64_t l1_backup_table_offset;
 151    void *l1_table;
 152    uint32_t *l1_backup_table;
 153    unsigned int l1_size;
 154    uint32_t l1_entry_sectors;
 155
 156    unsigned int l2_size;
 157    void *l2_cache;
 158    uint32_t l2_cache_offsets[L2_CACHE_SIZE];
 159    uint32_t l2_cache_counts[L2_CACHE_SIZE];
 160
 161    int64_t cluster_sectors;
 162    int64_t next_cluster_sector;
 163    char *type;
 164} VmdkExtent;
 165
 166typedef struct BDRVVmdkState {
 167    CoMutex lock;
 168    uint64_t desc_offset;
 169    bool cid_updated;
 170    bool cid_checked;
 171    uint32_t cid;
 172    uint32_t parent_cid;
 173    int num_extents;
 174    /* Extent array with num_extents entries, ordered by address, ascending */
 175    VmdkExtent *extents;
 176    Error *migration_blocker;
 177    char *create_type;
 178} BDRVVmdkState;
 179
 180typedef struct VmdkMetaData {
 181    unsigned int l1_index;
 182    unsigned int l2_index;
 183    unsigned int l2_offset;
 184    bool new_allocation;
 185    uint32_t *l2_cache_entry;
 186} VmdkMetaData;
 187
 188typedef struct VmdkGrainMarker {
 189    uint64_t lba;
 190    uint32_t size;
 191    uint8_t  data[];
 192} QEMU_PACKED VmdkGrainMarker;
 193
 194enum {
 195    MARKER_END_OF_STREAM    = 0,
 196    MARKER_GRAIN_TABLE      = 1,
 197    MARKER_GRAIN_DIRECTORY  = 2,
 198    MARKER_FOOTER           = 3,
 199};
 200
 201static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
 202{
 203    uint32_t magic;
 204
 205    if (buf_size < 4) {
 206        return 0;
 207    }
 208    magic = be32_to_cpu(*(uint32_t *)buf);
 209    if (magic == VMDK3_MAGIC ||
 210        magic == VMDK4_MAGIC) {
 211        return 100;
 212    } else {
 213        const char *p = (const char *)buf;
 214        const char *end = p + buf_size;
 215        while (p < end) {
 216            if (*p == '#') {
 217                /* skip comment line */
 218                while (p < end && *p != '\n') {
 219                    p++;
 220                }
 221                p++;
 222                continue;
 223            }
 224            if (*p == ' ') {
 225                while (p < end && *p == ' ') {
 226                    p++;
 227                }
 228                /* skip '\r' if Windows line endings are used. */
 229                if (p < end && *p == '\r') {
 230                    p++;
 231                }
 232                /* only accept blank lines before 'version=' line */
 233                if (p == end || *p != '\n') {
 234                    return 0;
 235                }
 236                p++;
 237                continue;
 238            }
 239            if (end - p >= strlen("version=X\n")) {
 240                if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 ||
 241                    strncmp("version=2\n", p, strlen("version=2\n")) == 0 ||
 242                    strncmp("version=3\n", p, strlen("version=3\n")) == 0) {
 243                    return 100;
 244                }
 245            }
 246            if (end - p >= strlen("version=X\r\n")) {
 247                if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 ||
 248                    strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0 ||
 249                    strncmp("version=3\r\n", p, strlen("version=3\r\n")) == 0) {
 250                    return 100;
 251                }
 252            }
 253            return 0;
 254        }
 255        return 0;
 256    }
 257}
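/*
 * For illustration, a text descriptor accepted by the scanner above typically
 * begins like this (blank lines and '#' comments may precede the version
 * line, which must read version=1, 2 or 3):
 *
 *     # Disk DescriptorFile
 *     version=1
 *     CID=fffffffe
 *     parentCID=ffffffff
 *     createType="monolithicSparse"
 */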
 258
 259#define SECTOR_SIZE 512
 260#define DESC_SIZE (20 * SECTOR_SIZE)    /* 20 sectors of 512 bytes each */
 261#define BUF_SIZE 4096
 262#define HEADER_SIZE 512                 /* first sector of 512 bytes */
 263
 264static void vmdk_free_extents(BlockDriverState *bs)
 265{
 266    int i;
 267    BDRVVmdkState *s = bs->opaque;
 268    VmdkExtent *e;
 269
 270    for (i = 0; i < s->num_extents; i++) {
 271        e = &s->extents[i];
 272        g_free(e->l1_table);
 273        g_free(e->l2_cache);
 274        g_free(e->l1_backup_table);
 275        g_free(e->type);
 276        if (e->file != bs->file) {
 277            bdrv_unref_child(bs, e->file);
 278        }
 279    }
 280    g_free(s->extents);
 281}
 282
 283static void vmdk_free_last_extent(BlockDriverState *bs)
 284{
 285    BDRVVmdkState *s = bs->opaque;
 286
 287    if (s->num_extents == 0) {
 288        return;
 289    }
 290    s->num_extents--;
 291    s->extents = g_renew(VmdkExtent, s->extents, s->num_extents);
 292}
 293
 294/* Return a negative errno, or 0 on success and write the CID into *pcid. */
 295static int vmdk_read_cid(BlockDriverState *bs, int parent, uint32_t *pcid)
 296{
 297    char *desc;
 298    uint32_t cid;
 299    const char *p_name, *cid_str;
 300    size_t cid_str_size;
 301    BDRVVmdkState *s = bs->opaque;
 302    int ret;
 303
 304    desc = g_malloc0(DESC_SIZE);
 305    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
 306    if (ret < 0) {
 307        goto out;
 308    }
 309
 310    if (parent) {
 311        cid_str = "parentCID";
 312        cid_str_size = sizeof("parentCID");
 313    } else {
 314        cid_str = "CID";
 315        cid_str_size = sizeof("CID");
 316    }
 317
 318    desc[DESC_SIZE - 1] = '\0';
 319    p_name = strstr(desc, cid_str);
 320    if (p_name == NULL) {
 321        ret = -EINVAL;
 322        goto out;
 323    }
 324    p_name += cid_str_size;
 325    if (sscanf(p_name, "%" SCNx32, &cid) != 1) {
 326        ret = -EINVAL;
 327        goto out;
 328    }
 329    *pcid = cid;
 330    ret = 0;
 331
 332out:
 333    g_free(desc);
 334    return ret;
 335}
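/*
 * Illustration of the parsing above, assuming a typical descriptor line such
 * as "CID=fffffffe": strstr() locates "CID", the sizeof()-based skip lands
 * just past the '=', and sscanf("%x") then yields 0xfffffffe. The same works
 * for "parentCID=ffffffff", since sizeof("parentCID") likewise covers the '='.
 */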
 336
 337static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
 338{
 339    char *desc, *tmp_desc;
 340    char *p_name, *tmp_str;
 341    BDRVVmdkState *s = bs->opaque;
 342    int ret = 0;
 343
 344    desc = g_malloc0(DESC_SIZE);
 345    tmp_desc = g_malloc0(DESC_SIZE);
 346    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
 347    if (ret < 0) {
 348        goto out;
 349    }
 350
 351    desc[DESC_SIZE - 1] = '\0';
 352    tmp_str = strstr(desc, "parentCID");
 353    if (tmp_str == NULL) {
 354        ret = -EINVAL;
 355        goto out;
 356    }
 357
 358    pstrcpy(tmp_desc, DESC_SIZE, tmp_str);
 359    p_name = strstr(desc, "CID");
 360    if (p_name != NULL) {
 361        p_name += sizeof("CID");
 362        snprintf(p_name, DESC_SIZE - (p_name - desc), "%" PRIx32 "\n", cid);
 363        pstrcat(desc, DESC_SIZE, tmp_desc);
 364    }
 365
 366    ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE);
 367
 368out:
 369    g_free(desc);
 370    g_free(tmp_desc);
 371    return ret;
 372}
 373
 374static int vmdk_is_cid_valid(BlockDriverState *bs)
 375{
 376    BDRVVmdkState *s = bs->opaque;
 377    uint32_t cur_pcid;
 378
 379    if (!s->cid_checked && bs->backing) {
 380        BlockDriverState *p_bs = bs->backing->bs;
 381
 382        if (strcmp(p_bs->drv->format_name, "vmdk")) {
 383            /* Backing file is not in vmdk format, so it does not have
 384             * a CID, which makes the overlay's parent CID invalid */
 385            return 0;
 386        }
 387
 388        if (vmdk_read_cid(p_bs, 0, &cur_pcid) != 0) {
 389            /* read failure: report as not valid */
 390            return 0;
 391        }
 392        if (s->parent_cid != cur_pcid) {
 393            /* CID not valid */
 394            return 0;
 395        }
 396    }
 397    s->cid_checked = true;
 398    /* CID valid */
 399    return 1;
 400}
 401
 402/* We have nothing to do for VMDK reopen, stubs just return success */
 403static int vmdk_reopen_prepare(BDRVReopenState *state,
 404                               BlockReopenQueue *queue, Error **errp)
 405{
 406    assert(state != NULL);
 407    assert(state->bs != NULL);
 408    return 0;
 409}
 410
 411static int vmdk_parent_open(BlockDriverState *bs)
 412{
 413    char *p_name;
 414    char *desc;
 415    BDRVVmdkState *s = bs->opaque;
 416    int ret;
 417
 418    desc = g_malloc0(DESC_SIZE + 1);
 419    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
 420    if (ret < 0) {
 421        goto out;
 422    }
 423    ret = 0;
 424
 425    p_name = strstr(desc, "parentFileNameHint");
 426    if (p_name != NULL) {
 427        char *end_name;
 428
 429        p_name += sizeof("parentFileNameHint") + 1;
 430        end_name = strchr(p_name, '\"');
 431        if (end_name == NULL) {
 432            ret = -EINVAL;
 433            goto out;
 434        }
 435        if ((end_name - p_name) > sizeof(bs->auto_backing_file) - 1) {
 436            ret = -EINVAL;
 437            goto out;
 438        }
 439
 440        pstrcpy(bs->auto_backing_file, end_name - p_name + 1, p_name);
 441        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
 442                bs->auto_backing_file);
 443        pstrcpy(bs->backing_format, sizeof(bs->backing_format),
 444                "vmdk");
 445    }
 446
 447out:
 448    g_free(desc);
 449    return ret;
 450}
 451
 452/* Create and append an extent to the extent array. On success, return 0 and
 453 * store the new extent in *new_extent; on failure, return a negative errno. */
 454static int vmdk_add_extent(BlockDriverState *bs,
 455                           BdrvChild *file, bool flat, int64_t sectors,
 456                           int64_t l1_offset, int64_t l1_backup_offset,
 457                           uint32_t l1_size,
 458                           int l2_size, uint64_t cluster_sectors,
 459                           VmdkExtent **new_extent,
 460                           Error **errp)
 461{
 462    VmdkExtent *extent;
 463    BDRVVmdkState *s = bs->opaque;
 464    int64_t nb_sectors;
 465
 466    if (cluster_sectors > 0x200000) {
 467        /* 0x200000 * 512 bytes = 1 GB for one cluster is unrealistic */
 468        error_setg(errp, "Invalid granularity, image may be corrupt");
 469        return -EFBIG;
 470    }
 471    if (l1_size > 32 * 1024 * 1024) {
 472        /*
 473         * Although with big capacity and small l1_entry_sectors, we can get a
 474         * big l1_size, we don't want unbounded value to allocate the table.
 475         * Limit it to 32M, which is enough to store:
 476         *     8TB  - for both VMDK3 & VMDK4 with
 477         *            minimal cluster size: 512B
 478         *            minimal L2 table size: 512 entries
 479         *            8 TB is still more than the maximal value supported for
 480         *            VMDK3 & VMDK4 which is 2TB.
 481         *     64TB - for "ESXi seSparse Extent"
 482         *            minimal cluster size: 512B (default is 4KB)
 483         *            L2 table size: 4096 entries (const).
 484         *            64TB is more than the maximal value supported for
 485         *            seSparse VMDKs (which is slightly less than 64TB)
 486         */
 487        error_setg(errp, "L1 size too big");
 488        return -EFBIG;
 489    }
 490
 491    nb_sectors = bdrv_nb_sectors(file->bs);
 492    if (nb_sectors < 0) {
 493        return nb_sectors;
 494    }
 495
 496    s->extents = g_renew(VmdkExtent, s->extents, s->num_extents + 1);
 497    extent = &s->extents[s->num_extents];
 498    s->num_extents++;
 499
 500    memset(extent, 0, sizeof(VmdkExtent));
 501    extent->file = file;
 502    extent->flat = flat;
 503    extent->sectors = sectors;
 504    extent->l1_table_offset = l1_offset;
 505    extent->l1_backup_table_offset = l1_backup_offset;
 506    extent->l1_size = l1_size;
 507    extent->l1_entry_sectors = l2_size * cluster_sectors;
 508    extent->l2_size = l2_size;
 509    extent->cluster_sectors = flat ? sectors : cluster_sectors;
 510    extent->next_cluster_sector = ROUND_UP(nb_sectors, cluster_sectors);
 511    extent->entry_size = sizeof(uint32_t);
 512
 513    if (s->num_extents > 1) {
 514        extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
 515    } else {
 516        extent->end_sector = extent->sectors;
 517    }
 518    bs->total_sectors = extent->end_sector;
 519    if (new_extent) {
 520        *new_extent = extent;
 521    }
 522    return 0;
 523}
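/*
 * Worked example (values assumed, matching common monolithicSparse defaults):
 * with cluster_sectors = 128 and l2_size = 512, l1_entry_sectors becomes
 * 512 * 128 = 65536 sectors, i.e. each L1 entry (grain table) maps 32 MiB of
 * guest data. For multi-extent images end_sector accumulates, so the second
 * of two 1 GiB extents covers guest sectors [2097152, 4194304).
 */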
 524
 525static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
 526                            Error **errp)
 527{
 528    int ret;
 529    size_t l1_size;
 530    int i;
 531
 532    /* read the L1 table */
 533    l1_size = extent->l1_size * extent->entry_size;
 534    extent->l1_table = g_try_malloc(l1_size);
 535    if (l1_size && extent->l1_table == NULL) {
 536        return -ENOMEM;
 537    }
 538
 539    ret = bdrv_pread(extent->file,
 540                     extent->l1_table_offset,
 541                     extent->l1_table,
 542                     l1_size);
 543    if (ret < 0) {
 544        bdrv_refresh_filename(extent->file->bs);
 545        error_setg_errno(errp, -ret,
 546                         "Could not read l1 table from extent '%s'",
 547                         extent->file->bs->filename);
 548        goto fail_l1;
 549    }
 550    for (i = 0; i < extent->l1_size; i++) {
 551        if (extent->entry_size == sizeof(uint64_t)) {
 552            le64_to_cpus((uint64_t *)extent->l1_table + i);
 553        } else {
 554            assert(extent->entry_size == sizeof(uint32_t));
 555            le32_to_cpus((uint32_t *)extent->l1_table + i);
 556        }
 557    }
 558
 559    if (extent->l1_backup_table_offset) {
 560        assert(!extent->sesparse);
 561        extent->l1_backup_table = g_try_malloc(l1_size);
 562        if (l1_size && extent->l1_backup_table == NULL) {
 563            ret = -ENOMEM;
 564            goto fail_l1;
 565        }
 566        ret = bdrv_pread(extent->file,
 567                         extent->l1_backup_table_offset,
 568                         extent->l1_backup_table,
 569                         l1_size);
 570        if (ret < 0) {
 571            bdrv_refresh_filename(extent->file->bs);
 572            error_setg_errno(errp, -ret,
 573                             "Could not read l1 backup table from extent '%s'",
 574                             extent->file->bs->filename);
 575            goto fail_l1b;
 576        }
 577        for (i = 0; i < extent->l1_size; i++) {
 578            le32_to_cpus(&extent->l1_backup_table[i]);
 579        }
 580    }
 581
 582    extent->l2_cache =
 583        g_malloc(extent->entry_size * extent->l2_size * L2_CACHE_SIZE);
 584    return 0;
 585 fail_l1b:
 586    g_free(extent->l1_backup_table);
 587 fail_l1:
 588    g_free(extent->l1_table);
 589    return ret;
 590}
 591
 592static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
 593                                 BdrvChild *file,
 594                                 int flags, Error **errp)
 595{
 596    int ret;
 597    uint32_t magic;
 598    VMDK3Header header;
 599    VmdkExtent *extent = NULL;
 600
 601    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
 602    if (ret < 0) {
 603        bdrv_refresh_filename(file->bs);
 604        error_setg_errno(errp, -ret,
 605                         "Could not read header from file '%s'",
 606                         file->bs->filename);
 607        return ret;
 608    }
 609    ret = vmdk_add_extent(bs, file, false,
 610                          le32_to_cpu(header.disk_sectors),
 611                          (int64_t)le32_to_cpu(header.l1dir_offset) << 9,
 612                          0,
 613                          le32_to_cpu(header.l1dir_size),
 614                          4096,
 615                          le32_to_cpu(header.granularity),
 616                          &extent,
 617                          errp);
 618    if (ret < 0) {
 619        return ret;
 620    }
 621    ret = vmdk_init_tables(bs, extent, errp);
 622    if (ret) {
 623        /* free extent allocated by vmdk_add_extent */
 624        vmdk_free_last_extent(bs);
 625    }
 626    return ret;
 627}
 628
 629#define SESPARSE_CONST_HEADER_MAGIC UINT64_C(0x00000000cafebabe)
 630#define SESPARSE_VOLATILE_HEADER_MAGIC UINT64_C(0x00000000cafecafe)
 631
 632/* Strict checks - format not officially documented */
 633static int check_se_sparse_const_header(VMDKSESparseConstHeader *header,
 634                                        Error **errp)
 635{
 636    header->magic = le64_to_cpu(header->magic);
 637    header->version = le64_to_cpu(header->version);
 638    header->grain_size = le64_to_cpu(header->grain_size);
 639    header->grain_table_size = le64_to_cpu(header->grain_table_size);
 640    header->flags = le64_to_cpu(header->flags);
 641    header->reserved1 = le64_to_cpu(header->reserved1);
 642    header->reserved2 = le64_to_cpu(header->reserved2);
 643    header->reserved3 = le64_to_cpu(header->reserved3);
 644    header->reserved4 = le64_to_cpu(header->reserved4);
 645
 646    header->volatile_header_offset =
 647        le64_to_cpu(header->volatile_header_offset);
 648    header->volatile_header_size = le64_to_cpu(header->volatile_header_size);
 649
 650    header->journal_header_offset = le64_to_cpu(header->journal_header_offset);
 651    header->journal_header_size = le64_to_cpu(header->journal_header_size);
 652
 653    header->journal_offset = le64_to_cpu(header->journal_offset);
 654    header->journal_size = le64_to_cpu(header->journal_size);
 655
 656    header->grain_dir_offset = le64_to_cpu(header->grain_dir_offset);
 657    header->grain_dir_size = le64_to_cpu(header->grain_dir_size);
 658
 659    header->grain_tables_offset = le64_to_cpu(header->grain_tables_offset);
 660    header->grain_tables_size = le64_to_cpu(header->grain_tables_size);
 661
 662    header->free_bitmap_offset = le64_to_cpu(header->free_bitmap_offset);
 663    header->free_bitmap_size = le64_to_cpu(header->free_bitmap_size);
 664
 665    header->backmap_offset = le64_to_cpu(header->backmap_offset);
 666    header->backmap_size = le64_to_cpu(header->backmap_size);
 667
 668    header->grains_offset = le64_to_cpu(header->grains_offset);
 669    header->grains_size = le64_to_cpu(header->grains_size);
 670
 671    if (header->magic != SESPARSE_CONST_HEADER_MAGIC) {
 672        error_setg(errp, "Bad const header magic: 0x%016" PRIx64,
 673                   header->magic);
 674        return -EINVAL;
 675    }
 676
 677    if (header->version != 0x0000000200000001) {
 678        error_setg(errp, "Unsupported version: 0x%016" PRIx64,
 679                   header->version);
 680        return -ENOTSUP;
 681    }
 682
 683    if (header->grain_size != 8) {
 684        error_setg(errp, "Unsupported grain size: %" PRIu64,
 685                   header->grain_size);
 686        return -ENOTSUP;
 687    }
 688
 689    if (header->grain_table_size != 64) {
 690        error_setg(errp, "Unsupported grain table size: %" PRIu64,
 691                   header->grain_table_size);
 692        return -ENOTSUP;
 693    }
 694
 695    if (header->flags != 0) {
 696        error_setg(errp, "Unsupported flags: 0x%016" PRIx64,
 697                   header->flags);
 698        return -ENOTSUP;
 699    }
 700
 701    if (header->reserved1 != 0 || header->reserved2 != 0 ||
 702        header->reserved3 != 0 || header->reserved4 != 0) {
 703        error_setg(errp, "Unsupported reserved bits:"
 704                   " 0x%016" PRIx64 " 0x%016" PRIx64
 705                   " 0x%016" PRIx64 " 0x%016" PRIx64,
 706                   header->reserved1, header->reserved2,
 707                   header->reserved3, header->reserved4);
 708        return -ENOTSUP;
 709    }
 710
 711    /* check that padding is 0 */
 712    if (!buffer_is_zero(header->pad, sizeof(header->pad))) {
 713        error_setg(errp, "Unsupported non-zero const header padding");
 714        return -ENOTSUP;
 715    }
 716
 717    return 0;
 718}
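/*
 * Putting the accepted values above together (illustrative arithmetic): a
 * grain_size of 8 sectors is a 4 KiB grain, and a grain_table_size of 64
 * sectors is 32 KiB per grain table, i.e. 4096 64-bit entries, so each grain
 * table covers 4096 * 4 KiB = 16 MiB of guest data.
 */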
 719
 720static int check_se_sparse_volatile_header(VMDKSESparseVolatileHeader *header,
 721                                           Error **errp)
 722{
 723    header->magic = le64_to_cpu(header->magic);
 724    header->free_gt_number = le64_to_cpu(header->free_gt_number);
 725    header->next_txn_seq_number = le64_to_cpu(header->next_txn_seq_number);
 726    header->replay_journal = le64_to_cpu(header->replay_journal);
 727
 728    if (header->magic != SESPARSE_VOLATILE_HEADER_MAGIC) {
 729        error_setg(errp, "Bad volatile header magic: 0x%016" PRIx64,
 730                   header->magic);
 731        return -EINVAL;
 732    }
 733
 734    if (header->replay_journal) {
 735        error_setg(errp, "Image is dirty, Replaying journal not supported");
 736        return -ENOTSUP;
 737    }
 738
 739    /* check that padding is 0 */
 740    if (!buffer_is_zero(header->pad, sizeof(header->pad))) {
 741        error_setg(errp, "Unsupported non-zero volatile header padding");
 742        return -ENOTSUP;
 743    }
 744
 745    return 0;
 746}
 747
 748static int vmdk_open_se_sparse(BlockDriverState *bs,
 749                               BdrvChild *file,
 750                               int flags, Error **errp)
 751{
 752    int ret;
 753    VMDKSESparseConstHeader const_header;
 754    VMDKSESparseVolatileHeader volatile_header;
 755    VmdkExtent *extent = NULL;
 756
 757    ret = bdrv_apply_auto_read_only(bs,
 758            "No write support for seSparse images available", errp);
 759    if (ret < 0) {
 760        return ret;
 761    }
 762
 763    assert(sizeof(const_header) == SECTOR_SIZE);
 764
 765    ret = bdrv_pread(file, 0, &const_header, sizeof(const_header));
 766    if (ret < 0) {
 767        bdrv_refresh_filename(file->bs);
 768        error_setg_errno(errp, -ret,
 769                         "Could not read const header from file '%s'",
 770                         file->bs->filename);
 771        return ret;
 772    }
 773
 774    /* check const header */
 775    ret = check_se_sparse_const_header(&const_header, errp);
 776    if (ret < 0) {
 777        return ret;
 778    }
 779
 780    assert(sizeof(volatile_header) == SECTOR_SIZE);
 781
 782    ret = bdrv_pread(file,
 783                     const_header.volatile_header_offset * SECTOR_SIZE,
 784                     &volatile_header, sizeof(volatile_header));
 785    if (ret < 0) {
 786        bdrv_refresh_filename(file->bs);
 787        error_setg_errno(errp, -ret,
 788                         "Could not read volatile header from file '%s'",
 789                         file->bs->filename);
 790        return ret;
 791    }
 792
 793    /* check volatile header */
 794    ret = check_se_sparse_volatile_header(&volatile_header, errp);
 795    if (ret < 0) {
 796        return ret;
 797    }
 798
 799    ret = vmdk_add_extent(bs, file, false,
 800                          const_header.capacity,
 801                          const_header.grain_dir_offset * SECTOR_SIZE,
 802                          0,
 803                          const_header.grain_dir_size *
 804                          SECTOR_SIZE / sizeof(uint64_t),
 805                          const_header.grain_table_size *
 806                          SECTOR_SIZE / sizeof(uint64_t),
 807                          const_header.grain_size,
 808                          &extent,
 809                          errp);
 810    if (ret < 0) {
 811        return ret;
 812    }
 813
 814    extent->sesparse = true;
 815    extent->sesparse_l2_tables_offset = const_header.grain_tables_offset;
 816    extent->sesparse_clusters_offset = const_header.grains_offset;
 817    extent->entry_size = sizeof(uint64_t);
 818
 819    ret = vmdk_init_tables(bs, extent, errp);
 820    if (ret) {
 821        /* free extent allocated by vmdk_add_extent */
 822        vmdk_free_last_extent(bs);
 823    }
 824
 825    return ret;
 826}
 827
 828static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
 829                               QDict *options, Error **errp);
 830
 831static char *vmdk_read_desc(BdrvChild *file, uint64_t desc_offset, Error **errp)
 832{
 833    int64_t size;
 834    char *buf;
 835    int ret;
 836
 837    size = bdrv_getlength(file->bs);
 838    if (size < 0) {
 839        error_setg_errno(errp, -size, "Could not access file");
 840        return NULL;
 841    }
 842
 843    if (size < 4) {
 844        /* Both the descriptor file and a sparse image must be much larger
 845         * than 4 bytes; callers of vmdk_read_desc also want to compare the
 846         * first 4 bytes with VMDK4_MAGIC, so error out if the file is smaller. */
 847        error_setg(errp, "File is too small, not a valid image");
 848        return NULL;
 849    }
 850
 851    size = MIN(size, (1 << 20) - 1);  /* avoid unbounded allocation */
 852    buf = g_malloc(size + 1);
 853
 854    ret = bdrv_pread(file, desc_offset, buf, size);
 855    if (ret < 0) {
 856        error_setg_errno(errp, -ret, "Could not read from file");
 857        g_free(buf);
 858        return NULL;
 859    }
 860    buf[ret] = 0;
 861
 862    return buf;
 863}
 864
 865static int vmdk_open_vmdk4(BlockDriverState *bs,
 866                           BdrvChild *file,
 867                           int flags, QDict *options, Error **errp)
 868{
 869    int ret;
 870    uint32_t magic;
 871    uint32_t l1_size, l1_entry_sectors;
 872    VMDK4Header header;
 873    VmdkExtent *extent = NULL;
 874    BDRVVmdkState *s = bs->opaque;
 875    int64_t l1_backup_offset = 0;
 876    bool compressed;
 877
 878    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
 879    if (ret < 0) {
 880        bdrv_refresh_filename(file->bs);
 881        error_setg_errno(errp, -ret,
 882                         "Could not read header from file '%s'",
 883                         file->bs->filename);
 884        return -EINVAL;
 885    }
 886    if (header.capacity == 0) {
 887        uint64_t desc_offset = le64_to_cpu(header.desc_offset);
 888        if (desc_offset) {
 889            char *buf = vmdk_read_desc(file, desc_offset << 9, errp);
 890            if (!buf) {
 891                return -EINVAL;
 892            }
 893            ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
 894            g_free(buf);
 895            return ret;
 896        }
 897    }
 898
 899    if (!s->create_type) {
 900        s->create_type = g_strdup("monolithicSparse");
 901    }
 902
 903    if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
 904        /*
 905         * The footer takes precedence over the header, so read it in. The
 906         * footer starts at offset -1024 from the end: One sector for the
 907         * footer, and another one for the end-of-stream marker.
 908         */
 909        struct {
 910            struct {
 911                uint64_t val;
 912                uint32_t size;
 913                uint32_t type;
 914                uint8_t pad[512 - 16];
 915            } QEMU_PACKED footer_marker;
 916
 917            uint32_t magic;
 918            VMDK4Header header;
 919            uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
 920
 921            struct {
 922                uint64_t val;
 923                uint32_t size;
 924                uint32_t type;
 925                uint8_t pad[512 - 16];
 926            } QEMU_PACKED eos_marker;
 927        } QEMU_PACKED footer;
 928
 929        ret = bdrv_pread(file,
 930            bs->file->bs->total_sectors * 512 - 1536,
 931            &footer, sizeof(footer));
 932        if (ret < 0) {
 933            error_setg_errno(errp, -ret, "Failed to read footer");
 934            return ret;
 935        }
 936
 937        /* Some sanity checks for the footer */
 938        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
 939            le32_to_cpu(footer.footer_marker.size) != 0  ||
 940            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
 941            le64_to_cpu(footer.eos_marker.val) != 0  ||
 942            le32_to_cpu(footer.eos_marker.size) != 0  ||
 943            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
 944        {
 945            error_setg(errp, "Invalid footer");
 946            return -EINVAL;
 947        }
 948
 949        header = footer.header;
 950    }
 951
 952    compressed =
 953        le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
 954    if (le32_to_cpu(header.version) > 3) {
 955        error_setg(errp, "Unsupported VMDK version %" PRIu32,
 956                   le32_to_cpu(header.version));
 957        return -ENOTSUP;
 958    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR) &&
 959               !compressed) {
 960        /* VMware KB 2064959 explains that version 3 added support for
 961         * persistent changed block tracking (CBT), and backup software can
 962         * read it as version=1 if it doesn't care about the changed area
 963         * information. So it is safe to restrict such images to read-only. */
 964        error_setg(errp, "VMDK version 3 must be read only");
 965        return -EINVAL;
 966    }
 967
 968    if (le32_to_cpu(header.num_gtes_per_gt) > 512) {
 969        error_setg(errp, "L2 table size too big");
 970        return -EINVAL;
 971    }
 972
 973    l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt)
 974                        * le64_to_cpu(header.granularity);
 975    if (l1_entry_sectors == 0) {
 976        error_setg(errp, "L1 entry size is invalid");
 977        return -EINVAL;
 978    }
 979    l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
 980                / l1_entry_sectors;
 981    if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
 982        l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
 983    }
 984    if (bdrv_nb_sectors(file->bs) < le64_to_cpu(header.grain_offset)) {
 985        error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes",
 986                   (int64_t)(le64_to_cpu(header.grain_offset)
 987                             * BDRV_SECTOR_SIZE));
 988        return -EINVAL;
 989    }
 990
 991    ret = vmdk_add_extent(bs, file, false,
 992                          le64_to_cpu(header.capacity),
 993                          le64_to_cpu(header.gd_offset) << 9,
 994                          l1_backup_offset,
 995                          l1_size,
 996                          le32_to_cpu(header.num_gtes_per_gt),
 997                          le64_to_cpu(header.granularity),
 998                          &extent,
 999                          errp);
1000    if (ret < 0) {
1001        return ret;
1002    }
1003    extent->compressed =
1004        le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
1005    if (extent->compressed) {
1006        g_free(s->create_type);
1007        s->create_type = g_strdup("streamOptimized");
1008    }
1009    extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
1010    extent->version = le32_to_cpu(header.version);
1011    extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN;
1012    ret = vmdk_init_tables(bs, extent, errp);
1013    if (ret) {
1014        /* free extent allocated by vmdk_add_extent */
1015        vmdk_free_last_extent(bs);
1016    }
1017    return ret;
1018}
1019
1020/* Find an option value in the descriptor file */
1021static int vmdk_parse_description(const char *desc, const char *opt_name,
1022        char *buf, int buf_size)
1023{
1024    char *opt_pos, *opt_end;
1025    const char *end = desc + strlen(desc);
1026
1027    opt_pos = strstr(desc, opt_name);
1028    if (!opt_pos) {
1029        return VMDK_ERROR;
1030    }
1031    /* Skip "=\"" following opt_name */
1032    opt_pos += strlen(opt_name) + 2;
1033    if (opt_pos >= end) {
1034        return VMDK_ERROR;
1035    }
1036    opt_end = opt_pos;
1037    while (opt_end < end && *opt_end != '"') {
1038        opt_end++;
1039    }
1040    if (opt_end == end || buf_size < opt_end - opt_pos + 1) {
1041        return VMDK_ERROR;
1042    }
1043    pstrcpy(buf, opt_end - opt_pos + 1, opt_pos);
1044    return VMDK_OK;
1045}
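/*
 * Example (hypothetical input): for a descriptor containing
 *     createType="twoGbMaxExtentSparse"
 * vmdk_parse_description(desc, "createType", buf, sizeof(buf)) skips the '="'
 * that follows the key and copies everything up to the closing quote, leaving
 * "twoGbMaxExtentSparse" in buf.
 */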
1046
1047/* Open an extent file and append it to the extent array of bs */
1048static int vmdk_open_sparse(BlockDriverState *bs, BdrvChild *file, int flags,
1049                            char *buf, QDict *options, Error **errp)
1050{
1051    uint32_t magic;
1052
1053    magic = ldl_be_p(buf);
1054    switch (magic) {
1055        case VMDK3_MAGIC:
1056            return vmdk_open_vmfs_sparse(bs, file, flags, errp);
1057        case VMDK4_MAGIC:
1058            return vmdk_open_vmdk4(bs, file, flags, options, errp);
1059        default:
1060            error_setg(errp, "Image not in VMDK format");
1061            return -EINVAL;
1062    }
1063}
1064
1065static const char *next_line(const char *s)
1066{
1067    while (*s) {
1068        if (*s == '\n') {
1069            return s + 1;
1070        }
1071        s++;
1072    }
1073    return s;
1074}
1075
1076static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
1077                              QDict *options, Error **errp)
1078{
1079    int ret;
1080    int matches;
1081    char access[11];
1082    char type[11];
1083    char fname[512];
1084    const char *p, *np;
1085    int64_t sectors = 0;
1086    int64_t flat_offset;
1087    char *desc_file_dir = NULL;
1088    char *extent_path;
1089    BdrvChild *extent_file;
1090    BdrvChildRole extent_role;
1091    BDRVVmdkState *s = bs->opaque;
1092    VmdkExtent *extent = NULL;
1093    char extent_opt_prefix[32];
1094    Error *local_err = NULL;
1095
1096    for (p = desc; *p; p = next_line(p)) {
1097        /* Parse an extent line in one of the following formats:
1098         *
1099         * RW [size in sectors] FLAT "file-name.vmdk" OFFSET
1100         * RW [size in sectors] SPARSE "file-name.vmdk"
1101         * RW [size in sectors] VMFS "file-name.vmdk"
1102         * RW [size in sectors] VMFSSPARSE "file-name.vmdk"
1103         * RW [size in sectors] SESPARSE "file-name.vmdk"
1104         */
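        /*
         * For example (illustrative values only), these lines match the
         * patterns above:
         *
         *     RW 4192256 SPARSE "disk-s001.vmdk"
         *     RW 2097152 FLAT "disk-flat.vmdk" 0
         */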
1105        flat_offset = -1;
1106        matches = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64,
1107                         access, &sectors, type, fname, &flat_offset);
1108        if (matches < 4 || strcmp(access, "RW")) {
1109            continue;
1110        } else if (!strcmp(type, "FLAT")) {
1111            if (matches != 5 || flat_offset < 0) {
1112                goto invalid;
1113            }
1114        } else if (!strcmp(type, "VMFS")) {
1115            if (matches == 4) {
1116                flat_offset = 0;
1117            } else {
1118                goto invalid;
1119            }
1120        } else if (matches != 4) {
1121            goto invalid;
1122        }
1123
1124        if (sectors <= 0 ||
1125            (strcmp(type, "FLAT") && strcmp(type, "SPARSE") &&
1126             strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE") &&
1127             strcmp(type, "SESPARSE")) ||
1128            (strcmp(access, "RW"))) {
1129            continue;
1130        }
1131
1132        if (path_is_absolute(fname)) {
1133            extent_path = g_strdup(fname);
1134        } else {
1135            if (!desc_file_dir) {
1136                desc_file_dir = bdrv_dirname(bs->file->bs, errp);
1137                if (!desc_file_dir) {
1138                    bdrv_refresh_filename(bs->file->bs);
1139                    error_prepend(errp, "Cannot use relative paths with VMDK "
1140                                  "descriptor file '%s': ",
1141                                  bs->file->bs->filename);
1142                    ret = -EINVAL;
1143                    goto out;
1144                }
1145            }
1146
1147            extent_path = g_strconcat(desc_file_dir, fname, NULL);
1148        }
1149
1150        ret = snprintf(extent_opt_prefix, 32, "extents.%d", s->num_extents);
1151        assert(ret < 32);
1152
1153        extent_role = BDRV_CHILD_DATA;
1154        if (strcmp(type, "FLAT") != 0 && strcmp(type, "VMFS") != 0) {
1155            /* non-flat extents have metadata */
1156            extent_role |= BDRV_CHILD_METADATA;
1157        }
1158
1159        extent_file = bdrv_open_child(extent_path, options, extent_opt_prefix,
1160                                      bs, &child_of_bds, extent_role, false,
1161                                      &local_err);
1162        g_free(extent_path);
1163        if (local_err) {
1164            error_propagate(errp, local_err);
1165            ret = -EINVAL;
1166            goto out;
1167        }
1168
1169        /* save to extents array */
1170        if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) {
1171            /* FLAT extent */
1172
1173            ret = vmdk_add_extent(bs, extent_file, true, sectors,
1174                            0, 0, 0, 0, 0, &extent, errp);
1175            if (ret < 0) {
1176                bdrv_unref_child(bs, extent_file);
1177                goto out;
1178            }
1179            extent->flat_start_offset = flat_offset << 9;
1180        } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) {
1181            /* SPARSE and VMFSSPARSE extents are both "COWD" sparse files */
1182            char *buf = vmdk_read_desc(extent_file, 0, errp);
1183            if (!buf) {
1184                ret = -EINVAL;
1185            } else {
1186                ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf,
1187                                       options, errp);
1188            }
1189            g_free(buf);
1190            if (ret) {
1191                bdrv_unref_child(bs, extent_file);
1192                goto out;
1193            }
1194            extent = &s->extents[s->num_extents - 1];
1195        } else if (!strcmp(type, "SESPARSE")) {
1196            ret = vmdk_open_se_sparse(bs, extent_file, bs->open_flags, errp);
1197            if (ret) {
1198                bdrv_unref_child(bs, extent_file);
1199                goto out;
1200            }
1201            extent = &s->extents[s->num_extents - 1];
1202        } else {
1203            error_setg(errp, "Unsupported extent type '%s'", type);
1204            bdrv_unref_child(bs, extent_file);
1205            ret = -ENOTSUP;
1206            goto out;
1207        }
1208        extent->type = g_strdup(type);
1209    }
1210
1211    ret = 0;
1212    goto out;
1213
1214invalid:
1215    np = next_line(p);
1216    assert(np != p);
1217    if (np[-1] == '\n') {
1218        np--;
1219    }
1220    error_setg(errp, "Invalid extent line: %.*s", (int)(np - p), p);
1221    ret = -EINVAL;
1222
1223out:
1224    g_free(desc_file_dir);
1225    return ret;
1226}
1227
1228static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
1229                               QDict *options, Error **errp)
1230{
1231    int ret;
1232    char ct[128];
1233    BDRVVmdkState *s = bs->opaque;
1234
1235    if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
1236        error_setg(errp, "invalid VMDK image descriptor");
1237        ret = -EINVAL;
1238        goto exit;
1239    }
1240    if (strcmp(ct, "monolithicFlat") &&
1241        strcmp(ct, "vmfs") &&
1242        strcmp(ct, "vmfsSparse") &&
1243        strcmp(ct, "seSparse") &&
1244        strcmp(ct, "twoGbMaxExtentSparse") &&
1245        strcmp(ct, "twoGbMaxExtentFlat")) {
1246        error_setg(errp, "Unsupported image type '%s'", ct);
1247        ret = -ENOTSUP;
1248        goto exit;
1249    }
1250    s->create_type = g_strdup(ct);
1251    s->desc_offset = 0;
1252    ret = vmdk_parse_extents(buf, bs, options, errp);
1253exit:
1254    return ret;
1255}
1256
1257static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
1258                     Error **errp)
1259{
1260    char *buf;
1261    int ret;
1262    BDRVVmdkState *s = bs->opaque;
1263    uint32_t magic;
1264
1265    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
1266                               BDRV_CHILD_IMAGE, false, errp);
1267    if (!bs->file) {
1268        return -EINVAL;
1269    }
1270
1271    buf = vmdk_read_desc(bs->file, 0, errp);
1272    if (!buf) {
1273        return -EINVAL;
1274    }
1275
1276    magic = ldl_be_p(buf);
1277    switch (magic) {
1278        case VMDK3_MAGIC:
1279        case VMDK4_MAGIC:
1280            ret = vmdk_open_sparse(bs, bs->file, flags, buf, options,
1281                                   errp);
1282            s->desc_offset = 0x200;
1283            break;
1284        default:
1285            /* No data in the descriptor file */
1286            bs->file->role &= ~BDRV_CHILD_DATA;
1287
1288            /* Must succeed: if anything, we have only given up permissions */
1289            bdrv_child_refresh_perms(bs, bs->file, &error_abort);
1290
1291            ret = vmdk_open_desc_file(bs, flags, buf, options, errp);
1292            break;
1293    }
1294    if (ret) {
1295        goto fail;
1296    }
1297
1298    /* try to open the parent image, if one exists */
1299    ret = vmdk_parent_open(bs);
1300    if (ret) {
1301        goto fail;
1302    }
1303    ret = vmdk_read_cid(bs, 0, &s->cid);
1304    if (ret) {
1305        goto fail;
1306    }
1307    ret = vmdk_read_cid(bs, 1, &s->parent_cid);
1308    if (ret) {
1309        goto fail;
1310    }
1311    qemu_co_mutex_init(&s->lock);
1312
1313    /* Disable migration when VMDK images are used */
1314    error_setg(&s->migration_blocker, "The vmdk format used by node '%s' "
1315               "does not support live migration",
1316               bdrv_get_device_or_node_name(bs));
1317    ret = migrate_add_blocker(s->migration_blocker, errp);
1318    if (ret < 0) {
1319        error_free(s->migration_blocker);
1320        goto fail;
1321    }
1322
1323    g_free(buf);
1324    return 0;
1325
1326fail:
1327    g_free(buf);
1328    g_free(s->create_type);
1329    s->create_type = NULL;
1330    vmdk_free_extents(bs);
1331    return ret;
1332}
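/*
 * Typical usage that exercises the open path above (assumed invocation, shown
 * for illustration only):
 *
 *     qemu-img create -f vmdk -o subformat=twoGbMaxExtentSparse disk.vmdk 4G
 *     qemu-img info disk.vmdk
 *
 * The first command produces a text descriptor plus extent files, so vmdk_open
 * takes the vmdk_open_desc_file() branch; a monolithicSparse image would be
 * detected by its KDMV magic and go through vmdk_open_sparse() instead.
 */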
1333
1334
1335static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
1336{
1337    BDRVVmdkState *s = bs->opaque;
1338    int i;
1339
1340    for (i = 0; i < s->num_extents; i++) {
1341        if (!s->extents[i].flat) {
1342            bs->bl.pwrite_zeroes_alignment =
1343                MAX(bs->bl.pwrite_zeroes_alignment,
1344                    s->extents[i].cluster_sectors << BDRV_SECTOR_BITS);
1345        }
1346    }
1347}
1348
1349/**
1350 * get_whole_cluster
1351 *
1352 * Copy the cluster of the backing file that covers @offset, otherwise write
1353 * zeroes, to the cluster at @cluster_offset. If @zeroed is true, we're
1354 * overwriting a zeroed cluster in the current layer and must not copy data
1355 * from the backing file.
1356 *
1357 * If @skip_start_bytes < @skip_end_bytes, the relative byte range
1358 * [@skip_start_bytes, @skip_end_bytes) is neither copied nor written; it is
1359 * left for the caller to fill with the user data of the request.
1360 */
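/*
 * Illustration (assuming 64 KiB clusters): a 4 KiB guest write at byte offset
 * 8192 into an unallocated grain leads to
 *     [0,     8192)   copied from the backing file (or zeroed),
 *     [8192,  12288)  skipped here and later filled with the guest data,
 *     [12288, 65536)  copied from the backing file (or zeroed),
 * all written to the newly allocated cluster in the extent file.
 */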
1361static int get_whole_cluster(BlockDriverState *bs,
1362                             VmdkExtent *extent,
1363                             uint64_t cluster_offset,
1364                             uint64_t offset,
1365                             uint64_t skip_start_bytes,
1366                             uint64_t skip_end_bytes,
1367                             bool zeroed)
1368{
1369    int ret = VMDK_OK;
1370    int64_t cluster_bytes;
1371    uint8_t *whole_grain;
1372    bool copy_from_backing;
1373
1374    /* For COW, align request sector_num to cluster start */
1375    cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
1376    offset = QEMU_ALIGN_DOWN(offset, cluster_bytes);
1377    whole_grain = qemu_blockalign(bs, cluster_bytes);
1378    copy_from_backing = bs->backing && !zeroed;
1379
1380    if (!copy_from_backing) {
1381        memset(whole_grain, 0, skip_start_bytes);
1382        memset(whole_grain + skip_end_bytes, 0, cluster_bytes - skip_end_bytes);
1383    }
1384
1385    assert(skip_end_bytes <= cluster_bytes);
1386    /* We get here on the first write to a grain (cluster) that does not
1387     * exist yet; try to read it from the parent image, if one exists. */
1388    if (bs->backing && !vmdk_is_cid_valid(bs)) {
1389        ret = VMDK_ERROR;
1390        goto exit;
1391    }
1392
1393    /* Read backing data before skip range */
1394    if (skip_start_bytes > 0) {
1395        if (copy_from_backing) {
1396            /* qcow2 emits this on bs->file instead of bs->backing */
1397            BLKDBG_EVENT(extent->file, BLKDBG_COW_READ);
1398            ret = bdrv_pread(bs->backing, offset, whole_grain,
1399                             skip_start_bytes);
1400            if (ret < 0) {
1401                ret = VMDK_ERROR;
1402                goto exit;
1403            }
1404        }
1405        BLKDBG_EVENT(extent->file, BLKDBG_COW_WRITE);
1406        ret = bdrv_pwrite(extent->file, cluster_offset, whole_grain,
1407                          skip_start_bytes);
1408        if (ret < 0) {
1409            ret = VMDK_ERROR;
1410            goto exit;
1411        }
1412    }
1413    /* Read backing data after skip range */
1414    if (skip_end_bytes < cluster_bytes) {
1415        if (copy_from_backing) {
1416            /* qcow2 emits this on bs->file instead of bs->backing */
1417            BLKDBG_EVENT(extent->file, BLKDBG_COW_READ);
1418            ret = bdrv_pread(bs->backing, offset + skip_end_bytes,
1419                             whole_grain + skip_end_bytes,
1420                             cluster_bytes - skip_end_bytes);
1421            if (ret < 0) {
1422                ret = VMDK_ERROR;
1423                goto exit;
1424            }
1425        }
1426        BLKDBG_EVENT(extent->file, BLKDBG_COW_WRITE);
1427        ret = bdrv_pwrite(extent->file, cluster_offset + skip_end_bytes,
1428                          whole_grain + skip_end_bytes,
1429                          cluster_bytes - skip_end_bytes);
1430        if (ret < 0) {
1431            ret = VMDK_ERROR;
1432            goto exit;
1433        }
1434    }
1435
1436    ret = VMDK_OK;
1437exit:
1438    qemu_vfree(whole_grain);
1439    return ret;
1440}
1441
1442static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
1443                         uint32_t offset)
1444{
1445    offset = cpu_to_le32(offset);
1446    /* update L2 table */
1447    BLKDBG_EVENT(extent->file, BLKDBG_L2_UPDATE);
1448    if (bdrv_pwrite(extent->file,
1449                ((int64_t)m_data->l2_offset * 512)
1450                    + (m_data->l2_index * sizeof(offset)),
1451                &offset, sizeof(offset)) < 0) {
1452        return VMDK_ERROR;
1453    }
1454    /* update backup L2 table */
1455    if (extent->l1_backup_table_offset != 0) {
1456        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
1457        if (bdrv_pwrite(extent->file,
1458                    ((int64_t)m_data->l2_offset * 512)
1459                        + (m_data->l2_index * sizeof(offset)),
1460                    &offset, sizeof(offset)) < 0) {
1461            return VMDK_ERROR;
1462        }
1463    }
1464    if (bdrv_flush(extent->file->bs) < 0) {
1465        return VMDK_ERROR;
1466    }
1467    if (m_data->l2_cache_entry) {
1468        *m_data->l2_cache_entry = offset;
1469    }
1470
1471    return VMDK_OK;
1472}
1473
1474/**
1475 * get_cluster_offset
1476 *
1477 * Look up cluster offset in extent file by sector number, and store in
1478 * @cluster_offset.
1479 *
1480 * For flat extents, the start offset as parsed from the description file is
1481 * returned.
1482 *
1483 * For sparse extents, look up in L1, L2 table. If allocate is true, return an
1484 * offset for a new cluster and update L2 cache. If there is a backing file,
1485 * COW is done before returning; otherwise, zeroes are written to the allocated
1486 * cluster. Both COW and zero writing skip the byte range
1487 * [@skip_start_bytes, @skip_end_bytes) passed in by the caller, because the
1488 * caller has new data to write there.
1489 *
1490 * Returns: VMDK_OK if cluster exists and mapped in the image.
1491 *          VMDK_UNALLOC if cluster is not mapped and @allocate is false.
1492 *          VMDK_ERROR if failed.
1493 */
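/*
 * Worked example of the L1/L2 lookup below (defaults assumed: 64 KiB grains,
 * i.e. cluster_sectors = 128, and l2_size = 512, so l1_entry_sectors = 65536;
 * single-extent image, so no extent-start adjustment): a request at guest
 * byte offset 100 MiB is sector 204800, which gives
 * l1_index = 204800 / 65536 = 3 and l2_index = (204800 / 128) % 512 = 64.
 */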
1494static int get_cluster_offset(BlockDriverState *bs,
1495                              VmdkExtent *extent,
1496                              VmdkMetaData *m_data,
1497                              uint64_t offset,
1498                              bool allocate,
1499                              uint64_t *cluster_offset,
1500                              uint64_t skip_start_bytes,
1501                              uint64_t skip_end_bytes)
1502{
1503    unsigned int l1_index, l2_offset, l2_index;
1504    int min_index, i, j;
1505    uint32_t min_count;
1506    void *l2_table;
1507    bool zeroed = false;
1508    int64_t ret;
1509    int64_t cluster_sector;
1510    unsigned int l2_size_bytes = extent->l2_size * extent->entry_size;
1511
1512    if (m_data) {
1513        m_data->new_allocation = false;
1514    }
1515    if (extent->flat) {
1516        *cluster_offset = extent->flat_start_offset;
1517        return VMDK_OK;
1518    }
1519
1520    offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
1521    l1_index = (offset >> 9) / extent->l1_entry_sectors;
1522    if (l1_index >= extent->l1_size) {
1523        return VMDK_ERROR;
1524    }
1525    if (extent->sesparse) {
1526        uint64_t l2_offset_u64;
1527
1528        assert(extent->entry_size == sizeof(uint64_t));
1529
1530        l2_offset_u64 = ((uint64_t *)extent->l1_table)[l1_index];
1531        if (l2_offset_u64 == 0) {
1532            l2_offset = 0;
1533        } else if ((l2_offset_u64 & 0xffffffff00000000) != 0x1000000000000000) {
1534            /*
1535             * The topmost nibble is 0x1 if the grain table is allocated.
1536             * Strict check: the topmost 4 bytes must be 0x10000000, since the
1537             * maximum supported disk size is 64 TB, so there are no more than
1538             * 64 TB / 16 MB grain directory entries, which fits in a uint32
1539             * (16 MB being the only supported grain table coverage).
1540             */
1541            return VMDK_ERROR;
1542        } else {
1543            l2_offset_u64 = l2_offset_u64 & 0x00000000ffffffff;
1544            l2_offset_u64 = extent->sesparse_l2_tables_offset +
1545                l2_offset_u64 * l2_size_bytes / SECTOR_SIZE;
1546            if (l2_offset_u64 > 0x00000000ffffffff) {
1547                return VMDK_ERROR;
1548            }
1549            l2_offset = (unsigned int)(l2_offset_u64);
1550        }
1551    } else {
1552        assert(extent->entry_size == sizeof(uint32_t));
1553        l2_offset = ((uint32_t *)extent->l1_table)[l1_index];
1554    }
1555    if (!l2_offset) {
1556        return VMDK_UNALLOC;
1557    }
1558    for (i = 0; i < L2_CACHE_SIZE; i++) {
1559        if (l2_offset == extent->l2_cache_offsets[i]) {
1560            /* increment the hit count */
1561            if (++extent->l2_cache_counts[i] == 0xffffffff) {
1562                for (j = 0; j < L2_CACHE_SIZE; j++) {
1563                    extent->l2_cache_counts[j] >>= 1;
1564                }
1565            }
1566            l2_table = (char *)extent->l2_cache + (i * l2_size_bytes);
1567            goto found;
1568        }
1569    }
1570    /* not found: load a new entry in the least used one */
1571    min_index = 0;
1572    min_count = 0xffffffff;
1573    for (i = 0; i < L2_CACHE_SIZE; i++) {
1574        if (extent->l2_cache_counts[i] < min_count) {
1575            min_count = extent->l2_cache_counts[i];
1576            min_index = i;
1577        }
1578    }
1579    l2_table = (char *)extent->l2_cache + (min_index * l2_size_bytes);
1580    BLKDBG_EVENT(extent->file, BLKDBG_L2_LOAD);
1581    if (bdrv_pread(extent->file,
1582                (int64_t)l2_offset * 512,
1583                l2_table,
1584                l2_size_bytes
1585            ) != l2_size_bytes) {
1586        return VMDK_ERROR;
1587    }
1588
1589    extent->l2_cache_offsets[min_index] = l2_offset;
1590    extent->l2_cache_counts[min_index] = 1;
1591 found:
1592    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
1593    if (m_data) {
1594        m_data->l1_index = l1_index;
1595        m_data->l2_index = l2_index;
1596        m_data->l2_offset = l2_offset;
1597        m_data->l2_cache_entry = ((uint32_t *)l2_table) + l2_index;
1598    }
1599
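    /*
     * SESparse grain table entries carry a type tag in the topmost nibble:
     * 0x0 unallocated, 0x1 SCSI-unmapped, 0x2 zero grain, 0x3 allocated.
     * For allocated grains the remaining bits are rotated into a cluster
     * number and rebased onto sesparse_clusters_offset below.
     */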
1600    if (extent->sesparse) {
1601        cluster_sector = le64_to_cpu(((uint64_t *)l2_table)[l2_index]);
1602        switch (cluster_sector & 0xf000000000000000) {
1603        case 0x0000000000000000:
1604            /* unallocated grain */
1605            if (cluster_sector != 0) {
1606                return VMDK_ERROR;
1607            }
1608            break;
1609        case 0x1000000000000000:
1610            /* scsi-unmapped grain - fallthrough */
1611        case 0x2000000000000000:
1612            /* zero grain */
1613            zeroed = true;
1614            break;
1615        case 0x3000000000000000:
1616            /* allocated grain */
1617            cluster_sector = (((cluster_sector & 0x0fff000000000000) >> 48) |
1618                              ((cluster_sector & 0x0000ffffffffffff) << 12));
1619            cluster_sector = extent->sesparse_clusters_offset +
1620                cluster_sector * extent->cluster_sectors;
1621            break;
1622        default:
1623            return VMDK_ERROR;
1624        }
1625    } else {
1626        cluster_sector = le32_to_cpu(((uint32_t *)l2_table)[l2_index]);
1627
1628        if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
1629            zeroed = true;
1630        }
1631    }
1632
1633    if (!cluster_sector || zeroed) {
1634        if (!allocate) {
1635            return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
1636        }
1637        assert(!extent->sesparse);
1638
1639        if (extent->next_cluster_sector >= VMDK_EXTENT_MAX_SECTORS) {
1640            return VMDK_ERROR;
1641        }
1642
1643        cluster_sector = extent->next_cluster_sector;
1644        extent->next_cluster_sector += extent->cluster_sectors;
1645
1646        /* First of all we write the grain itself, to avoid a race condition
1647         * that may corrupt the image.
1648         * This problem may occur because of insufficient space on the host
1649         * disk or an inappropriate VM shutdown.
1650         */
1651        ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE,
1652                                offset, skip_start_bytes, skip_end_bytes,
1653                                zeroed);
1654        if (ret) {
1655            return ret;
1656        }
1657        if (m_data) {
1658            m_data->new_allocation = true;
1659        }
1660    }
1661    *cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
1662    return VMDK_OK;
1663}
1664
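/*
 * Find the extent covering virtual disk sector @sector_num, starting the
 * search at @start_hint (or at the first extent if the hint is NULL).
 * Extents are kept in ascending sector order, so a hint from a previous
 * lookup avoids rescanning earlier extents. Returns NULL if @sector_num is
 * past the end of the last extent.
 */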
1665static VmdkExtent *find_extent(BDRVVmdkState *s,
1666                                int64_t sector_num, VmdkExtent *start_hint)
1667{
1668    VmdkExtent *extent = start_hint;
1669
1670    if (!extent) {
1671        extent = &s->extents[0];
1672    }
1673    while (extent < &s->extents[s->num_extents]) {
1674        if (sector_num < extent->end_sector) {
1675            return extent;
1676        }
1677        extent++;
1678    }
1679    return NULL;
1680}
1681
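/*
 * Return the offset of @offset (a byte offset into the virtual disk) within
 * its cluster in @extent, i.e. the distance from the start of the containing
 * grain.
 */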
1682static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent,
1683                                                   int64_t offset)
1684{
1685    uint64_t extent_begin_offset, extent_relative_offset;
1686    uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE;
1687
1688    extent_begin_offset =
1689        (extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE;
1690    extent_relative_offset = offset - extent_begin_offset;
1691    return extent_relative_offset % cluster_size;
1692}
1693
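/*
 * Block status callback: translate the VMDK_* result of get_cluster_offset()
 * into BDRV_BLOCK_* flags for at most one cluster, reporting the host offset
 * for uncompressed, allocated grains.
 */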
1694static int coroutine_fn vmdk_co_block_status(BlockDriverState *bs,
1695                                             bool want_zero,
1696                                             int64_t offset, int64_t bytes,
1697                                             int64_t *pnum, int64_t *map,
1698                                             BlockDriverState **file)
1699{
1700    BDRVVmdkState *s = bs->opaque;
1701    int64_t index_in_cluster, n, ret;
1702    uint64_t cluster_offset;
1703    VmdkExtent *extent;
1704
1705    extent = find_extent(s, offset >> BDRV_SECTOR_BITS, NULL);
1706    if (!extent) {
1707        return -EIO;
1708    }
1709    qemu_co_mutex_lock(&s->lock);
1710    ret = get_cluster_offset(bs, extent, NULL, offset, false, &cluster_offset,
1711                             0, 0);
1712    qemu_co_mutex_unlock(&s->lock);
1713
1714    index_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
1715    switch (ret) {
1716    case VMDK_ERROR:
1717        ret = -EIO;
1718        break;
1719    case VMDK_UNALLOC:
1720        ret = 0;
1721        break;
1722    case VMDK_ZEROED:
1723        ret = BDRV_BLOCK_ZERO;
1724        break;
1725    case VMDK_OK:
1726        ret = BDRV_BLOCK_DATA;
1727        if (!extent->compressed) {
1728            ret |= BDRV_BLOCK_OFFSET_VALID;
1729            *map = cluster_offset + index_in_cluster;
1730            if (extent->flat) {
1731                ret |= BDRV_BLOCK_RECURSE;
1732            }
1733        }
1734        *file = extent->file->bs;
1735        break;
1736    }
1737
1738    n = extent->cluster_sectors * BDRV_SECTOR_SIZE - index_in_cluster;
1739    *pnum = MIN(n, bytes);
1740    return ret;
1741}
1742
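/*
 * Write @n_bytes taken from @qiov at @qiov_offset into the data file of
 * @extent at @cluster_offset + @offset_in_cluster. For streamOptimized
 * (compressed) extents only whole-cluster writes (or the final partial
 * cluster of the extent) are accepted; the data is deflated and prefixed
 * with a VmdkGrainMarker whose LBA is derived from the guest @offset.
 * next_cluster_sector is updated to cover the bytes written.
 */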
1743static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
1744                            int64_t offset_in_cluster, QEMUIOVector *qiov,
1745                            uint64_t qiov_offset, uint64_t n_bytes,
1746                            uint64_t offset)
1747{
1748    int ret;
1749    VmdkGrainMarker *data = NULL;
1750    uLongf buf_len;
1751    QEMUIOVector local_qiov;
1752    int64_t write_offset;
1753    int64_t write_end_sector;
1754
1755    if (extent->compressed) {
1756        void *compressed_data;
1757
1758        /* Only whole clusters */
1759        if (offset_in_cluster ||
1760            n_bytes > (extent->cluster_sectors * SECTOR_SIZE) ||
1761            (n_bytes < (extent->cluster_sectors * SECTOR_SIZE) &&
1762             offset + n_bytes != extent->end_sector * SECTOR_SIZE))
1763        {
1764            ret = -EINVAL;
1765            goto out;
1766        }
1767
1768        if (!extent->has_marker) {
1769            ret = -EINVAL;
1770            goto out;
1771        }
1772        buf_len = (extent->cluster_sectors << 9) * 2;
1773        data = g_malloc(buf_len + sizeof(VmdkGrainMarker));
1774
1775        compressed_data = g_malloc(n_bytes);
1776        qemu_iovec_to_buf(qiov, qiov_offset, compressed_data, n_bytes);
1777        ret = compress(data->data, &buf_len, compressed_data, n_bytes);
1778        g_free(compressed_data);
1779
1780        if (ret != Z_OK || buf_len == 0) {
1781            ret = -EINVAL;
1782            goto out;
1783        }
1784
1785        data->lba = cpu_to_le64(offset >> BDRV_SECTOR_BITS);
1786        data->size = cpu_to_le32(buf_len);
1787
1788        n_bytes = buf_len + sizeof(VmdkGrainMarker);
1789        qemu_iovec_init_buf(&local_qiov, data, n_bytes);
1790
1791        BLKDBG_EVENT(extent->file, BLKDBG_WRITE_COMPRESSED);
1792    } else {
1793        qemu_iovec_init(&local_qiov, qiov->niov);
1794        qemu_iovec_concat(&local_qiov, qiov, qiov_offset, n_bytes);
1795
1796        BLKDBG_EVENT(extent->file, BLKDBG_WRITE_AIO);
1797    }
1798
1799    write_offset = cluster_offset + offset_in_cluster;
1800    ret = bdrv_co_pwritev(extent->file, write_offset, n_bytes,
1801                          &local_qiov, 0);
1802
1803    write_end_sector = DIV_ROUND_UP(write_offset + n_bytes, BDRV_SECTOR_SIZE);
1804
1805    if (extent->compressed) {
1806        extent->next_cluster_sector = write_end_sector;
1807    } else {
1808        extent->next_cluster_sector = MAX(extent->next_cluster_sector,
1809                                          write_end_sector);
1810    }
1811
1812    if (ret < 0) {
1813        goto out;
1814    }
1815    ret = 0;
1816 out:
1817    g_free(data);
1818    if (!extent->compressed) {
1819        qemu_iovec_destroy(&local_qiov);
1820    }
1821    return ret;
1822}
1823
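/*
 * Read @bytes from the grain at @cluster_offset into @qiov. Uncompressed
 * extents are read directly; for compressed extents up to two clusters are
 * read, the optional VmdkGrainMarker is parsed and the payload is inflated
 * before copying the requested range out of the decompressed buffer.
 */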
1824static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
1825                            int64_t offset_in_cluster, QEMUIOVector *qiov,
1826                            int bytes)
1827{
1828    int ret;
1829    int cluster_bytes, buf_bytes;
1830    uint8_t *cluster_buf, *compressed_data;
1831    uint8_t *uncomp_buf;
1832    uint32_t data_len;
1833    VmdkGrainMarker *marker;
1834    uLongf buf_len;
1835
1836
1837    if (!extent->compressed) {
1838        BLKDBG_EVENT(extent->file, BLKDBG_READ_AIO);
1839        ret = bdrv_co_preadv(extent->file,
1840                             cluster_offset + offset_in_cluster, bytes,
1841                             qiov, 0);
1842        if (ret < 0) {
1843            return ret;
1844        }
1845        return 0;
1846    }
1847    cluster_bytes = extent->cluster_sectors * 512;
1848    /* Read two clusters in case GrainMarker + compressed data > one cluster */
1849    buf_bytes = cluster_bytes * 2;
1850    cluster_buf = g_malloc(buf_bytes);
1851    uncomp_buf = g_malloc(cluster_bytes);
1852    BLKDBG_EVENT(extent->file, BLKDBG_READ_COMPRESSED);
1853    ret = bdrv_pread(extent->file,
1854                cluster_offset,
1855                cluster_buf, buf_bytes);
1856    if (ret < 0) {
1857        goto out;
1858    }
1859    compressed_data = cluster_buf;
1860    buf_len = cluster_bytes;
1861    data_len = cluster_bytes;
1862    if (extent->has_marker) {
1863        marker = (VmdkGrainMarker *)cluster_buf;
1864        compressed_data = marker->data;
1865        data_len = le32_to_cpu(marker->size);
1866    }
1867    if (!data_len || data_len > buf_bytes) {
1868        ret = -EINVAL;
1869        goto out;
1870    }
1871    ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len);
1872    if (ret != Z_OK) {
1873        ret = -EINVAL;
1874        goto out;
1875
1876    }
1877    if (offset_in_cluster < 0 ||
1878            offset_in_cluster + bytes > buf_len) {
1879        ret = -EINVAL;
1880        goto out;
1881    }
1882    qemu_iovec_from_buf(qiov, 0, uncomp_buf + offset_in_cluster, bytes);
1883    ret = 0;
1884
1885 out:
1886    g_free(uncomp_buf);
1887    g_free(cluster_buf);
1888    return ret;
1889}
1890
1891static int coroutine_fn
1892vmdk_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
1893               QEMUIOVector *qiov, BdrvRequestFlags flags)
1894{
1895    BDRVVmdkState *s = bs->opaque;
1896    int ret;
1897    uint64_t n_bytes, offset_in_cluster;
1898    VmdkExtent *extent = NULL;
1899    QEMUIOVector local_qiov;
1900    uint64_t cluster_offset;
1901    uint64_t bytes_done = 0;
1902
1903    qemu_iovec_init(&local_qiov, qiov->niov);
1904    qemu_co_mutex_lock(&s->lock);
1905
1906    while (bytes > 0) {
1907        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
1908        if (!extent) {
1909            ret = -EIO;
1910            goto fail;
1911        }
1912        ret = get_cluster_offset(bs, extent, NULL,
1913                                 offset, false, &cluster_offset, 0, 0);
1914        offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
1915
1916        n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
1917                             - offset_in_cluster);
1918
1919        if (ret != VMDK_OK) {
1920            /* if not allocated, try to read from the parent image, if it exists */
1921            if (bs->backing && ret != VMDK_ZEROED) {
1922                if (!vmdk_is_cid_valid(bs)) {
1923                    ret = -EINVAL;
1924                    goto fail;
1925                }
1926
1927                qemu_iovec_reset(&local_qiov);
1928                qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
1929
1930                /* qcow2 emits this on bs->file instead of bs->backing */
1931                BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
1932                ret = bdrv_co_preadv(bs->backing, offset, n_bytes,
1933                                     &local_qiov, 0);
1934                if (ret < 0) {
1935                    goto fail;
1936                }
1937            } else {
1938                qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
1939            }
1940        } else {
1941            qemu_iovec_reset(&local_qiov);
1942            qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
1943
1944            ret = vmdk_read_extent(extent, cluster_offset, offset_in_cluster,
1945                                   &local_qiov, n_bytes);
1946            if (ret) {
1947                goto fail;
1948            }
1949        }
1950        bytes -= n_bytes;
1951        offset += n_bytes;
1952        bytes_done += n_bytes;
1953    }
1954
1955    ret = 0;
1956fail:
1957    qemu_co_mutex_unlock(&s->lock);
1958    qemu_iovec_destroy(&local_qiov);
1959
1960    return ret;
1961}
1962
1963/**
1964 * vmdk_pwritev:
1965 * @zeroed:       buf is ignored (data is zero); use the zeroed_grain GTE
1966 *                feature if possible, otherwise return -ENOTSUP.
1967 * @zero_dry_run: used for zeroed == true only; don't update the L2 table, just
1968 *                try with each cluster. With a dry run we can find out whether
1969 *                the zero write is possible without modifying image data.
1970 *
1971 * Returns: error code, 0 for success.
1972 */
1973static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
1974                       uint64_t bytes, QEMUIOVector *qiov,
1975                       bool zeroed, bool zero_dry_run)
1976{
1977    BDRVVmdkState *s = bs->opaque;
1978    VmdkExtent *extent = NULL;
1979    int ret;
1980    int64_t offset_in_cluster, n_bytes;
1981    uint64_t cluster_offset;
1982    uint64_t bytes_done = 0;
1983    VmdkMetaData m_data;
1984
1985    if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) {
1986        error_report("Wrong offset: offset=0x%" PRIx64
1987                     " total_sectors=0x%" PRIx64,
1988                     offset, bs->total_sectors);
1989        return -EIO;
1990    }
1991
1992    while (bytes > 0) {
1993        extent = find_extent(s, offset >> BDRV_SECTOR_BITS, extent);
1994        if (!extent) {
1995            return -EIO;
1996        }
1997        if (extent->sesparse) {
1998            return -ENOTSUP;
1999        }
2000        offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
2001        n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
2002                             - offset_in_cluster);
2003
2004        ret = get_cluster_offset(bs, extent, &m_data, offset,
2005                                 !(extent->compressed || zeroed),
2006                                 &cluster_offset, offset_in_cluster,
2007                                 offset_in_cluster + n_bytes);
2008        if (extent->compressed) {
2009            if (ret == VMDK_OK) {
2010                /* Refuse write to allocated cluster for streamOptimized */
2011                error_report("Could not write to allocated cluster"
2012                              " for streamOptimized");
2013                return -EIO;
2014            } else if (!zeroed) {
2015                /* allocate */
2016                ret = get_cluster_offset(bs, extent, &m_data, offset,
2017                                         true, &cluster_offset, 0, 0);
2018            }
2019        }
2020        if (ret == VMDK_ERROR) {
2021            return -EINVAL;
2022        }
2023        if (zeroed) {
2024            /* Do zeroed write, buf is ignored */
2025            if (extent->has_zero_grain &&
2026                    offset_in_cluster == 0 &&
2027                    n_bytes >= extent->cluster_sectors * BDRV_SECTOR_SIZE) {
2028                n_bytes = extent->cluster_sectors * BDRV_SECTOR_SIZE;
2029                if (!zero_dry_run && ret != VMDK_ZEROED) {
2030                    /* update L2 tables */
2031                    if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
2032                            != VMDK_OK) {
2033                        return -EIO;
2034                    }
2035                }
2036            } else {
2037                return -ENOTSUP;
2038            }
2039        } else {
2040            ret = vmdk_write_extent(extent, cluster_offset, offset_in_cluster,
2041                                    qiov, bytes_done, n_bytes, offset);
2042            if (ret) {
2043                return ret;
2044            }
2045            if (m_data.new_allocation) {
2046                /* update L2 tables */
2047                if (vmdk_L2update(extent, &m_data,
2048                                  cluster_offset >> BDRV_SECTOR_BITS)
2049                        != VMDK_OK) {
2050                    return -EIO;
2051                }
2052            }
2053        }
2054        bytes -= n_bytes;
2055        offset += n_bytes;
2056        bytes_done += n_bytes;
2057
2058        /* update CID on the first write every time the virtual disk is
2059         * opened */
2060        if (!s->cid_updated) {
2061            ret = vmdk_write_cid(bs, g_random_int());
2062            if (ret < 0) {
2063                return ret;
2064            }
2065            s->cid_updated = true;
2066        }
2067    }
2068    return 0;
2069}
2070
2071static int coroutine_fn
2072vmdk_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
2073                QEMUIOVector *qiov, BdrvRequestFlags flags)
2074{
2075    int ret;
2076    BDRVVmdkState *s = bs->opaque;
2077    qemu_co_mutex_lock(&s->lock);
2078    ret = vmdk_pwritev(bs, offset, bytes, qiov, false, false);
2079    qemu_co_mutex_unlock(&s->lock);
2080    return ret;
2081}
2082
2083static int coroutine_fn
2084vmdk_co_pwritev_compressed(BlockDriverState *bs, int64_t offset, int64_t bytes,
2085                           QEMUIOVector *qiov)
2086{
2087    if (bytes == 0) {
2088        /* The caller will write bytes == 0 to signal EOF.
2089         * When we receive it, we align EOF to a sector boundary. */
2090        BDRVVmdkState *s = bs->opaque;
2091        int i, ret;
2092        int64_t length;
2093
2094        for (i = 0; i < s->num_extents; i++) {
2095            length = bdrv_getlength(s->extents[i].file->bs);
2096            if (length < 0) {
2097                return length;
2098            }
2099            length = QEMU_ALIGN_UP(length, BDRV_SECTOR_SIZE);
2100            ret = bdrv_truncate(s->extents[i].file, length, false,
2101                                PREALLOC_MODE_OFF, 0, NULL);
2102            if (ret < 0) {
2103                return ret;
2104            }
2105        }
2106        return 0;
2107    }
2108    return vmdk_co_pwritev(bs, offset, bytes, qiov, 0);
2109}
2110
2111static int coroutine_fn vmdk_co_pwrite_zeroes(BlockDriverState *bs,
2112                                              int64_t offset,
2113                                              int64_t bytes,
2114                                              BdrvRequestFlags flags)
2115{
2116    int ret;
2117    BDRVVmdkState *s = bs->opaque;
2118
2119    qemu_co_mutex_lock(&s->lock);
2120    /* write zeroes could fail if the sectors are not aligned to a cluster;
2121     * test it with dry_run == true before really updating the image */
2122    ret = vmdk_pwritev(bs, offset, bytes, NULL, true, true);
2123    if (!ret) {
2124        ret = vmdk_pwritev(bs, offset, bytes, NULL, true, false);
2125    }
2126    qemu_co_mutex_unlock(&s->lock);
2127    return ret;
2128}
2129
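/*
 * Initialize one freshly created extent file behind @blk. Flat extents are
 * simply truncated to @filesize; sparse extents get a VMDK4 header, the file
 * is resized up to grain_offset, and both the redundant and the primary grain
 * directories are pre-filled with the offsets of their (still empty) grain
 * tables.
 */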
2130static int vmdk_init_extent(BlockBackend *blk,
2131                            int64_t filesize, bool flat,
2132                            bool compress, bool zeroed_grain,
2133                            Error **errp)
2134{
2135    int ret, i;
2136    VMDK4Header header;
2137    uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
2138    uint32_t *gd_buf = NULL;
2139    int gd_buf_size;
2140
2141    if (flat) {
2142        ret = blk_truncate(blk, filesize, false, PREALLOC_MODE_OFF, 0, errp);
2143        goto exit;
2144    }
2145    magic = cpu_to_be32(VMDK4_MAGIC);
2146    memset(&header, 0, sizeof(header));
2147    if (compress) {
2148        header.version = 3;
2149    } else if (zeroed_grain) {
2150        header.version = 2;
2151    } else {
2152        header.version = 1;
2153    }
2154    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
2155                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
2156                   | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
2157    header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
2158    header.capacity = filesize / BDRV_SECTOR_SIZE;
2159    header.granularity = 128;
2160    header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
2161
2162    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
2163    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
2164                           BDRV_SECTOR_SIZE);
2165    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
2166    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
2167
2168    header.desc_offset = 1;
2169    header.desc_size = 20;
2170    header.rgd_offset = header.desc_offset + header.desc_size;
2171    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
2172    header.grain_offset =
2173        ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
2174                 header.granularity);
2175    /* swap endianness for all header fields */
2176    header.version = cpu_to_le32(header.version);
2177    header.flags = cpu_to_le32(header.flags);
2178    header.capacity = cpu_to_le64(header.capacity);
2179    header.granularity = cpu_to_le64(header.granularity);
2180    header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt);
2181    header.desc_offset = cpu_to_le64(header.desc_offset);
2182    header.desc_size = cpu_to_le64(header.desc_size);
2183    header.rgd_offset = cpu_to_le64(header.rgd_offset);
2184    header.gd_offset = cpu_to_le64(header.gd_offset);
2185    header.grain_offset = cpu_to_le64(header.grain_offset);
2186    header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm);
2187
2188    header.check_bytes[0] = 0xa;
2189    header.check_bytes[1] = 0x20;
2190    header.check_bytes[2] = 0xd;
2191    header.check_bytes[3] = 0xa;
2192
2193    /* write all the data */
2194    ret = blk_pwrite(blk, 0, &magic, sizeof(magic), 0);
2195    if (ret < 0) {
2196        error_setg(errp, QERR_IO_ERROR);
2197        goto exit;
2198    }
2199    ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header), 0);
2200    if (ret < 0) {
2201        error_setg(errp, QERR_IO_ERROR);
2202        goto exit;
2203    }
2204
2205    ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9, false,
2206                       PREALLOC_MODE_OFF, 0, errp);
2207    if (ret < 0) {
2208        goto exit;
2209    }
2210
2211    /* write grain directory */
2212    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
2213    gd_buf = g_malloc0(gd_buf_size);
2214    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
2215         i < gt_count; i++, tmp += gt_size) {
2216        gd_buf[i] = cpu_to_le32(tmp);
2217    }
2218    ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
2219                     gd_buf, gd_buf_size, 0);
2220    if (ret < 0) {
2221        error_setg(errp, QERR_IO_ERROR);
2222        goto exit;
2223    }
2224
2225    /* write backup grain directory */
2226    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
2227         i < gt_count; i++, tmp += gt_size) {
2228        gd_buf[i] = cpu_to_le32(tmp);
2229    }
2230    ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
2231                     gd_buf, gd_buf_size, 0);
2232    if (ret < 0) {
2233        error_setg(errp, QERR_IO_ERROR);
2234    }
2235
2236    ret = 0;
2237exit:
2238    g_free(gd_buf);
2239    return ret;
2240}
2241
2242static int vmdk_create_extent(const char *filename, int64_t filesize,
2243                              bool flat, bool compress, bool zeroed_grain,
2244                              BlockBackend **pbb,
2245                              QemuOpts *opts, Error **errp)
2246{
2247    int ret;
2248    BlockBackend *blk = NULL;
2249
2250    ret = bdrv_create_file(filename, opts, errp);
2251    if (ret < 0) {
2252        goto exit;
2253    }
2254
2255    blk = blk_new_open(filename, NULL, NULL,
2256                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
2257                       errp);
2258    if (blk == NULL) {
2259        ret = -EIO;
2260        goto exit;
2261    }
2262
2263    blk_set_allow_write_beyond_eof(blk, true);
2264
2265    ret = vmdk_init_extent(blk, filesize, flat, compress, zeroed_grain, errp);
2266exit:
2267    if (blk) {
2268        if (pbb) {
2269            *pbb = blk;
2270        } else {
2271            blk_unref(blk);
2272            blk = NULL;
2273        }
2274    }
2275    return ret;
2276}
2277
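/*
 * Split @filename into directory @path (including the trailing separator),
 * base name @prefix and extension @postfix (including the leading dot), each
 * bounded by @buf_len. For example, "images/disk.vmdk" yields path "images/",
 * prefix "disk" and postfix ".vmdk".
 */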
2278static int filename_decompose(const char *filename, char *path, char *prefix,
2279                              char *postfix, size_t buf_len, Error **errp)
2280{
2281    const char *p, *q;
2282
2283    if (filename == NULL || !strlen(filename)) {
2284        error_setg(errp, "No filename provided");
2285        return VMDK_ERROR;
2286    }
2287    p = strrchr(filename, '/');
2288    if (p == NULL) {
2289        p = strrchr(filename, '\\');
2290    }
2291    if (p == NULL) {
2292        p = strrchr(filename, ':');
2293    }
2294    if (p != NULL) {
2295        p++;
2296        if (p - filename >= buf_len) {
2297            return VMDK_ERROR;
2298        }
2299        pstrcpy(path, p - filename + 1, filename);
2300    } else {
2301        p = filename;
2302        path[0] = '\0';
2303    }
2304    q = strrchr(p, '.');
2305    if (q == NULL) {
2306        pstrcpy(prefix, buf_len, p);
2307        postfix[0] = '\0';
2308    } else {
2309        if (q - p >= buf_len) {
2310            return VMDK_ERROR;
2311        }
2312        pstrcpy(prefix, q - p + 1, p);
2313        pstrcpy(postfix, buf_len, q);
2314    }
2315    return VMDK_OK;
2316}
2317
2318/*
2319 * idx == 0: get or create the descriptor file (also the image file if in a
2320 *           non-split format).
2321 * idx >= 1: get the n-th extent if in a split subformat.
2322 */
2323typedef BlockBackend *(*vmdk_create_extent_fn)(int64_t size,
2324                                               int idx,
2325                                               bool flat,
2326                                               bool split,
2327                                               bool compress,
2328                                               bool zeroed_grain,
2329                                               void *opaque,
2330                                               Error **errp);
2331
2332static void vmdk_desc_add_extent(GString *desc,
2333                                 const char *extent_line_fmt,
2334                                 int64_t size, const char *filename)
2335{
2336    char *basename = g_path_get_basename(filename);
2337
2338    g_string_append_printf(desc, extent_line_fmt,
2339                           DIV_ROUND_UP(size, BDRV_SECTOR_SIZE), basename);
2340    g_free(basename);
2341}
2342
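/*
 * Shared creation logic for both the QAPI and the qemu-img create paths:
 * obtain the descriptor file and every extent through @extent_fn, then
 * generate the text descriptor (CID, optional parent hint, extent lines and
 * DDB geometry) and write it at offset 0, or at 0x200 for non-split sparse
 * images where the descriptor is embedded in the extent file.
 */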
2343static int coroutine_fn vmdk_co_do_create(int64_t size,
2344                                          BlockdevVmdkSubformat subformat,
2345                                          BlockdevVmdkAdapterType adapter_type,
2346                                          const char *backing_file,
2347                                          const char *hw_version,
2348                                          const char *toolsversion,
2349                                          bool compat6,
2350                                          bool zeroed_grain,
2351                                          vmdk_create_extent_fn extent_fn,
2352                                          void *opaque,
2353                                          Error **errp)
2354{
2355    int extent_idx;
2356    BlockBackend *blk = NULL;
2357    BlockBackend *extent_blk;
2358    Error *local_err = NULL;
2359    char *desc = NULL;
2360    int ret = 0;
2361    bool flat, split, compress;
2362    GString *ext_desc_lines;
2363    const int64_t split_size = 0x80000000;  /* VMDK has constant split size */
2364    int64_t extent_size;
2365    int64_t created_size = 0;
2366    const char *extent_line_fmt;
2367    char *parent_desc_line = g_malloc0(BUF_SIZE);
2368    uint32_t parent_cid = 0xffffffff;
2369    uint32_t number_heads = 16;
2370    uint32_t desc_offset = 0, desc_len;
2371    const char desc_template[] =
2372        "# Disk DescriptorFile\n"
2373        "version=1\n"
2374        "CID=%" PRIx32 "\n"
2375        "parentCID=%" PRIx32 "\n"
2376        "createType=\"%s\"\n"
2377        "%s"
2378        "\n"
2379        "# Extent description\n"
2380        "%s"
2381        "\n"
2382        "# The Disk Data Base\n"
2383        "#DDB\n"
2384        "\n"
2385        "ddb.virtualHWVersion = \"%s\"\n"
2386        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
2387        "ddb.geometry.heads = \"%" PRIu32 "\"\n"
2388        "ddb.geometry.sectors = \"63\"\n"
2389        "ddb.adapterType = \"%s\"\n"
2390        "ddb.toolsVersion = \"%s\"\n";
2391
2392    ext_desc_lines = g_string_new(NULL);
2393
2394    /* Read out options */
2395    if (compat6) {
2396        if (hw_version) {
2397            error_setg(errp,
2398                       "compat6 cannot be enabled with hwversion set");
2399            ret = -EINVAL;
2400            goto exit;
2401        }
2402        hw_version = "6";
2403    }
2404    if (!hw_version) {
2405        hw_version = "4";
2406    }
2407    if (!toolsversion) {
2408        toolsversion = "2147483647";
2409    }
2410
2411    if (adapter_type != BLOCKDEV_VMDK_ADAPTER_TYPE_IDE) {
2412        /* that's the number of heads with which VMware operates when
2413           creating, exporting, etc. VMDK files with a non-IDE adapter type */
2414        number_heads = 255;
2415    }
2416    split = (subformat == BLOCKDEV_VMDK_SUBFORMAT_TWOGBMAXEXTENTFLAT) ||
2417            (subformat == BLOCKDEV_VMDK_SUBFORMAT_TWOGBMAXEXTENTSPARSE);
2418    flat = (subformat == BLOCKDEV_VMDK_SUBFORMAT_MONOLITHICFLAT) ||
2419           (subformat == BLOCKDEV_VMDK_SUBFORMAT_TWOGBMAXEXTENTFLAT);
2420    compress = subformat == BLOCKDEV_VMDK_SUBFORMAT_STREAMOPTIMIZED;
2421
2422    if (flat) {
2423        extent_line_fmt = "RW %" PRId64 " FLAT \"%s\" 0\n";
2424    } else {
2425        extent_line_fmt = "RW %" PRId64 " SPARSE \"%s\"\n";
2426    }
2427    if (flat && backing_file) {
2428        error_setg(errp, "Flat image can't have backing file");
2429        ret = -ENOTSUP;
2430        goto exit;
2431    }
2432    if (flat && zeroed_grain) {
2433        error_setg(errp, "Flat image can't enable zeroed grain");
2434        ret = -ENOTSUP;
2435        goto exit;
2436    }
2437
2438    /* Create extents */
2439    if (split) {
2440        extent_size = split_size;
2441    } else {
2442        extent_size = size;
2443    }
2444    if (!split && !flat) {
2445        created_size = extent_size;
2446    } else {
2447        created_size = 0;
2448    }
2449    /* Get the descriptor file BDS */
2450    blk = extent_fn(created_size, 0, flat, split, compress, zeroed_grain,
2451                    opaque, errp);
2452    if (!blk) {
2453        ret = -EIO;
2454        goto exit;
2455    }
2456    if (!split && !flat) {
2457        vmdk_desc_add_extent(ext_desc_lines, extent_line_fmt, created_size,
2458                             blk_bs(blk)->filename);
2459    }
2460
2461    if (backing_file) {
2462        BlockBackend *backing;
2463        char *full_backing =
2464            bdrv_get_full_backing_filename_from_filename(blk_bs(blk)->filename,
2465                                                         backing_file,
2466                                                         &local_err);
2467        if (local_err) {
2468            error_propagate(errp, local_err);
2469            ret = -ENOENT;
2470            goto exit;
2471        }
2472        assert(full_backing);
2473
2474        backing = blk_new_open(full_backing, NULL, NULL,
2475                               BDRV_O_NO_BACKING, errp);
2476        g_free(full_backing);
2477        if (backing == NULL) {
2478            ret = -EIO;
2479            goto exit;
2480        }
2481        if (strcmp(blk_bs(backing)->drv->format_name, "vmdk")) {
2482            error_setg(errp, "Invalid backing file format: %s. Must be vmdk",
2483                       blk_bs(backing)->drv->format_name);
2484            blk_unref(backing);
2485            ret = -EINVAL;
2486            goto exit;
2487        }
2488        ret = vmdk_read_cid(blk_bs(backing), 0, &parent_cid);
2489        blk_unref(backing);
2490        if (ret) {
2491            error_setg(errp, "Failed to read parent CID");
2492            goto exit;
2493        }
2494        snprintf(parent_desc_line, BUF_SIZE,
2495                "parentFileNameHint=\"%s\"", backing_file);
2496    }
2497    extent_idx = 1;
2498    while (created_size < size) {
2499        int64_t cur_size = MIN(size - created_size, extent_size);
2500        extent_blk = extent_fn(cur_size, extent_idx, flat, split, compress,
2501                               zeroed_grain, opaque, errp);
2502        if (!extent_blk) {
2503            ret = -EINVAL;
2504            goto exit;
2505        }
2506        vmdk_desc_add_extent(ext_desc_lines, extent_line_fmt, cur_size,
2507                             blk_bs(extent_blk)->filename);
2508        created_size += cur_size;
2509        extent_idx++;
2510        blk_unref(extent_blk);
2511    }
2512
2513    /* Check whether we got excess extents */
2514    extent_blk = extent_fn(-1, extent_idx, flat, split, compress, zeroed_grain,
2515                           opaque, NULL);
2516    if (extent_blk) {
2517        blk_unref(extent_blk);
2518        error_setg(errp, "List of extents contains unused extents");
2519        ret = -EINVAL;
2520        goto exit;
2521    }
2522
2523    /* generate descriptor file */
2524    desc = g_strdup_printf(desc_template,
2525                           g_random_int(),
2526                           parent_cid,
2527                           BlockdevVmdkSubformat_str(subformat),
2528                           parent_desc_line,
2529                           ext_desc_lines->str,
2530                           hw_version,
2531                           size /
2532                               (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE),
2533                           number_heads,
2534                           BlockdevVmdkAdapterType_str(adapter_type),
2535                           toolsversion);
2536    desc_len = strlen(desc);
2537    /* the descriptor offset = 0x200 */
2538    if (!split && !flat) {
2539        desc_offset = 0x200;
2540    }
2541
2542    ret = blk_pwrite(blk, desc_offset, desc, desc_len, 0);
2543    if (ret < 0) {
2544        error_setg_errno(errp, -ret, "Could not write description");
2545        goto exit;
2546    }
2547    /* bdrv_pwrite writes padding zeros to align to the sector size; we don't
2548     * need that for the description file */
2549    if (desc_offset == 0) {
2550        ret = blk_truncate(blk, desc_len, false, PREALLOC_MODE_OFF, 0, errp);
2551        if (ret < 0) {
2552            goto exit;
2553        }
2554    }
2555    ret = 0;
2556exit:
2557    if (blk) {
2558        blk_unref(blk);
2559    }
2560    g_free(desc);
2561    g_free(parent_desc_line);
2562    g_string_free(ext_desc_lines, true);
2563    return ret;
2564}
2565
2566typedef struct {
2567    char *path;
2568    char *prefix;
2569    char *postfix;
2570    QemuOpts *opts;
2571} VMDKCreateOptsData;
2572
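/*
 * Extent callback used by the qemu-img create path: derive each extent file
 * name from the target name (e.g. "disk.vmdk" for the descriptor,
 * "disk-s001.vmdk"/"disk-f001.vmdk" for split sparse/flat extents and
 * "disk-flat.vmdk" for a single flat extent), then create and initialize it
 * with vmdk_create_extent().
 */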
2573static BlockBackend *vmdk_co_create_opts_cb(int64_t size, int idx,
2574                                            bool flat, bool split, bool compress,
2575                                            bool zeroed_grain, void *opaque,
2576                                            Error **errp)
2577{
2578    BlockBackend *blk = NULL;
2579    BlockDriverState *bs = NULL;
2580    VMDKCreateOptsData *data = opaque;
2581    char *ext_filename = NULL;
2582    char *rel_filename = NULL;
2583
2584    /* We're done, don't create excess extents. */
2585    if (size == -1) {
2586        assert(errp == NULL);
2587        return NULL;
2588    }
2589
2590    if (idx == 0) {
2591        rel_filename = g_strdup_printf("%s%s", data->prefix, data->postfix);
2592    } else if (split) {
2593        rel_filename = g_strdup_printf("%s-%c%03d%s",
2594                                       data->prefix,
2595                                       flat ? 'f' : 's', idx, data->postfix);
2596    } else {
2597        assert(idx == 1);
2598        rel_filename = g_strdup_printf("%s-flat%s", data->prefix, data->postfix);
2599    }
2600
2601    ext_filename = g_strdup_printf("%s%s", data->path, rel_filename);
2602    g_free(rel_filename);
2603
2604    if (vmdk_create_extent(ext_filename, size,
2605                           flat, compress, zeroed_grain, &blk, data->opts,
2606                           errp)) {
2607        goto exit;
2608    }
2609    bdrv_unref(bs);
2610exit:
2611    g_free(ext_filename);
2612    return blk;
2613}
2614
2615static int coroutine_fn vmdk_co_create_opts(BlockDriver *drv,
2616                                            const char *filename,
2617                                            QemuOpts *opts,
2618                                            Error **errp)
2619{
2620    Error *local_err = NULL;
2621    char *desc = NULL;
2622    int64_t total_size = 0;
2623    char *adapter_type = NULL;
2624    BlockdevVmdkAdapterType adapter_type_enum;
2625    char *backing_file = NULL;
2626    char *hw_version = NULL;
2627    char *toolsversion = NULL;
2628    char *fmt = NULL;
2629    BlockdevVmdkSubformat subformat;
2630    int ret = 0;
2631    char *path = g_malloc0(PATH_MAX);
2632    char *prefix = g_malloc0(PATH_MAX);
2633    char *postfix = g_malloc0(PATH_MAX);
2634    char *desc_line = g_malloc0(BUF_SIZE);
2635    char *ext_filename = g_malloc0(PATH_MAX);
2636    char *desc_filename = g_malloc0(PATH_MAX);
2637    char *parent_desc_line = g_malloc0(BUF_SIZE);
2638    bool zeroed_grain;
2639    bool compat6;
2640    VMDKCreateOptsData data;
2641    char *backing_fmt = NULL;
2642
2643    backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT);
2644    if (backing_fmt && strcmp(backing_fmt, "vmdk") != 0) {
2645        error_setg(errp, "backing_file must be a vmdk image");
2646        ret = -EINVAL;
2647        goto exit;
2648    }
2649
2650    if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) {
2651        ret = -EINVAL;
2652        goto exit;
2653    }
2654    /* Read out options */
2655    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2656                          BDRV_SECTOR_SIZE);
2657    adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE);
2658    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
2659    hw_version = qemu_opt_get_del(opts, BLOCK_OPT_HWVERSION);
2660    toolsversion = qemu_opt_get_del(opts, BLOCK_OPT_TOOLSVERSION);
2661    compat6 = qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false);
2662    if (strcmp(hw_version, "undefined") == 0) {
2663        g_free(hw_version);
2664        hw_version = NULL;
2665    }
2666    fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
2667    zeroed_grain = qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false);
2668
2669    if (adapter_type) {
2670        adapter_type_enum = qapi_enum_parse(&BlockdevVmdkAdapterType_lookup,
2671                                            adapter_type,
2672                                            BLOCKDEV_VMDK_ADAPTER_TYPE_IDE,
2673                                            &local_err);
2674        if (local_err) {
2675            error_propagate(errp, local_err);
2676            ret = -EINVAL;
2677            goto exit;
2678        }
2679    } else {
2680        adapter_type_enum = BLOCKDEV_VMDK_ADAPTER_TYPE_IDE;
2681    }
2682
2683    if (!fmt) {
2684        /* Default format to monolithicSparse */
2685        subformat = BLOCKDEV_VMDK_SUBFORMAT_MONOLITHICSPARSE;
2686    } else {
2687        subformat = qapi_enum_parse(&BlockdevVmdkSubformat_lookup,
2688                                    fmt,
2689                                    BLOCKDEV_VMDK_SUBFORMAT_MONOLITHICSPARSE,
2690                                    &local_err);
2691        if (local_err) {
2692            error_propagate(errp, local_err);
2693            ret = -EINVAL;
2694            goto exit;
2695        }
2696    }
2697    data = (VMDKCreateOptsData){
2698        .prefix = prefix,
2699        .postfix = postfix,
2700        .path = path,
2701        .opts = opts,
2702    };
2703    ret = vmdk_co_do_create(total_size, subformat, adapter_type_enum,
2704                            backing_file, hw_version, toolsversion, compat6,
2705                            zeroed_grain, vmdk_co_create_opts_cb, &data, errp);
2706
2707exit:
2708    g_free(backing_fmt);
2709    g_free(adapter_type);
2710    g_free(backing_file);
2711    g_free(hw_version);
2712    g_free(toolsversion);
2713    g_free(fmt);
2714    g_free(desc);
2715    g_free(path);
2716    g_free(prefix);
2717    g_free(postfix);
2718    g_free(desc_line);
2719    g_free(ext_filename);
2720    g_free(desc_filename);
2721    g_free(parent_desc_line);
2722    return ret;
2723}
2724
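/*
 * Extent callback used by blockdev-create: idx 0 resolves the 'file'
 * blockdev reference, idx >= 1 the matching entry of the 'extents' list.
 * The node is wrapped in a BlockBackend and, unless size is -1 (the probe
 * for excess extents), initialized with vmdk_init_extent().
 */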
2725static BlockBackend *vmdk_co_create_cb(int64_t size, int idx,
2726                                       bool flat, bool split, bool compress,
2727                                       bool zeroed_grain, void *opaque,
2728                                       Error **errp)
2729{
2730    int ret;
2731    BlockDriverState *bs;
2732    BlockBackend *blk;
2733    BlockdevCreateOptionsVmdk *opts = opaque;
2734
2735    if (idx == 0) {
2736        bs = bdrv_open_blockdev_ref(opts->file, errp);
2737    } else {
2738        int i;
2739        BlockdevRefList *list = opts->extents;
2740        for (i = 1; i < idx; i++) {
2741            if (!list || !list->next) {
2742                error_setg(errp, "Extent [%d] not specified", i);
2743                return NULL;
2744            }
2745            list = list->next;
2746        }
2747        if (!list) {
2748            error_setg(errp, "Extent [%d] not specified", idx - 1);
2749            return NULL;
2750        }
2751        bs = bdrv_open_blockdev_ref(list->value, errp);
2752    }
2753    if (!bs) {
2754        return NULL;
2755    }
2756    blk = blk_new_with_bs(bs,
2757                          BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | BLK_PERM_RESIZE,
2758                          BLK_PERM_ALL, errp);
2759    if (!blk) {
2760        return NULL;
2761    }
2762    blk_set_allow_write_beyond_eof(blk, true);
2763    bdrv_unref(bs);
2764
2765    if (size != -1) {
2766        ret = vmdk_init_extent(blk, size, flat, compress, zeroed_grain, errp);
2767        if (ret) {
2768            blk_unref(blk);
2769            blk = NULL;
2770        }
2771    }
2772    return blk;
2773}
2774
2775static int coroutine_fn vmdk_co_create(BlockdevCreateOptions *create_options,
2776                                       Error **errp)
2777{
2778    int ret;
2779    BlockdevCreateOptionsVmdk *opts;
2780
2781    opts = &create_options->u.vmdk;
2782
2783    /* Validate options */
2784    if (!QEMU_IS_ALIGNED(opts->size, BDRV_SECTOR_SIZE)) {
2785        error_setg(errp, "Image size must be a multiple of 512 bytes");
2786        ret = -EINVAL;
2787        goto out;
2788    }
2789
2790    ret = vmdk_co_do_create(opts->size,
2791                            opts->subformat,
2792                            opts->adapter_type,
2793                            opts->backing_file,
2794                            opts->hwversion,
2795                            opts->toolsversion,
2796                            false,
2797                            opts->zeroed_grain,
2798                            vmdk_co_create_cb,
2799                            opts, errp);
2800    return ret;
2801
2802out:
2803    return ret;
2804}
2805
2806static void vmdk_close(BlockDriverState *bs)
2807{
2808    BDRVVmdkState *s = bs->opaque;
2809
2810    vmdk_free_extents(bs);
2811    g_free(s->create_type);
2812
2813    migrate_del_blocker(s->migration_blocker);
2814    error_free(s->migration_blocker);
2815}
2816
2817static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs)
2818{
2819    int i;
2820    int64_t ret = 0;
2821    int64_t r;
2822    BDRVVmdkState *s = bs->opaque;
2823
2824    ret = bdrv_get_allocated_file_size(bs->file->bs);
2825    if (ret < 0) {
2826        return ret;
2827    }
2828    for (i = 0; i < s->num_extents; i++) {
2829        if (s->extents[i].file == bs->file) {
2830            continue;
2831        }
2832        r = bdrv_get_allocated_file_size(s->extents[i].file->bs);
2833        if (r < 0) {
2834            return r;
2835        }
2836        ret += r;
2837    }
2838    return ret;
2839}
2840
2841static int vmdk_has_zero_init(BlockDriverState *bs)
2842{
2843    int i;
2844    BDRVVmdkState *s = bs->opaque;
2845
2846    /* If there is a flat extent and its underlying storage doesn't have
2847     * zero init, return 0. */
2848    for (i = 0; i < s->num_extents; i++) {
2849        if (s->extents[i].flat) {
2850            if (!bdrv_has_zero_init(s->extents[i].file->bs)) {
2851                return 0;
2852            }
2853        }
2854    }
2855    return 1;
2856}
2857
2858static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent)
2859{
2860    ImageInfo *info = g_new0(ImageInfo, 1);
2861
2862    bdrv_refresh_filename(extent->file->bs);
2863    *info = (ImageInfo){
2864        .filename         = g_strdup(extent->file->bs->filename),
2865        .format           = g_strdup(extent->type),
2866        .virtual_size     = extent->sectors * BDRV_SECTOR_SIZE,
2867        .compressed       = extent->compressed,
2868        .has_compressed   = extent->compressed,
2869        .cluster_size     = extent->cluster_sectors * BDRV_SECTOR_SIZE,
2870        .has_cluster_size = !extent->flat,
2871    };
2872
2873    return info;
2874}
2875
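/*
 * Consistency check: walk the whole virtual disk cluster by cluster and
 * verify that every allocated grain points inside its extent file. Repair
 * (fix != 0) is not supported.
 */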
2876static int coroutine_fn vmdk_co_check(BlockDriverState *bs,
2877                                      BdrvCheckResult *result,
2878                                      BdrvCheckMode fix)
2879{
2880    BDRVVmdkState *s = bs->opaque;
2881    VmdkExtent *extent = NULL;
2882    int64_t sector_num = 0;
2883    int64_t total_sectors = bdrv_nb_sectors(bs);
2884    int ret;
2885    uint64_t cluster_offset;
2886
2887    if (fix) {
2888        return -ENOTSUP;
2889    }
2890
2891    for (;;) {
2892        if (sector_num >= total_sectors) {
2893            return 0;
2894        }
2895        extent = find_extent(s, sector_num, extent);
2896        if (!extent) {
2897            fprintf(stderr,
2898                    "ERROR: could not find extent for sector %" PRId64 "\n",
2899                    sector_num);
2900            ret = -EINVAL;
2901            break;
2902        }
2903        ret = get_cluster_offset(bs, extent, NULL,
2904                                 sector_num << BDRV_SECTOR_BITS,
2905                                 false, &cluster_offset, 0, 0);
2906        if (ret == VMDK_ERROR) {
2907            fprintf(stderr,
2908                    "ERROR: could not get cluster_offset for sector %"
2909                    PRId64 "\n", sector_num);
2910            break;
2911        }
2912        if (ret == VMDK_OK) {
2913            int64_t extent_len = bdrv_getlength(extent->file->bs);
2914            if (extent_len < 0) {
2915                fprintf(stderr,
2916                        "ERROR: could not get extent file length for sector %"
2917                        PRId64 "\n", sector_num);
2918                ret = extent_len;
2919                break;
2920            }
2921            if (cluster_offset >= extent_len) {
2922                fprintf(stderr,
2923                        "ERROR: cluster offset for sector %"
2924                        PRId64 " points after EOF\n", sector_num);
2925                ret = -EINVAL;
2926                break;
2927            }
2928        }
2929        sector_num += extent->cluster_sectors;
2930    }
2931
2932    result->corruptions++;
2933    return ret;
2934}
2935
2936static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs,
2937                                                 Error **errp)
2938{
2939    int i;
2940    BDRVVmdkState *s = bs->opaque;
2941    ImageInfoSpecific *spec_info = g_new0(ImageInfoSpecific, 1);
2942    ImageInfoList **tail;
2943
2944    *spec_info = (ImageInfoSpecific){
2945        .type = IMAGE_INFO_SPECIFIC_KIND_VMDK,
2946        .u = {
2947            .vmdk.data = g_new0(ImageInfoSpecificVmdk, 1),
2948        },
2949    };
2950
2951    *spec_info->u.vmdk.data = (ImageInfoSpecificVmdk) {
2952        .create_type = g_strdup(s->create_type),
2953        .cid = s->cid,
2954        .parent_cid = s->parent_cid,
2955    };
2956
2957    tail = &spec_info->u.vmdk.data->extents;
2958    for (i = 0; i < s->num_extents; i++) {
2959        QAPI_LIST_APPEND(tail, vmdk_get_extent_info(&s->extents[i]));
2960    }
2961
2962    return spec_info;
2963}
2964
2965static bool vmdk_extents_type_eq(const VmdkExtent *a, const VmdkExtent *b)
2966{
2967    return a->flat == b->flat &&
2968           a->compressed == b->compressed &&
2969           (a->flat || a->cluster_sectors == b->cluster_sectors);
2970}
2971
2972static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2973{
2974    int i;
2975    BDRVVmdkState *s = bs->opaque;
2976    assert(s->num_extents);
2977
2978    /* See if we have multiple extents but of different types */
2979    for (i = 1; i < s->num_extents; i++) {
2980        if (!vmdk_extents_type_eq(&s->extents[0], &s->extents[i])) {
2981            return -ENOTSUP;
2982        }
2983    }
2984    bdi->needs_compressed_writes = s->extents[0].compressed;
2985    if (!s->extents[0].flat) {
2986        bdi->cluster_size = s->extents[0].cluster_sectors << BDRV_SECTOR_BITS;
2987    }
2988    return 0;
2989}
2990
2991static void vmdk_gather_child_options(BlockDriverState *bs, QDict *target,
2992                                      bool backing_overridden)
2993{
2994    /* No children but file and backing can be explicitly specified (TODO) */
2995    qdict_put(target, "file",
2996              qobject_ref(bs->file->bs->full_open_options));
2997
2998    if (backing_overridden) {
2999        if (bs->backing) {
3000            qdict_put(target, "backing",
3001                      qobject_ref(bs->backing->bs->full_open_options));
3002        } else {
3003            qdict_put_null(target, "backing");
3004        }
3005    }
3006}
3007
3008static QemuOptsList vmdk_create_opts = {
3009    .name = "vmdk-create-opts",
3010    .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head),
3011    .desc = {
3012        {
3013            .name = BLOCK_OPT_SIZE,
3014            .type = QEMU_OPT_SIZE,
3015            .help = "Virtual disk size"
3016        },
3017        {
3018            .name = BLOCK_OPT_ADAPTER_TYPE,
3019            .type = QEMU_OPT_STRING,
3020            .help = "Virtual adapter type, can be one of "
3021                    "ide (default), lsilogic, buslogic or legacyESX"
3022        },
3023        {
3024            .name = BLOCK_OPT_BACKING_FILE,
3025            .type = QEMU_OPT_STRING,
3026            .help = "File name of a base image"
3027        },
3028        {
3029            .name = BLOCK_OPT_BACKING_FMT,
3030            .type = QEMU_OPT_STRING,
3031            .help = "Must be 'vmdk' if present",
3032        },
3033        {
3034            .name = BLOCK_OPT_COMPAT6,
3035            .type = QEMU_OPT_BOOL,
3036            .help = "VMDK version 6 image",
3037            .def_value_str = "off"
3038        },
3039        {
3040            .name = BLOCK_OPT_HWVERSION,
3041            .type = QEMU_OPT_STRING,
3042            .help = "VMDK hardware version",
3043            .def_value_str = "undefined"
3044        },
3045        {
3046            .name = BLOCK_OPT_TOOLSVERSION,
3047            .type = QEMU_OPT_STRING,
3048            .help = "VMware guest tools version",
3049        },
3050        {
3051            .name = BLOCK_OPT_SUBFMT,
3052            .type = QEMU_OPT_STRING,
3053            .help =
3054                "VMDK flat extent format, can be one of "
3055                "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} "
3056        },
3057        {
3058            .name = BLOCK_OPT_ZEROED_GRAIN,
3059            .type = QEMU_OPT_BOOL,
3060            .help = "Enable efficient zero writes "
3061                    "using the zeroed-grain GTE feature"
3062        },
3063        { /* end of list */ }
3064    }
3065};
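
/*
 * Illustration only: these creation options are normally exercised through
 * qemu-img (a sketch; option spellings assumed from standard qemu-img usage):
 *   qemu-img create -f vmdk disk.vmdk 20G
 *   qemu-img create -f vmdk -o subformat=streamOptimized disk.vmdk 20G
 *   qemu-img create -f vmdk -o subformat=twoGbMaxExtentSparse,adapter_type=lsilogic disk.vmdk 20G
 */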
3066
3067static BlockDriver bdrv_vmdk = {
3068    .format_name                  = "vmdk",
3069    .instance_size                = sizeof(BDRVVmdkState),
3070    .bdrv_probe                   = vmdk_probe,
3071    .bdrv_open                    = vmdk_open,
3072    .bdrv_co_check                = vmdk_co_check,
3073    .bdrv_reopen_prepare          = vmdk_reopen_prepare,
3074    .bdrv_child_perm              = bdrv_default_perms,
3075    .bdrv_co_preadv               = vmdk_co_preadv,
3076    .bdrv_co_pwritev              = vmdk_co_pwritev,
3077    .bdrv_co_pwritev_compressed   = vmdk_co_pwritev_compressed,
3078    .bdrv_co_pwrite_zeroes        = vmdk_co_pwrite_zeroes,
3079    .bdrv_close                   = vmdk_close,
3080    .bdrv_co_create_opts          = vmdk_co_create_opts,
3081    .bdrv_co_create               = vmdk_co_create,
3082    .bdrv_co_block_status         = vmdk_co_block_status,
3083    .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
3084    .bdrv_has_zero_init           = vmdk_has_zero_init,
3085    .bdrv_get_specific_info       = vmdk_get_specific_info,
3086    .bdrv_refresh_limits          = vmdk_refresh_limits,
3087    .bdrv_get_info                = vmdk_get_info,
3088    .bdrv_gather_child_options    = vmdk_gather_child_options,
3089
3090    .is_format                    = true,
3091    .supports_backing             = true,
3092    .create_opts                  = &vmdk_create_opts,
3093};
3094
3095static void bdrv_vmdk_init(void)
3096{
3097    bdrv_register(&bdrv_vmdk);
3098}
3099
3100block_init(bdrv_vmdk_init);
3101