qemu/block/vhdx-log.c
<<
>>
Prefs
   1/*
   2 * Block driver for Hyper-V VHDX Images
   3 *
   4 * Copyright (c) 2013 Red Hat, Inc.,
   5 *
   6 * Authors:
   7 *  Jeff Cody <jcody@redhat.com>
   8 *
   9 *  This is based on the "VHDX Format Specification v1.00", published 8/25/2012
  10 *  by Microsoft:
  11 *      https://www.microsoft.com/en-us/download/details.aspx?id=34750
  12 *
  13 * This file covers the functionality of the metadata log writing, parsing, and
  14 * replay.
  15 *
  16 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
  17 * See the COPYING.LIB file in the top-level directory.
  18 *
  19 */
  20
  21#include "qemu/osdep.h"
  22#include "qapi/error.h"
  23#include "block/block-io.h"
  24#include "block/block_int.h"
  25#include "qemu/error-report.h"
  26#include "qemu/bswap.h"
  27#include "qemu/memalign.h"
  28#include "vhdx.h"
  29
  30
/*
 * A run of consecutive, validated log entries located in the on-disk log
 * buffer, as produced by vhdx_log_search() and consumed by vhdx_log_flush().
 */
typedef struct VHDXLogSequence {
    bool valid;             /* true once at least one valid entry was found */
    uint32_t count;         /* number of log entries in the sequence */
    VHDXLogEntries log;     /* log state; read/write delimit the sequence */
    VHDXLogEntryHeader hdr; /* header of the first entry of the sequence */
} VHDXLogSequence;
  37
/*
 * View of the leading sector(s) of a log entry: the entry header immediately
 * followed by the entry's descriptors.  vhdx_log_read_desc() reads the raw
 * log sectors straight into a buffer of this type.
 */
typedef struct VHDXLogDescEntries {
    VHDXLogEntryHeader hdr;
    VHDXLogDescriptor desc[];
} VHDXLogDescEntries;
  42
/* All-zero GUID; a header log GUID equal to this means no log is present */
static const MSGUID zero_guid = { 0 };
  44
/* The log located on the disk is a circular buffer containing
 * sectors of 4096 bytes each.
 *
 * It is assumed for the read/write functions below that the
 * circular buffer scheme uses a 'one sector open' to indicate
 * the buffer is full.  Given the validation methods used for each
 * sector, this method should be compatible with other methods that
 * do not waste a sector.
 */
  54
  55
  56/* Allow peeking at the hdr entry at the beginning of the current
  57 * read index, without advancing the read index */
  58static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log,
  59                             VHDXLogEntryHeader *hdr)
  60{
  61    int ret = 0;
  62    uint64_t offset;
  63    uint32_t read;
  64
  65    assert(hdr != NULL);
  66
  67    /* peek is only supported on sector boundaries */
  68    if (log->read % VHDX_LOG_SECTOR_SIZE) {
  69        ret = -EFAULT;
  70        goto exit;
  71    }
  72
  73    read = log->read;
  74    /* we are guaranteed that a) log sectors are 4096 bytes,
  75     * and b) the log length is a multiple of 1MB. So, there
  76     * is always a round number of sectors in the buffer */
  77    if ((read + sizeof(VHDXLogEntryHeader)) > log->length) {
  78        read = 0;
  79    }
  80
  81    if (read == log->write) {
  82        ret = -EINVAL;
  83        goto exit;
  84    }
  85
  86    offset = log->offset + read;
  87
  88    ret = bdrv_pread(bs->file, offset, sizeof(VHDXLogEntryHeader), hdr, 0);
  89    if (ret < 0) {
  90        goto exit;
  91    }
  92    vhdx_log_entry_hdr_le_import(hdr);
  93
  94exit:
  95    return ret;
  96}
  97
  98/* Index increment for log, based on sector boundaries */
  99static int vhdx_log_inc_idx(uint32_t idx, uint64_t length)
 100{
 101    idx += VHDX_LOG_SECTOR_SIZE;
 102    /* we are guaranteed that a) log sectors are 4096 bytes,
 103     * and b) the log length is a multiple of 1MB. So, there
 104     * is always a round number of sectors in the buffer */
 105    return idx >= length ? 0 : idx;
 106}
 107
 108
 109/* Reset the log to empty */
 110static void vhdx_log_reset(BlockDriverState *bs, BDRVVHDXState *s)
 111{
 112    MSGUID guid = { 0 };
 113    s->log.read = s->log.write = 0;
 114    /* a log guid of 0 indicates an empty log to any parser of v0
 115     * VHDX logs */
 116    vhdx_update_headers(bs, s, false, &guid);
 117}
 118
 119/* Reads num_sectors from the log (all log sectors are 4096 bytes),
 120 * into buffer 'buffer'.  Upon return, *sectors_read will contain
 121 * the number of sectors successfully read.
 122 *
 123 * It is assumed that 'buffer' is already allocated, and of sufficient
 124 * size (i.e. >= 4096*num_sectors).
 125 *
 126 * If 'peek' is true, then the tail (read) pointer for the circular buffer is
 127 * not modified.
 128 *
 129 * 0 is returned on success, -errno otherwise.  */
 130static int vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log,
 131                                 uint32_t *sectors_read, void *buffer,
 132                                 uint32_t num_sectors, bool peek)
 133{
 134    int ret = 0;
 135    uint64_t offset;
 136    uint32_t read;
 137
 138    read = log->read;
 139
 140    *sectors_read = 0;
 141    while (num_sectors) {
 142        if (read == log->write) {
 143            /* empty */
 144            break;
 145        }
 146        offset = log->offset + read;
 147
 148        ret = bdrv_pread(bs->file, offset, VHDX_LOG_SECTOR_SIZE, buffer, 0);
 149        if (ret < 0) {
 150            goto exit;
 151        }
 152        read = vhdx_log_inc_idx(read, log->length);
 153
 154        *sectors_read = *sectors_read + 1;
 155        num_sectors--;
 156    }
 157
 158exit:
 159    if (!peek) {
 160        log->read = read;
 161    }
 162    return ret;
 163}
 164
 165/* Writes num_sectors to the log (all log sectors are 4096 bytes),
 166 * from buffer 'buffer'.  Upon return, *sectors_written will contain
 167 * the number of sectors successfully written.
 168 *
 169 * It is assumed that 'buffer' is at least 4096*num_sectors large.
 170 *
 171 * 0 is returned on success, -errno otherwise */
 172static int vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log,
 173                                  uint32_t *sectors_written, void *buffer,
 174                                  uint32_t num_sectors)
 175{
 176    int ret = 0;
 177    uint64_t offset;
 178    uint32_t write;
 179    void *buffer_tmp;
 180    BDRVVHDXState *s = bs->opaque;
 181
 182    ret = vhdx_user_visible_write(bs, s);
 183    if (ret < 0) {
 184        goto exit;
 185    }
 186
 187    write = log->write;
 188
 189    buffer_tmp = buffer;
 190    while (num_sectors) {
 191
 192        offset = log->offset + write;
 193        write = vhdx_log_inc_idx(write, log->length);
 194        if (write == log->read) {
 195            /* full */
 196            break;
 197        }
 198        ret = bdrv_pwrite(bs->file, offset, VHDX_LOG_SECTOR_SIZE, buffer_tmp,
 199                          0);
 200        if (ret < 0) {
 201            goto exit;
 202        }
 203        buffer_tmp += VHDX_LOG_SECTOR_SIZE;
 204
 205        log->write = write;
 206        *sectors_written = *sectors_written + 1;
 207        num_sectors--;
 208    }
 209
 210exit:
 211    return ret;
 212}
 213
 214
 215/* Validates a log entry header */
 216static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr,
 217                                  BDRVVHDXState *s)
 218{
 219    int valid = false;
 220
 221    if (hdr->signature != VHDX_LOG_SIGNATURE) {
 222        goto exit;
 223    }
 224
 225    /* if the individual entry length is larger than the whole log
 226     * buffer, that is obviously invalid */
 227    if (log->length < hdr->entry_length) {
 228        goto exit;
 229    }
 230
 231    /* length of entire entry must be in units of 4KB (log sector size) */
 232    if (hdr->entry_length % (VHDX_LOG_SECTOR_SIZE)) {
 233        goto exit;
 234    }
 235
 236    /* per spec, sequence # must be > 0 */
 237    if (hdr->sequence_number == 0) {
 238        goto exit;
 239    }
 240
 241    /* log entries are only valid if they match the file-wide log guid
 242     * found in the active header */
 243    if (!guid_eq(hdr->log_guid, s->headers[s->curr_header]->log_guid)) {
 244        goto exit;
 245    }
 246
 247    if (hdr->descriptor_count * sizeof(VHDXLogDescriptor) > hdr->entry_length) {
 248        goto exit;
 249    }
 250
 251    valid = true;
 252
 253exit:
 254    return valid;
 255}
 256
 257/*
 258 * Given a log header, this will validate that the descriptors and the
 259 * corresponding data sectors (if applicable)
 260 *
 261 * Validation consists of:
 262 *      1. Making sure the sequence numbers matches the entry header
 263 *      2. Verifying a valid signature ('zero' or 'desc' for descriptors)
 264 *      3. File offset field is a multiple of 4KB
 265 *      4. If a data descriptor, the corresponding data sector
 266 *         has its signature ('data') and matching sequence number
 267 *
 268 * @desc: the data buffer containing the descriptor
 269 * @hdr:  the log entry header
 270 *
 271 * Returns true if valid
 272 */
 273static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc,
 274                                   VHDXLogEntryHeader *hdr)
 275{
 276    bool ret = false;
 277
 278    if (desc->sequence_number != hdr->sequence_number) {
 279        goto exit;
 280    }
 281    if (desc->file_offset % VHDX_LOG_SECTOR_SIZE) {
 282        goto exit;
 283    }
 284
 285    if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) {
 286        if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) {
 287            /* valid */
 288            ret = true;
 289        }
 290    } else if (desc->signature == VHDX_LOG_DESC_SIGNATURE) {
 291            /* valid */
 292            ret = true;
 293    }
 294
 295exit:
 296    return ret;
 297}
 298
 299
 300/* Prior to sector data for a log entry, there is the header
 301 * and the descriptors referenced in the header:
 302 *
 303 * [] = 4KB sector
 304 *
 305 * [ hdr, desc ][   desc   ][ ... ][ data ][ ... ]
 306 *
 307 * The first sector in a log entry has a 64 byte header, and
 308 * up to 126 32-byte descriptors.  If more descriptors than
 309 * 126 are required, then subsequent sectors can have up to 128
 310 * descriptors.  Each sector is 4KB.  Data follows the descriptor
 311 * sectors.
 312 *
 313 * This will return the number of sectors needed to encompass
 314 * the passed number of descriptors in desc_cnt.
 315 *
 316 * This will never return 0, even if desc_cnt is 0.
 317 */
 318static int vhdx_compute_desc_sectors(uint32_t desc_cnt)
 319{
 320    uint32_t desc_sectors;
 321
 322    desc_cnt += 2; /* account for header in first sector */
 323    desc_sectors = desc_cnt / 128;
 324    if (desc_cnt % 128) {
 325        desc_sectors++;
 326    }
 327
 328    return desc_sectors;
 329}
 330
 331
 332/* Reads the log header, and subsequent descriptors (if any).  This
 333 * will allocate all the space for buffer, which must be NULL when
 334 * passed into this function. Each descriptor will also be validated,
 335 * and error returned if any are invalid. */
 336static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s,
 337                              VHDXLogEntries *log, VHDXLogDescEntries **buffer,
 338                              bool convert_endian)
 339{
 340    int ret = 0;
 341    uint32_t desc_sectors;
 342    uint32_t sectors_read;
 343    VHDXLogEntryHeader hdr;
 344    VHDXLogDescEntries *desc_entries = NULL;
 345    VHDXLogDescriptor desc;
 346    int i;
 347
 348    assert(*buffer == NULL);
 349
 350    ret = vhdx_log_peek_hdr(bs, log, &hdr);
 351    if (ret < 0) {
 352        goto exit;
 353    }
 354
 355    if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
 356        ret = -EINVAL;
 357        goto exit;
 358    }
 359
 360    desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
 361    desc_entries = qemu_try_blockalign(bs->file->bs,
 362                                       desc_sectors * VHDX_LOG_SECTOR_SIZE);
 363    if (desc_entries == NULL) {
 364        ret = -ENOMEM;
 365        goto exit;
 366    }
 367
 368    ret = vhdx_log_read_sectors(bs, log, &sectors_read, desc_entries,
 369                                desc_sectors, false);
 370    if (ret < 0) {
 371        goto free_and_exit;
 372    }
 373    if (sectors_read != desc_sectors) {
 374        ret = -EINVAL;
 375        goto free_and_exit;
 376    }
 377
 378    /* put in proper endianness, and validate each desc */
 379    for (i = 0; i < hdr.descriptor_count; i++) {
 380        desc = desc_entries->desc[i];
 381        vhdx_log_desc_le_import(&desc);
 382        if (convert_endian) {
 383            desc_entries->desc[i] = desc;
 384        }
 385        if (vhdx_log_desc_is_valid(&desc, &hdr) == false) {
 386            ret = -EINVAL;
 387            goto free_and_exit;
 388        }
 389    }
 390    if (convert_endian) {
 391        desc_entries->hdr = hdr;
 392    }
 393
 394    *buffer = desc_entries;
 395    goto exit;
 396
 397free_and_exit:
 398    qemu_vfree(desc_entries);
 399exit:
 400    return ret;
 401}
 402
 403
 404/* Flushes the descriptor described by desc to the VHDX image file.
 405 * If the descriptor is a data descriptor, than 'data' must be non-NULL,
 406 * and >= 4096 bytes (VHDX_LOG_SECTOR_SIZE), containing the data to be
 407 * written.
 408 *
 409 * Verification is performed to make sure the sequence numbers of a data
 410 * descriptor match the sequence number in the desc.
 411 *
 412 * For a zero descriptor, it may describe multiple sectors to fill with zeroes.
 413 * In this case, it should be noted that zeroes are written to disk, and the
 414 * image file is not extended as a sparse file.  */
 415static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc,
 416                               VHDXLogDataSector *data)
 417{
 418    int ret = 0;
 419    uint64_t seq, file_offset;
 420    uint32_t offset = 0;
 421    void *buffer = NULL;
 422    uint64_t count = 1;
 423    int i;
 424
 425    buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
 426
 427    if (desc->signature == VHDX_LOG_DESC_SIGNATURE) {
 428        /* data sector */
 429        if (data == NULL) {
 430            ret = -EFAULT;
 431            goto exit;
 432        }
 433
 434        /* The sequence number of the data sector must match that
 435         * in the descriptor */
 436        seq = data->sequence_high;
 437        seq <<= 32;
 438        seq |= data->sequence_low & 0xffffffff;
 439
 440        if (seq != desc->sequence_number) {
 441            ret = -EINVAL;
 442            goto exit;
 443        }
 444
 445        /* Each data sector is in total 4096 bytes, however the first
 446         * 8 bytes, and last 4 bytes, are located in the descriptor */
 447        memcpy(buffer, &desc->leading_bytes, 8);
 448        offset += 8;
 449
 450        memcpy(buffer+offset, data->data, 4084);
 451        offset += 4084;
 452
 453        memcpy(buffer+offset, &desc->trailing_bytes, 4);
 454
 455    } else if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) {
 456        /* write 'count' sectors of sector */
 457        memset(buffer, 0, VHDX_LOG_SECTOR_SIZE);
 458        count = desc->zero_length / VHDX_LOG_SECTOR_SIZE;
 459    } else {
 460        error_report("Invalid VHDX log descriptor entry signature 0x%" PRIx32,
 461                      desc->signature);
 462        ret = -EINVAL;
 463        goto exit;
 464    }
 465
 466    file_offset = desc->file_offset;
 467
 468    /* count is only > 1 if we are writing zeroes */
 469    for (i = 0; i < count; i++) {
 470        ret = bdrv_pwrite_sync(bs->file, file_offset, VHDX_LOG_SECTOR_SIZE,
 471                               buffer, 0);
 472        if (ret < 0) {
 473            goto exit;
 474        }
 475        file_offset += VHDX_LOG_SECTOR_SIZE;
 476    }
 477
 478exit:
 479    qemu_vfree(buffer);
 480    return ret;
 481}
 482
 483/* Flush the entire log (as described by 'logs') to the VHDX image
 484 * file, and then set the log to 'empty' status once complete.
 485 *
 486 * The log entries should be validate prior to flushing */
 487static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
 488                          VHDXLogSequence *logs)
 489{
 490    int ret = 0;
 491    int i;
 492    uint32_t cnt, sectors_read;
 493    uint64_t new_file_size;
 494    void *data = NULL;
 495    int64_t file_length;
 496    VHDXLogDescEntries *desc_entries = NULL;
 497    VHDXLogEntryHeader hdr_tmp = { 0 };
 498
 499    cnt = logs->count;
 500
 501    data = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
 502
 503    ret = vhdx_user_visible_write(bs, s);
 504    if (ret < 0) {
 505        goto exit;
 506    }
 507
 508    /* each iteration represents one log sequence, which may span multiple
 509     * sectors */
 510    while (cnt--) {
 511        ret = vhdx_log_peek_hdr(bs, &logs->log, &hdr_tmp);
 512        if (ret < 0) {
 513            goto exit;
 514        }
 515        file_length = bdrv_getlength(bs->file->bs);
 516        if (file_length < 0) {
 517            ret = file_length;
 518            goto exit;
 519        }
 520        /* if the log shows a FlushedFileOffset larger than our current file
 521         * size, then that means the file has been truncated / corrupted, and
 522         * we must refused to open it / use it */
 523        if (hdr_tmp.flushed_file_offset > file_length) {
 524            ret = -EINVAL;
 525            goto exit;
 526        }
 527
 528        ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries, true);
 529        if (ret < 0) {
 530            goto exit;
 531        }
 532
 533        for (i = 0; i < desc_entries->hdr.descriptor_count; i++) {
 534            if (desc_entries->desc[i].signature == VHDX_LOG_DESC_SIGNATURE) {
 535                /* data sector, so read a sector to flush */
 536                ret = vhdx_log_read_sectors(bs, &logs->log, &sectors_read,
 537                                            data, 1, false);
 538                if (ret < 0) {
 539                    goto exit;
 540                }
 541                if (sectors_read != 1) {
 542                    ret = -EINVAL;
 543                    goto exit;
 544                }
 545                vhdx_log_data_le_import(data);
 546            }
 547
 548            ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data);
 549            if (ret < 0) {
 550                goto exit;
 551            }
 552        }
 553        if (file_length < desc_entries->hdr.last_file_offset) {
 554            new_file_size = desc_entries->hdr.last_file_offset;
 555            if (new_file_size % (1 * MiB)) {
 556                /* round up to nearest 1MB boundary */
 557                new_file_size = QEMU_ALIGN_UP(new_file_size, MiB);
 558                if (new_file_size > INT64_MAX) {
 559                    ret = -EINVAL;
 560                    goto exit;
 561                }
 562                ret = bdrv_truncate(bs->file, new_file_size, false,
 563                                    PREALLOC_MODE_OFF, 0, NULL);
 564                if (ret < 0) {
 565                    goto exit;
 566                }
 567            }
 568        }
 569        qemu_vfree(desc_entries);
 570        desc_entries = NULL;
 571    }
 572
 573    ret = bdrv_flush(bs);
 574    if (ret < 0) {
 575        goto exit;
 576    }
 577    /* once the log is fully flushed, indicate that we have an empty log
 578     * now.  This also sets the log guid to 0, to indicate an empty log */
 579    vhdx_log_reset(bs, s);
 580
 581exit:
 582    qemu_vfree(data);
 583    qemu_vfree(desc_entries);
 584    return ret;
 585}
 586
 587static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,
 588                                   VHDXLogEntries *log, uint64_t seq,
 589                                   bool *valid, VHDXLogEntryHeader *entry)
 590{
 591    int ret = 0;
 592    VHDXLogEntryHeader hdr;
 593    void *buffer = NULL;
 594    uint32_t i, desc_sectors, total_sectors, crc;
 595    uint32_t sectors_read = 0;
 596    VHDXLogDescEntries *desc_buffer = NULL;
 597
 598    *valid = false;
 599
 600    ret = vhdx_log_peek_hdr(bs, log, &hdr);
 601    if (ret < 0) {
 602        goto inc_and_exit;
 603    }
 604
 605    if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
 606        goto inc_and_exit;
 607    }
 608
 609    if (seq > 0) {
 610        if (hdr.sequence_number != seq + 1) {
 611            goto inc_and_exit;
 612        }
 613    }
 614
 615    desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
 616
 617    /* Read all log sectors, and calculate log checksum */
 618
 619    total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE;
 620
 621
 622    /* read_desc() will increment the read idx */
 623    ret = vhdx_log_read_desc(bs, s, log, &desc_buffer, false);
 624    if (ret < 0) {
 625        goto free_and_exit;
 626    }
 627
 628    crc = vhdx_checksum_calc(0xffffffff, (void *)desc_buffer,
 629                            desc_sectors * VHDX_LOG_SECTOR_SIZE, 4);
 630    crc ^= 0xffffffff;
 631
 632    buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
 633    if (total_sectors > desc_sectors) {
 634        for (i = 0; i < total_sectors - desc_sectors; i++) {
 635            sectors_read = 0;
 636            ret = vhdx_log_read_sectors(bs, log, &sectors_read, buffer,
 637                                        1, false);
 638            if (ret < 0 || sectors_read != 1) {
 639                goto free_and_exit;
 640            }
 641            crc = vhdx_checksum_calc(crc, buffer, VHDX_LOG_SECTOR_SIZE, -1);
 642            crc ^= 0xffffffff;
 643        }
 644    }
 645    crc ^= 0xffffffff;
 646    if (crc != hdr.checksum) {
 647        goto free_and_exit;
 648    }
 649
 650    *valid = true;
 651    *entry = hdr;
 652    goto free_and_exit;
 653
 654inc_and_exit:
 655    log->read = vhdx_log_inc_idx(log->read, log->length);
 656
 657free_and_exit:
 658    qemu_vfree(buffer);
 659    qemu_vfree(desc_buffer);
 660    return ret;
 661}
 662
 663/* Search through the log circular buffer, and find the valid, active
 664 * log sequence, if any exists
 665 * */
 666static int vhdx_log_search(BlockDriverState *bs, BDRVVHDXState *s,
 667                           VHDXLogSequence *logs)
 668{
 669    int ret = 0;
 670    uint32_t tail;
 671    bool seq_valid = false;
 672    VHDXLogSequence candidate = { 0 };
 673    VHDXLogEntryHeader hdr = { 0 };
 674    VHDXLogEntries curr_log;
 675
 676    memcpy(&curr_log, &s->log, sizeof(VHDXLogEntries));
 677    curr_log.write = curr_log.length;   /* assume log is full */
 678    curr_log.read = 0;
 679
 680
 681    /* now we will go through the whole log sector by sector, until
 682     * we find a valid, active log sequence, or reach the end of the
 683     * log buffer */
 684    for (;;) {
 685        uint64_t curr_seq = 0;
 686        VHDXLogSequence current = { 0 };
 687
 688        tail = curr_log.read;
 689
 690        ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq,
 691                                      &seq_valid, &hdr);
 692        if (ret < 0) {
 693            goto exit;
 694        }
 695
 696        if (seq_valid) {
 697            current.valid     = true;
 698            current.log       = curr_log;
 699            current.log.read  = tail;
 700            current.log.write = curr_log.read;
 701            current.count     = 1;
 702            current.hdr       = hdr;
 703
 704
 705            for (;;) {
 706                ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq,
 707                                              &seq_valid, &hdr);
 708                if (ret < 0) {
 709                    goto exit;
 710                }
 711                if (seq_valid == false) {
 712                    break;
 713                }
 714                current.log.write = curr_log.read;
 715                current.count++;
 716
 717                curr_seq = hdr.sequence_number;
 718            }
 719        }
 720
 721        if (current.valid) {
 722            if (candidate.valid == false ||
 723                current.hdr.sequence_number > candidate.hdr.sequence_number) {
 724                candidate = current;
 725            }
 726        }
 727
 728        if (curr_log.read < tail) {
 729            break;
 730        }
 731    }
 732
 733    *logs = candidate;
 734
 735    if (candidate.valid) {
 736        /* this is the next sequence number, for writes */
 737        s->log.sequence = candidate.hdr.sequence_number + 1;
 738    }
 739
 740
 741exit:
 742    return ret;
 743}
 744
 745/* Parse the replay log.  Per the VHDX spec, if the log is present
 746 * it must be replayed prior to opening the file, even read-only.
 747 *
 748 * If read-only, we must replay the log in RAM (or refuse to open
 749 * a dirty VHDX file read-only) */
 750int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed,
 751                   Error **errp)
 752{
 753    int ret = 0;
 754    VHDXHeader *hdr;
 755    VHDXLogSequence logs = { 0 };
 756
 757    hdr = s->headers[s->curr_header];
 758
 759    *flushed = false;
 760
 761    /* s->log.hdr is freed in vhdx_close() */
 762    if (s->log.hdr == NULL) {
 763        s->log.hdr = qemu_blockalign(bs, sizeof(VHDXLogEntryHeader));
 764    }
 765
 766    s->log.offset = hdr->log_offset;
 767    s->log.length = hdr->log_length;
 768
 769    if (s->log.offset < VHDX_LOG_MIN_SIZE ||
 770        s->log.offset % VHDX_LOG_MIN_SIZE) {
 771        ret = -EINVAL;
 772        goto exit;
 773    }
 774
 775    /* per spec, only log version of 0 is supported */
 776    if (hdr->log_version != 0) {
 777        ret = -EINVAL;
 778        goto exit;
 779    }
 780
 781    /* If either the log guid, or log length is zero,
 782     * then a replay log is not present */
 783    if (guid_eq(hdr->log_guid, zero_guid)) {
 784        goto exit;
 785    }
 786
 787    if (hdr->log_length == 0) {
 788        goto exit;
 789    }
 790
 791    if (hdr->log_length % VHDX_LOG_MIN_SIZE) {
 792        ret = -EINVAL;
 793        goto exit;
 794    }
 795
 796
 797    /* The log is present, we need to find if and where there is an active
 798     * sequence of valid entries present in the log.  */
 799
 800    ret = vhdx_log_search(bs, s, &logs);
 801    if (ret < 0) {
 802        goto exit;
 803    }
 804
 805    if (logs.valid) {
 806        if (bdrv_is_read_only(bs)) {
 807            bdrv_refresh_filename(bs);
 808            ret = -EPERM;
 809            error_setg(errp,
 810                       "VHDX image file '%s' opened read-only, but "
 811                       "contains a log that needs to be replayed",
 812                       bs->filename);
 813            error_append_hint(errp,  "To replay the log, run:\n"
 814                              "qemu-img check -r all '%s'\n",
 815                              bs->filename);
 816            goto exit;
 817        }
 818        /* now flush the log */
 819        ret = vhdx_log_flush(bs, s, &logs);
 820        if (ret < 0) {
 821            goto exit;
 822        }
 823        *flushed = true;
 824    }
 825
 826
 827exit:
 828    return ret;
 829}
 830
 831
 832
 833static void vhdx_log_raw_to_le_sector(VHDXLogDescriptor *desc,
 834                                      VHDXLogDataSector *sector, void *data,
 835                                      uint64_t seq)
 836{
 837    /* 8 + 4084 + 4 = 4096, 1 log sector */
 838    memcpy(&desc->leading_bytes, data, 8);
 839    data += 8;
 840    desc->leading_bytes = cpu_to_le64(desc->leading_bytes);
 841    memcpy(sector->data, data, 4084);
 842    data += 4084;
 843    memcpy(&desc->trailing_bytes, data, 4);
 844    desc->trailing_bytes = cpu_to_le32(desc->trailing_bytes);
 845    data += 4;
 846
 847    sector->sequence_high  = (uint32_t) (seq >> 32);
 848    sector->sequence_low   = (uint32_t) (seq & 0xffffffff);
 849    sector->data_signature = VHDX_LOG_DATA_SIGNATURE;
 850
 851    vhdx_log_desc_le_export(desc);
 852    vhdx_log_data_le_export(sector);
 853}
 854
 855
 856static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s,
 857                          void *data, uint32_t length, uint64_t offset)
 858{
 859    int ret = 0;
 860    void *buffer = NULL;
 861    void *merged_sector = NULL;
 862    void *data_tmp, *sector_write;
 863    unsigned int i;
 864    int sector_offset;
 865    uint32_t desc_sectors, sectors, total_length;
 866    uint32_t sectors_written = 0;
 867    uint32_t aligned_length;
 868    uint32_t leading_length = 0;
 869    uint32_t trailing_length = 0;
 870    uint32_t partial_sectors = 0;
 871    uint32_t bytes_written = 0;
 872    uint64_t file_offset;
 873    int64_t file_length;
 874    VHDXHeader *header;
 875    VHDXLogEntryHeader new_hdr;
 876    VHDXLogDescriptor *new_desc = NULL;
 877    VHDXLogDataSector *data_sector = NULL;
 878    MSGUID new_guid = { 0 };
 879
 880    header = s->headers[s->curr_header];
 881
 882    /* need to have offset read data, and be on 4096 byte boundary */
 883
 884    if (length > header->log_length) {
 885        /* no log present.  we could create a log here instead of failing */
 886        ret = -EINVAL;
 887        goto exit;
 888    }
 889
 890    if (guid_eq(header->log_guid, zero_guid)) {
 891        vhdx_guid_generate(&new_guid);
 892        vhdx_update_headers(bs, s, false, &new_guid);
 893    } else {
 894        /* currently, we require that the log be flushed after
 895         * every write. */
 896        ret = -ENOTSUP;
 897        goto exit;
 898    }
 899
 900    /* 0 is an invalid sequence number, but may also represent the first
 901     * log write (or a wrapped seq) */
 902    if (s->log.sequence == 0) {
 903        s->log.sequence = 1;
 904    }
 905
 906    sector_offset = offset % VHDX_LOG_SECTOR_SIZE;
 907    file_offset = QEMU_ALIGN_DOWN(offset, VHDX_LOG_SECTOR_SIZE);
 908
 909    aligned_length = length;
 910
 911    /* add in the unaligned head and tail bytes */
 912    if (sector_offset) {
 913        leading_length = (VHDX_LOG_SECTOR_SIZE - sector_offset);
 914        leading_length = leading_length > length ? length : leading_length;
 915        aligned_length -= leading_length;
 916        partial_sectors++;
 917    }
 918
 919    sectors = aligned_length / VHDX_LOG_SECTOR_SIZE;
 920    trailing_length = aligned_length - (sectors * VHDX_LOG_SECTOR_SIZE);
 921    if (trailing_length) {
 922        partial_sectors++;
 923    }
 924
 925    sectors += partial_sectors;
 926
 927    file_length = bdrv_getlength(bs->file->bs);
 928    if (file_length < 0) {
 929        ret = file_length;
 930        goto exit;
 931    }
 932
 933    /* sectors is now how many sectors the data itself takes, not
 934     * including the header and descriptor metadata */
 935
 936    new_hdr = (VHDXLogEntryHeader) {
 937                .signature           = VHDX_LOG_SIGNATURE,
 938                .tail                = s->log.tail,
 939                .sequence_number     = s->log.sequence,
 940                .descriptor_count    = sectors,
 941                .reserved            = 0,
 942                .flushed_file_offset = file_length,
 943                .last_file_offset    = file_length,
 944                .log_guid            = header->log_guid,
 945              };
 946
 947
 948    desc_sectors = vhdx_compute_desc_sectors(new_hdr.descriptor_count);
 949
 950    total_length = (desc_sectors + sectors) * VHDX_LOG_SECTOR_SIZE;
 951    new_hdr.entry_length = total_length;
 952
 953    vhdx_log_entry_hdr_le_export(&new_hdr);
 954
 955    buffer = qemu_blockalign(bs, total_length);
 956    memcpy(buffer, &new_hdr, sizeof(new_hdr));
 957
 958    new_desc = buffer + sizeof(new_hdr);
 959    data_sector = buffer + (desc_sectors * VHDX_LOG_SECTOR_SIZE);
 960    data_tmp = data;
 961
 962    /* All log sectors are 4KB, so for any partial sectors we must
 963     * merge the data with preexisting data from the final file
 964     * destination */
 965    merged_sector = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
 966
 967    for (i = 0; i < sectors; i++) {
 968        new_desc->signature       = VHDX_LOG_DESC_SIGNATURE;
 969        new_desc->sequence_number = s->log.sequence;
 970        new_desc->file_offset     = file_offset;
 971
 972        if (i == 0 && leading_length) {
 973            /* partial sector at the front of the buffer */
 974            ret = bdrv_pread(bs->file, file_offset, VHDX_LOG_SECTOR_SIZE,
 975                             merged_sector, 0);
 976            if (ret < 0) {
 977                goto exit;
 978            }
 979            memcpy(merged_sector + sector_offset, data_tmp, leading_length);
 980            bytes_written = leading_length;
 981            sector_write = merged_sector;
 982        } else if (i == sectors - 1 && trailing_length) {
 983            /* partial sector at the end of the buffer */
 984            ret = bdrv_pread(bs->file, file_offset + trailing_length,
 985                             VHDX_LOG_SECTOR_SIZE - trailing_length,
 986                             merged_sector + trailing_length, 0);
 987            if (ret < 0) {
 988                goto exit;
 989            }
 990            memcpy(merged_sector, data_tmp, trailing_length);
 991            bytes_written = trailing_length;
 992            sector_write = merged_sector;
 993        } else {
 994            bytes_written = VHDX_LOG_SECTOR_SIZE;
 995            sector_write = data_tmp;
 996        }
 997
 998        /* populate the raw sector data into the proper structures,
 999         * as well as update the descriptor, and convert to proper
1000         * endianness */
1001        vhdx_log_raw_to_le_sector(new_desc, data_sector, sector_write,
1002                                  s->log.sequence);
1003
1004        data_tmp += bytes_written;
1005        data_sector++;
1006        new_desc++;
1007        file_offset += VHDX_LOG_SECTOR_SIZE;
1008    }
1009
1010    /* checksum covers entire entry, from the log header through the
1011     * last data sector */
1012    vhdx_update_checksum(buffer, total_length,
1013                         offsetof(VHDXLogEntryHeader, checksum));
1014
1015    /* now write to the log */
1016    ret = vhdx_log_write_sectors(bs, &s->log, &sectors_written, buffer,
1017                                 desc_sectors + sectors);
1018    if (ret < 0) {
1019        goto exit;
1020    }
1021
1022    if (sectors_written != desc_sectors + sectors) {
1023        /* instead of failing, we could flush the log here */
1024        ret = -EINVAL;
1025        goto exit;
1026    }
1027
1028    s->log.sequence++;
1029    /* write new tail */
1030    s->log.tail = s->log.write;
1031
1032exit:
1033    qemu_vfree(buffer);
1034    qemu_vfree(merged_sector);
1035    return ret;
1036}
1037
1038/* Perform a log write, and then immediately flush the entire log */
1039int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s,
1040                             void *data, uint32_t length, uint64_t offset)
1041{
1042    int ret = 0;
1043    VHDXLogSequence logs = { .valid = true,
1044                             .count = 1,
1045                             .hdr = { 0 } };
1046
1047
1048    /* Make sure data written (new and/or changed blocks) is stable
1049     * on disk, before creating log entry */
1050    ret = bdrv_flush(bs);
1051    if (ret < 0) {
1052        goto exit;
1053    }
1054
1055    ret = vhdx_log_write(bs, s, data, length, offset);
1056    if (ret < 0) {
1057        goto exit;
1058    }
1059    logs.log = s->log;
1060
1061    /* Make sure log is stable on disk */
1062    ret = bdrv_flush(bs);
1063    if (ret < 0) {
1064        goto exit;
1065    }
1066
1067    ret = vhdx_log_flush(bs, s, &logs);
1068    if (ret < 0) {
1069        goto exit;
1070    }
1071
1072    s->log = logs.log;
1073
1074exit:
1075    return ret;
1076}
1077
1078