qemu/block.c
<<
>>
Prefs
   1/*
   2 * QEMU System Emulator block driver
   3 *
   4 * Copyright (c) 2003 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24#include "config-host.h"
  25#include "qemu-common.h"
  26#include "trace.h"
  27#include "monitor.h"
  28#include "block_int.h"
  29#include "module.h"
  30#include "qjson.h"
  31#include "qemu-coroutine.h"
  32#include "qmp-commands.h"
  33
  34#ifdef CONFIG_BSD
  35#include <sys/types.h>
  36#include <sys/stat.h>
  37#include <sys/ioctl.h>
  38#include <sys/queue.h>
  39#ifndef __DragonFly__
  40#include <sys/disk.h>
  41#endif
  42#endif
  43
  44#ifdef _WIN32
  45#include <windows.h>
  46#endif
  47
  48#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
  49
/*
 * Forward declarations of file-local helpers that are referenced before
 * their definitions (for example by bdrv_register(), bdrv_open() and
 * bdrv_rw_co_entry() below).
 */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);

/*
 * Block device states that have a non-empty device name: bdrv_new()
 * appends to this list and bdrv_make_anon() removes from it.
 */
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

/* Registered block drivers; bdrv_register() inserts at the head. */
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;
  87
  88#ifdef _WIN32
  89static int is_windows_drive_prefix(const char *filename)
  90{
  91    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
  92             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
  93            filename[1] == ':');
  94}
  95
  96int is_windows_drive(const char *filename)
  97{
  98    if (is_windows_drive_prefix(filename) &&
  99        filename[2] == '\0')
 100        return 1;
 101    if (strstart(filename, "\\\\.\\", NULL) ||
 102        strstart(filename, "//./", NULL))
 103        return 1;
 104    return 0;
 105}
 106#endif
 107
 108/* check if the path starts with "<protocol>:" */
 109static int path_has_protocol(const char *path)
 110{
 111#ifdef _WIN32
 112    if (is_windows_drive(path) ||
 113        is_windows_drive_prefix(path)) {
 114        return 0;
 115    }
 116#endif
 117
 118    return strchr(path, ':') != NULL;
 119}
 120
 121int path_is_absolute(const char *path)
 122{
 123    const char *p;
 124#ifdef _WIN32
 125    /* specific case for names like: "\\.\d:" */
 126    if (*path == '/' || *path == '\\')
 127        return 1;
 128#endif
 129    p = strchr(path, ':');
 130    if (p)
 131        p++;
 132    else
 133        p = path;
 134#ifdef _WIN32
 135    return (*p == '/' || *p == '\\');
 136#else
 137    return (*p == '/');
 138#endif
 139}
 140
 141/* if filename is absolute, just copy it to dest. Otherwise, build a
 142   path to it by considering it is relative to base_path. URL are
 143   supported. */
 144void path_combine(char *dest, int dest_size,
 145                  const char *base_path,
 146                  const char *filename)
 147{
 148    const char *p, *p1;
 149    int len;
 150
 151    if (dest_size <= 0)
 152        return;
 153    if (path_is_absolute(filename)) {
 154        pstrcpy(dest, dest_size, filename);
 155    } else {
 156        p = strchr(base_path, ':');
 157        if (p)
 158            p++;
 159        else
 160            p = base_path;
 161        p1 = strrchr(base_path, '/');
 162#ifdef _WIN32
 163        {
 164            const char *p2;
 165            p2 = strrchr(base_path, '\\');
 166            if (!p1 || p2 > p1)
 167                p1 = p2;
 168        }
 169#endif
 170        if (p1)
 171            p1++;
 172        else
 173            p1 = base_path;
 174        if (p1 > p)
 175            p = p1;
 176        len = p - base_path;
 177        if (len > dest_size - 1)
 178            len = dest_size - 1;
 179        memcpy(dest, base_path, len);
 180        dest[len] = '\0';
 181        pstrcat(dest, dest_size, filename);
 182    }
 183}
 184
 185void bdrv_register(BlockDriver *bdrv)
 186{
 187    /* Block drivers without coroutine functions need emulation */
 188    if (!bdrv->bdrv_co_readv) {
 189        bdrv->bdrv_co_readv = bdrv_co_readv_em;
 190        bdrv->bdrv_co_writev = bdrv_co_writev_em;
 191
 192        /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
 193         * the block driver lacks aio we need to emulate that too.
 194         */
 195        if (!bdrv->bdrv_aio_readv) {
 196            /* add AIO emulation layer */
 197            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
 198            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
 199        }
 200    }
 201
 202    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
 203}
 204
 205/* create a new block device (by default it is empty) */
 206BlockDriverState *bdrv_new(const char *device_name)
 207{
 208    BlockDriverState *bs;
 209
 210    bs = g_malloc0(sizeof(BlockDriverState));
 211    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
 212    if (device_name[0] != '\0') {
 213        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
 214    }
 215    bdrv_iostatus_disable(bs);
 216    return bs;
 217}
 218
 219BlockDriver *bdrv_find_format(const char *format_name)
 220{
 221    BlockDriver *drv1;
 222    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
 223        if (!strcmp(drv1->format_name, format_name)) {
 224            return drv1;
 225        }
 226    }
 227    return NULL;
 228}
 229
 230static int bdrv_is_whitelisted(BlockDriver *drv)
 231{
 232    static const char *whitelist[] = {
 233        CONFIG_BDRV_WHITELIST
 234    };
 235    const char **p;
 236
 237    if (!whitelist[0])
 238        return 1;               /* no whitelist, anything goes */
 239
 240    for (p = whitelist; *p; p++) {
 241        if (!strcmp(drv->format_name, *p)) {
 242            return 1;
 243        }
 244    }
 245    return 0;
 246}
 247
 248BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
 249{
 250    BlockDriver *drv = bdrv_find_format(format_name);
 251    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
 252}
 253
 254int bdrv_create(BlockDriver *drv, const char* filename,
 255    QEMUOptionParameter *options)
 256{
 257    if (!drv->bdrv_create)
 258        return -ENOTSUP;
 259
 260    return drv->bdrv_create(filename, options);
 261}
 262
 263int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
 264{
 265    BlockDriver *drv;
 266
 267    drv = bdrv_find_protocol(filename);
 268    if (drv == NULL) {
 269        return -ENOENT;
 270    }
 271
 272    return bdrv_create(drv, filename, options);
 273}
 274
 275#ifdef _WIN32
 276void get_tmp_filename(char *filename, int size)
 277{
 278    char temp_dir[MAX_PATH];
 279
 280    GetTempPath(MAX_PATH, temp_dir);
 281    GetTempFileName(temp_dir, "qem", 0, filename);
 282}
 283#else
 284void get_tmp_filename(char *filename, int size)
 285{
 286    int fd;
 287    const char *tmpdir;
 288    /* XXX: race condition possible */
 289    tmpdir = getenv("TMPDIR");
 290    if (!tmpdir)
 291        tmpdir = "/tmp";
 292    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
 293    fd = mkstemp(filename);
 294    close(fd);
 295}
 296#endif
 297
 298/*
 299 * Detect host devices. By convention, /dev/cdrom[N] is always
 300 * recognized as a host CDROM.
 301 */
 302static BlockDriver *find_hdev_driver(const char *filename)
 303{
 304    int score_max = 0, score;
 305    BlockDriver *drv = NULL, *d;
 306
 307    QLIST_FOREACH(d, &bdrv_drivers, list) {
 308        if (d->bdrv_probe_device) {
 309            score = d->bdrv_probe_device(filename);
 310            if (score > score_max) {
 311                score_max = score;
 312                drv = d;
 313            }
 314        }
 315    }
 316
 317    return drv;
 318}
 319
 320BlockDriver *bdrv_find_protocol(const char *filename)
 321{
 322    BlockDriver *drv1;
 323    char protocol[128];
 324    int len;
 325    const char *p;
 326
 327    /* TODO Drivers without bdrv_file_open must be specified explicitly */
 328
 329    /*
 330     * XXX(hch): we really should not let host device detection
 331     * override an explicit protocol specification, but moving this
 332     * later breaks access to device names with colons in them.
 333     * Thanks to the brain-dead persistent naming schemes on udev-
 334     * based Linux systems those actually are quite common.
 335     */
 336    drv1 = find_hdev_driver(filename);
 337    if (drv1) {
 338        return drv1;
 339    }
 340
 341    if (!path_has_protocol(filename)) {
 342        return bdrv_find_format("file");
 343    }
 344    p = strchr(filename, ':');
 345    assert(p != NULL);
 346    len = p - filename;
 347    if (len > sizeof(protocol) - 1)
 348        len = sizeof(protocol) - 1;
 349    memcpy(protocol, filename, len);
 350    protocol[len] = '\0';
 351    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
 352        if (drv1->protocol_name &&
 353            !strcmp(drv1->protocol_name, protocol)) {
 354            return drv1;
 355        }
 356    }
 357    return NULL;
 358}
 359
 360static int find_image_format(const char *filename, BlockDriver **pdrv)
 361{
 362    int ret, score, score_max;
 363    BlockDriver *drv1, *drv;
 364    uint8_t buf[2048];
 365    BlockDriverState *bs;
 366
 367    ret = bdrv_file_open(&bs, filename, 0);
 368    if (ret < 0) {
 369        *pdrv = NULL;
 370        return ret;
 371    }
 372
 373    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
 374    if (bs->sg || !bdrv_is_inserted(bs)) {
 375        bdrv_delete(bs);
 376        drv = bdrv_find_format("raw");
 377        if (!drv) {
 378            ret = -ENOENT;
 379        }
 380        *pdrv = drv;
 381        return ret;
 382    }
 383
 384    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
 385    bdrv_delete(bs);
 386    if (ret < 0) {
 387        *pdrv = NULL;
 388        return ret;
 389    }
 390
 391    score_max = 0;
 392    drv = NULL;
 393    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
 394        if (drv1->bdrv_probe) {
 395            score = drv1->bdrv_probe(buf, ret, filename);
 396            if (score > score_max) {
 397                score_max = score;
 398                drv = drv1;
 399            }
 400        }
 401    }
 402    if (!drv) {
 403        ret = -ENOENT;
 404    }
 405    *pdrv = drv;
 406    return ret;
 407}
 408
 409/**
 410 * Set the current 'total_sectors' value
 411 */
 412static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
 413{
 414    BlockDriver *drv = bs->drv;
 415
 416    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
 417    if (bs->sg)
 418        return 0;
 419
 420    /* query actual device if possible, otherwise just trust the hint */
 421    if (drv->bdrv_getlength) {
 422        int64_t length = drv->bdrv_getlength(bs);
 423        if (length < 0) {
 424            return length;
 425        }
 426        hint = length >> BDRV_SECTOR_BITS;
 427    }
 428
 429    bs->total_sectors = hint;
 430    return 0;
 431}
 432
 433/**
 434 * Set open flags for a given cache mode
 435 *
 436 * Return 0 on success, -1 if the cache mode was invalid.
 437 */
 438int bdrv_parse_cache_flags(const char *mode, int *flags)
 439{
 440    *flags &= ~BDRV_O_CACHE_MASK;
 441
 442    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
 443        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
 444    } else if (!strcmp(mode, "directsync")) {
 445        *flags |= BDRV_O_NOCACHE;
 446    } else if (!strcmp(mode, "writeback")) {
 447        *flags |= BDRV_O_CACHE_WB;
 448    } else if (!strcmp(mode, "unsafe")) {
 449        *flags |= BDRV_O_CACHE_WB;
 450        *flags |= BDRV_O_NO_FLUSH;
 451    } else if (!strcmp(mode, "writethrough")) {
 452        /* this is the default */
 453    } else {
 454        return -1;
 455    }
 456
 457    return 0;
 458}
 459
 460/*
 461 * Common part for opening disk images and files
 462 */
 463static int bdrv_open_common(BlockDriverState *bs, const char *filename,
 464    int flags, BlockDriver *drv)
 465{
 466    int ret, open_flags;
 467
 468    assert(drv != NULL);
 469
 470    trace_bdrv_open_common(bs, filename, flags, drv->format_name);
 471
 472    bs->file = NULL;
 473    bs->total_sectors = 0;
 474    bs->encrypted = 0;
 475    bs->valid_key = 0;
 476    bs->sg = 0;
 477    bs->open_flags = flags;
 478    bs->growable = 0;
 479    bs->buffer_alignment = 512;
 480
 481    pstrcpy(bs->filename, sizeof(bs->filename), filename);
 482    bs->backing_file[0] = '\0';
 483
 484    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
 485        return -ENOTSUP;
 486    }
 487
 488    bs->drv = drv;
 489    bs->opaque = g_malloc0(drv->instance_size);
 490
 491    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
 492
 493    /*
 494     * Clear flags that are internal to the block layer before opening the
 495     * image.
 496     */
 497    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
 498
 499    /*
 500     * Snapshots should be writable.
 501     */
 502    if (bs->is_temporary) {
 503        open_flags |= BDRV_O_RDWR;
 504    }
 505
 506    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
 507
 508    /* Open the image, either directly or using a protocol */
 509    if (drv->bdrv_file_open) {
 510        ret = drv->bdrv_file_open(bs, filename, open_flags);
 511    } else {
 512        ret = bdrv_file_open(&bs->file, filename, open_flags);
 513        if (ret >= 0) {
 514            ret = drv->bdrv_open(bs, open_flags);
 515        }
 516    }
 517
 518    if (ret < 0) {
 519        goto free_and_fail;
 520    }
 521
 522    ret = refresh_total_sectors(bs, bs->total_sectors);
 523    if (ret < 0) {
 524        goto free_and_fail;
 525    }
 526
 527#ifndef _WIN32
 528    if (bs->is_temporary) {
 529        unlink(filename);
 530    }
 531#endif
 532    return 0;
 533
 534free_and_fail:
 535    if (bs->file) {
 536        bdrv_delete(bs->file);
 537        bs->file = NULL;
 538    }
 539    g_free(bs->opaque);
 540    bs->opaque = NULL;
 541    bs->drv = NULL;
 542    return ret;
 543}
 544
 545/*
 546 * Opens a file using a protocol (file, host_device, nbd, ...)
 547 */
 548int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
 549{
 550    BlockDriverState *bs;
 551    BlockDriver *drv;
 552    int ret;
 553
 554    drv = bdrv_find_protocol(filename);
 555    if (!drv) {
 556        return -ENOENT;
 557    }
 558
 559    bs = bdrv_new("");
 560    ret = bdrv_open_common(bs, filename, flags, drv);
 561    if (ret < 0) {
 562        bdrv_delete(bs);
 563        return ret;
 564    }
 565    bs->growable = 1;
 566    *pbs = bs;
 567    return 0;
 568}
 569
 570/*
 571 * Opens a disk image (raw, qcow2, vmdk, ...)
 572 */
 573int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
 574              BlockDriver *drv)
 575{
 576    int ret;
 577    char tmp_filename[PATH_MAX];
 578
 579    if (flags & BDRV_O_SNAPSHOT) {
 580        BlockDriverState *bs1;
 581        int64_t total_size;
 582        int is_protocol = 0;
 583        BlockDriver *bdrv_qcow2;
 584        QEMUOptionParameter *options;
 585        char backing_filename[PATH_MAX];
 586
 587        /* if snapshot, we create a temporary backing file and open it
 588           instead of opening 'filename' directly */
 589
 590        /* if there is a backing file, use it */
 591        bs1 = bdrv_new("");
 592        ret = bdrv_open(bs1, filename, 0, drv);
 593        if (ret < 0) {
 594            bdrv_delete(bs1);
 595            return ret;
 596        }
 597        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
 598
 599        if (bs1->drv && bs1->drv->protocol_name)
 600            is_protocol = 1;
 601
 602        bdrv_delete(bs1);
 603
 604        get_tmp_filename(tmp_filename, sizeof(tmp_filename));
 605
 606        /* Real path is meaningless for protocols */
 607        if (is_protocol)
 608            snprintf(backing_filename, sizeof(backing_filename),
 609                     "%s", filename);
 610        else if (!realpath(filename, backing_filename))
 611            return -errno;
 612
 613        bdrv_qcow2 = bdrv_find_format("qcow2");
 614        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
 615
 616        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
 617        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
 618        if (drv) {
 619            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
 620                drv->format_name);
 621        }
 622
 623        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
 624        free_option_parameters(options);
 625        if (ret < 0) {
 626            return ret;
 627        }
 628
 629        filename = tmp_filename;
 630        drv = bdrv_qcow2;
 631        bs->is_temporary = 1;
 632    }
 633
 634    /* Find the right image format driver */
 635    if (!drv) {
 636        ret = find_image_format(filename, &drv);
 637    }
 638
 639    if (!drv) {
 640        goto unlink_and_fail;
 641    }
 642
 643    /* Open the image */
 644    ret = bdrv_open_common(bs, filename, flags, drv);
 645    if (ret < 0) {
 646        goto unlink_and_fail;
 647    }
 648
 649    /* If there is a backing file, use it */
 650    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
 651        char backing_filename[PATH_MAX];
 652        int back_flags;
 653        BlockDriver *back_drv = NULL;
 654
 655        bs->backing_hd = bdrv_new("");
 656
 657        if (path_has_protocol(bs->backing_file)) {
 658            pstrcpy(backing_filename, sizeof(backing_filename),
 659                    bs->backing_file);
 660        } else {
 661            path_combine(backing_filename, sizeof(backing_filename),
 662                         filename, bs->backing_file);
 663        }
 664
 665        if (bs->backing_format[0] != '\0') {
 666            back_drv = bdrv_find_format(bs->backing_format);
 667        }
 668
 669        /* backing files always opened read-only */
 670        back_flags =
 671            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
 672
 673        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
 674        if (ret < 0) {
 675            bdrv_close(bs);
 676            return ret;
 677        }
 678        if (bs->is_temporary) {
 679            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
 680        } else {
 681            /* base image inherits from "parent" */
 682            bs->backing_hd->keep_read_only = bs->keep_read_only;
 683        }
 684    }
 685
 686    if (!bdrv_key_required(bs)) {
 687        bdrv_dev_change_media_cb(bs, true);
 688    }
 689
 690    return 0;
 691
 692unlink_and_fail:
 693    if (bs->is_temporary) {
 694        unlink(filename);
 695    }
 696    return ret;
 697}
 698
 699void bdrv_close(BlockDriverState *bs)
 700{
 701    if (bs->drv) {
 702        if (bs == bs_snapshots) {
 703            bs_snapshots = NULL;
 704        }
 705        if (bs->backing_hd) {
 706            bdrv_delete(bs->backing_hd);
 707            bs->backing_hd = NULL;
 708        }
 709        bs->drv->bdrv_close(bs);
 710        g_free(bs->opaque);
 711#ifdef _WIN32
 712        if (bs->is_temporary) {
 713            unlink(bs->filename);
 714        }
 715#endif
 716        bs->opaque = NULL;
 717        bs->drv = NULL;
 718
 719        if (bs->file != NULL) {
 720            bdrv_close(bs->file);
 721        }
 722
 723        bdrv_dev_change_media_cb(bs, false);
 724    }
 725}
 726
 727void bdrv_close_all(void)
 728{
 729    BlockDriverState *bs;
 730
 731    QTAILQ_FOREACH(bs, &bdrv_states, list) {
 732        bdrv_close(bs);
 733    }
 734}
 735
 736/* make a BlockDriverState anonymous by removing from bdrv_state list.
 737   Also, NULL terminate the device_name to prevent double remove */
 738void bdrv_make_anon(BlockDriverState *bs)
 739{
 740    if (bs->device_name[0] != '\0') {
 741        QTAILQ_REMOVE(&bdrv_states, bs, list);
 742    }
 743    bs->device_name[0] = '\0';
 744}
 745
 746void bdrv_delete(BlockDriverState *bs)
 747{
 748    assert(!bs->dev);
 749
 750    /* remove from list, if necessary */
 751    bdrv_make_anon(bs);
 752
 753    bdrv_close(bs);
 754    if (bs->file != NULL) {
 755        bdrv_delete(bs->file);
 756    }
 757
 758    assert(bs != bs_snapshots);
 759    g_free(bs);
 760}
 761
 762int bdrv_attach_dev(BlockDriverState *bs, void *dev)
 763/* TODO change to DeviceState *dev when all users are qdevified */
 764{
 765    if (bs->dev) {
 766        return -EBUSY;
 767    }
 768    bs->dev = dev;
 769    bdrv_iostatus_reset(bs);
 770    return 0;
 771}
 772
 773/* TODO qdevified devices don't use this, remove when devices are qdevified */
 774void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
 775{
 776    if (bdrv_attach_dev(bs, dev) < 0) {
 777        abort();
 778    }
 779}
 780
 781void bdrv_detach_dev(BlockDriverState *bs, void *dev)
 782/* TODO change to DeviceState *dev when all users are qdevified */
 783{
 784    assert(bs->dev == dev);
 785    bs->dev = NULL;
 786    bs->dev_ops = NULL;
 787    bs->dev_opaque = NULL;
 788    bs->buffer_alignment = 512;
 789}
 790
 791/* TODO change to return DeviceState * when all users are qdevified */
 792void *bdrv_get_attached_dev(BlockDriverState *bs)
 793{
 794    return bs->dev;
 795}
 796
 797void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
 798                      void *opaque)
 799{
 800    bs->dev_ops = ops;
 801    bs->dev_opaque = opaque;
 802    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
 803        bs_snapshots = NULL;
 804    }
 805}
 806
 807static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
 808{
 809    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
 810        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
 811    }
 812}
 813
 814bool bdrv_dev_has_removable_media(BlockDriverState *bs)
 815{
 816    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
 817}
 818
 819void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
 820{
 821    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
 822        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
 823    }
 824}
 825
 826bool bdrv_dev_is_tray_open(BlockDriverState *bs)
 827{
 828    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
 829        return bs->dev_ops->is_tray_open(bs->dev_opaque);
 830    }
 831    return false;
 832}
 833
 834static void bdrv_dev_resize_cb(BlockDriverState *bs)
 835{
 836    if (bs->dev_ops && bs->dev_ops->resize_cb) {
 837        bs->dev_ops->resize_cb(bs->dev_opaque);
 838    }
 839}
 840
 841bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
 842{
 843    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
 844        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
 845    }
 846    return false;
 847}
 848
 849/*
 850 * Run consistency checks on an image
 851 *
 852 * Returns 0 if the check could be completed (it doesn't mean that the image is
 853 * free of errors) or -errno when an internal error occurred. The results of the
 854 * check are stored in res.
 855 */
 856int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
 857{
 858    if (bs->drv->bdrv_check == NULL) {
 859        return -ENOTSUP;
 860    }
 861
 862    memset(res, 0, sizeof(*res));
 863    return bs->drv->bdrv_check(bs, res);
 864}
 865
 866#define COMMIT_BUF_SECTORS 2048
 867
 868/* commit COW file into the raw image */
 869int bdrv_commit(BlockDriverState *bs)
 870{
 871    BlockDriver *drv = bs->drv;
 872    BlockDriver *backing_drv;
 873    int64_t sector, total_sectors;
 874    int n, ro, open_flags;
 875    int ret = 0, rw_ret = 0;
 876    uint8_t *buf;
 877    char filename[1024];
 878    BlockDriverState *bs_rw, *bs_ro;
 879
 880    if (!drv)
 881        return -ENOMEDIUM;
 882    
 883    if (!bs->backing_hd) {
 884        return -ENOTSUP;
 885    }
 886
 887    if (bs->backing_hd->keep_read_only) {
 888        return -EACCES;
 889    }
 890
 891    backing_drv = bs->backing_hd->drv;
 892    ro = bs->backing_hd->read_only;
 893    strncpy(filename, bs->backing_hd->filename, sizeof(filename));
 894    open_flags =  bs->backing_hd->open_flags;
 895
 896    if (ro) {
 897        /* re-open as RW */
 898        bdrv_delete(bs->backing_hd);
 899        bs->backing_hd = NULL;
 900        bs_rw = bdrv_new("");
 901        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
 902            backing_drv);
 903        if (rw_ret < 0) {
 904            bdrv_delete(bs_rw);
 905            /* try to re-open read-only */
 906            bs_ro = bdrv_new("");
 907            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
 908                backing_drv);
 909            if (ret < 0) {
 910                bdrv_delete(bs_ro);
 911                /* drive not functional anymore */
 912                bs->drv = NULL;
 913                return ret;
 914            }
 915            bs->backing_hd = bs_ro;
 916            return rw_ret;
 917        }
 918        bs->backing_hd = bs_rw;
 919    }
 920
 921    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
 922    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
 923
 924    for (sector = 0; sector < total_sectors; sector += n) {
 925        if (drv->bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
 926
 927            if (bdrv_read(bs, sector, buf, n) != 0) {
 928                ret = -EIO;
 929                goto ro_cleanup;
 930            }
 931
 932            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
 933                ret = -EIO;
 934                goto ro_cleanup;
 935            }
 936        }
 937    }
 938
 939    if (drv->bdrv_make_empty) {
 940        ret = drv->bdrv_make_empty(bs);
 941        bdrv_flush(bs);
 942    }
 943
 944    /*
 945     * Make sure all data we wrote to the backing device is actually
 946     * stable on disk.
 947     */
 948    if (bs->backing_hd)
 949        bdrv_flush(bs->backing_hd);
 950
 951ro_cleanup:
 952    g_free(buf);
 953
 954    if (ro) {
 955        /* re-open as RO */
 956        bdrv_delete(bs->backing_hd);
 957        bs->backing_hd = NULL;
 958        bs_ro = bdrv_new("");
 959        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
 960            backing_drv);
 961        if (ret < 0) {
 962            bdrv_delete(bs_ro);
 963            /* drive not functional anymore */
 964            bs->drv = NULL;
 965            return ret;
 966        }
 967        bs->backing_hd = bs_ro;
 968        bs->backing_hd->keep_read_only = 0;
 969    }
 970
 971    return ret;
 972}
 973
 974void bdrv_commit_all(void)
 975{
 976    BlockDriverState *bs;
 977
 978    QTAILQ_FOREACH(bs, &bdrv_states, list) {
 979        bdrv_commit(bs);
 980    }
 981}
 982
 983/*
 984 * Return values:
 985 * 0        - success
 986 * -EINVAL  - backing format specified, but no file
 987 * -ENOSPC  - can't update the backing file because no space is left in the
 988 *            image file header
 989 * -ENOTSUP - format driver doesn't support changing the backing file
 990 */
 991int bdrv_change_backing_file(BlockDriverState *bs,
 992    const char *backing_file, const char *backing_fmt)
 993{
 994    BlockDriver *drv = bs->drv;
 995
 996    if (drv->bdrv_change_backing_file != NULL) {
 997        return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
 998    } else {
 999        return -ENOTSUP;
1000    }
1001}
1002
1003static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1004                                   size_t size)
1005{
1006    int64_t len;
1007
1008    if (!bdrv_is_inserted(bs))
1009        return -ENOMEDIUM;
1010
1011    if (bs->growable)
1012        return 0;
1013
1014    len = bdrv_getlength(bs);
1015
1016    if (offset < 0)
1017        return -EIO;
1018
1019    if ((offset > len) || (len - offset < size))
1020        return -EIO;
1021
1022    return 0;
1023}
1024
1025static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1026                              int nb_sectors)
1027{
1028    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1029                                   nb_sectors * BDRV_SECTOR_SIZE);
1030}
1031
/*
 * Arguments and result of one synchronous request that is executed in a
 * coroutine: set up by bdrv_rw_co(), consumed by bdrv_rw_co_entry().
 */
typedef struct RwCo {
    BlockDriverState *bs;       /* device the request is issued on */
    int64_t sector_num;         /* first sector of the request */
    int nb_sectors;             /* number of sectors to transfer */
    QEMUIOVector *qiov;         /* I/O vector describing the data buffer */
    bool is_write;              /* true: write request, false: read request */
    int ret;                    /* result; NOT_DONE until the request ends */
} RwCo;
1040
1041static void coroutine_fn bdrv_rw_co_entry(void *opaque)
1042{
1043    RwCo *rwco = opaque;
1044
1045    if (!rwco->is_write) {
1046        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1047                                     rwco->nb_sectors, rwco->qiov);
1048    } else {
1049        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1050                                      rwco->nb_sectors, rwco->qiov);
1051    }
1052}
1053
1054/*
1055 * Process a synchronous request using coroutines
1056 */
1057static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1058                      int nb_sectors, bool is_write)
1059{
1060    QEMUIOVector qiov;
1061    struct iovec iov = {
1062        .iov_base = (void *)buf,
1063        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1064    };
1065    Coroutine *co;
1066    RwCo rwco = {
1067        .bs = bs,
1068        .sector_num = sector_num,
1069        .nb_sectors = nb_sectors,
1070        .qiov = &qiov,
1071        .is_write = is_write,
1072        .ret = NOT_DONE,
1073    };
1074
1075    qemu_iovec_init_external(&qiov, &iov, 1);
1076
1077    if (qemu_in_coroutine()) {
1078        /* Fast-path if already in coroutine context */
1079        bdrv_rw_co_entry(&rwco);
1080    } else {
1081        co = qemu_coroutine_create(bdrv_rw_co_entry);
1082        qemu_coroutine_enter(co, &rwco);
1083        while (rwco.ret == NOT_DONE) {
1084            qemu_aio_wait();
1085        }
1086    }
1087    return rwco.ret;
1088}
1089
1090/* return < 0 if error. See bdrv_write() for the return codes */
1091int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1092              uint8_t *buf, int nb_sectors)
1093{
1094    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1095}
1096
1097static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1098                             int nb_sectors, int dirty)
1099{
1100    int64_t start, end;
1101    unsigned long val, idx, bit;
1102
1103    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1104    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1105
1106    for (; start <= end; start++) {
1107        idx = start / (sizeof(unsigned long) * 8);
1108        bit = start % (sizeof(unsigned long) * 8);
1109        val = bs->dirty_bitmap[idx];
1110        if (dirty) {
1111            if (!(val & (1UL << bit))) {
1112                bs->dirty_count++;
1113                val |= 1UL << bit;
1114            }
1115        } else {
1116            if (val & (1UL << bit)) {
1117                bs->dirty_count--;
1118                val &= ~(1UL << bit);
1119            }
1120        }
1121        bs->dirty_bitmap[idx] = val;
1122    }
1123}
1124
1125/* Return < 0 if error. Important errors are:
1126  -EIO         generic I/O error (may happen for all errors)
1127  -ENOMEDIUM   No media inserted.
1128  -EINVAL      Invalid sector number or nb_sectors
1129  -EACCES      Trying to write a read-only device
1130*/
1131int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1132               const uint8_t *buf, int nb_sectors)
1133{
1134    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
1135}
1136
/*
 * Read 'count1' bytes starting at byte 'offset' into 'buf', using
 * sector-granularity bdrv_read() calls.
 *
 * The request is split into up to three parts: an unaligned head that goes
 * through a one-sector bounce buffer, a run of whole sectors read directly
 * into 'buf', and an unaligned tail that again goes through the bounce
 * buffer.
 *
 * Returns 'count1' on success, or the negative value returned by the
 * failing bdrv_read() call.
 */
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    /* bounce buffer for the partial first and last sectors */
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    /* 'count' tracks the bytes still to be copied */
    count = count1;
    /* first read to align to sector start */
    /* len = bytes from 'offset' up to the next sector boundary (0 if aligned) */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        /* copy only the requested part, starting at the in-sector offset */
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
1181
/*
 * Write 'count1' bytes from 'buf' starting at byte 'offset', using
 * sector-granularity bdrv_read()/bdrv_write() calls.
 *
 * Partial sectors at the start and end of the range are handled with a
 * read-modify-write cycle through a one-sector bounce buffer; whole sectors
 * in between are written directly from 'buf'.
 *
 * Returns 'count1' on success, or the negative value returned by the
 * failing bdrv_read()/bdrv_write() call.
 */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    /* bounce buffer for the partial first and last sectors */
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    /* 'count' tracks the bytes still to be written */
    count = count1;
    /* first write to align to sector start */
    /* len = bytes from 'offset' up to the next sector boundary (0 if aligned) */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        /* read-modify-write: keep the bytes of the sector we do not own */
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        /* read-modify-write again for the trailing partial sector */
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}
1230
1231/*
1232 * Writes to the file and ensures that no writes are reordered across this
1233 * request (acts as a barrier)
1234 *
1235 * Returns 0 on success, -errno in error cases.
1236 */
1237int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1238    const void *buf, int count)
1239{
1240    int ret;
1241
1242    ret = bdrv_pwrite(bs, offset, buf, count);
1243    if (ret < 0) {
1244        return ret;
1245    }
1246
1247    /* No flush needed for cache modes that use O_DSYNC */
1248    if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1249        bdrv_flush(bs);
1250    }
1251
1252    return 0;
1253}
1254
1255/*
1256 * Handle a read request in coroutine context
1257 */
1258static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1259    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1260{
1261    BlockDriver *drv = bs->drv;
1262
1263    if (!drv) {
1264        return -ENOMEDIUM;
1265    }
1266    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1267        return -EIO;
1268    }
1269
1270    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1271}
1272
1273int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1274    int nb_sectors, QEMUIOVector *qiov)
1275{
1276    trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1277
1278    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov);
1279}
1280
1281/*
1282 * Handle a write request in coroutine context
1283 */
1284static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1285    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1286{
1287    BlockDriver *drv = bs->drv;
1288    int ret;
1289
1290    if (!bs->drv) {
1291        return -ENOMEDIUM;
1292    }
1293    if (bs->read_only) {
1294        return -EACCES;
1295    }
1296    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1297        return -EIO;
1298    }
1299
1300    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1301
1302    if (bs->dirty_bitmap) {
1303        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1304    }
1305
1306    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1307        bs->wr_highest_sector = sector_num + nb_sectors - 1;
1308    }
1309
1310    return ret;
1311}
1312
1313int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1314    int nb_sectors, QEMUIOVector *qiov)
1315{
1316    trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1317
1318    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov);
1319}
1320
1321/**
1322 * Truncate file to 'offset' bytes (needed only for file protocols)
1323 */
1324int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1325{
1326    BlockDriver *drv = bs->drv;
1327    int ret;
1328    if (!drv)
1329        return -ENOMEDIUM;
1330    if (!drv->bdrv_truncate)
1331        return -ENOTSUP;
1332    if (bs->read_only)
1333        return -EACCES;
1334    if (bdrv_in_use(bs))
1335        return -EBUSY;
1336    ret = drv->bdrv_truncate(bs, offset);
1337    if (ret == 0) {
1338        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1339        bdrv_dev_resize_cb(bs);
1340    }
1341    return ret;
1342}
1343
1344/**
1345 * Length of a allocated file in bytes. Sparse files are counted by actual
1346 * allocated space. Return < 0 if error or unknown.
1347 */
1348int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
1349{
1350    BlockDriver *drv = bs->drv;
1351    if (!drv) {
1352        return -ENOMEDIUM;
1353    }
1354    if (drv->bdrv_get_allocated_file_size) {
1355        return drv->bdrv_get_allocated_file_size(bs);
1356    }
1357    if (bs->file) {
1358        return bdrv_get_allocated_file_size(bs->file);
1359    }
1360    return -ENOTSUP;
1361}
1362
1363/**
1364 * Length of a file in bytes. Return < 0 if error or unknown.
1365 */
1366int64_t bdrv_getlength(BlockDriverState *bs)
1367{
1368    BlockDriver *drv = bs->drv;
1369    if (!drv)
1370        return -ENOMEDIUM;
1371
1372    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
1373        if (drv->bdrv_getlength) {
1374            return drv->bdrv_getlength(bs);
1375        }
1376    }
1377    return bs->total_sectors * BDRV_SECTOR_SIZE;
1378}
1379
1380/* return 0 as number of sectors if no device present or error */
1381void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
1382{
1383    int64_t length;
1384    length = bdrv_getlength(bs);
1385    if (length < 0)
1386        length = 0;
1387    else
1388        length = length >> BDRV_SECTOR_BITS;
1389    *nb_sectors_ptr = length;
1390}
1391
/*
 * Layout of one MSDOS partition table entry.  guess_disk_lchs() overlays an
 * array of these on the first sector at byte offset 0x1be and converts
 * nr_sects with le32_to_cpu().
 */
struct partition {
        uint8_t boot_ind;           /* 0x80 - active */
        uint8_t head;               /* starting head */
        uint8_t sector;             /* starting sector */
        uint8_t cyl;                /* starting cylinder */
        uint8_t sys_ind;            /* What partition type */
        uint8_t end_head;           /* end head */
        uint8_t end_sector;         /* end sector */
        uint8_t end_cyl;            /* end cylinder */
        uint32_t start_sect;        /* starting sector counting from 0 */
        uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;
1404
1405/* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1406static int guess_disk_lchs(BlockDriverState *bs,
1407                           int *pcylinders, int *pheads, int *psectors)
1408{
1409    uint8_t buf[BDRV_SECTOR_SIZE];
1410    int ret, i, heads, sectors, cylinders;
1411    struct partition *p;
1412    uint32_t nr_sects;
1413    uint64_t nb_sectors;
1414
1415    bdrv_get_geometry(bs, &nb_sectors);
1416
1417    ret = bdrv_read(bs, 0, buf, 1);
1418    if (ret < 0)
1419        return -1;
1420    /* test msdos magic */
1421    if (buf[510] != 0x55 || buf[511] != 0xaa)
1422        return -1;
1423    for(i = 0; i < 4; i++) {
1424        p = ((struct partition *)(buf + 0x1be)) + i;
1425        nr_sects = le32_to_cpu(p->nr_sects);
1426        if (nr_sects && p->end_head) {
1427            /* We make the assumption that the partition terminates on
1428               a cylinder boundary */
1429            heads = p->end_head + 1;
1430            sectors = p->end_sector & 63;
1431            if (sectors == 0)
1432                continue;
1433            cylinders = nb_sectors / (heads * sectors);
1434            if (cylinders < 1 || cylinders > 16383)
1435                continue;
1436            *pheads = heads;
1437            *psectors = sectors;
1438            *pcylinders = cylinders;
1439#if 0
1440            printf("guessed geometry: LCHS=%d %d %d\n",
1441                   cylinders, heads, sectors);
1442#endif
1443            return 0;
1444        }
1445    }
1446    return -1;
1447}
1448
1449void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
1450{
1451    int translation, lba_detected = 0;
1452    int cylinders, heads, secs;
1453    uint64_t nb_sectors;
1454
1455    /* if a geometry hint is available, use it */
1456    bdrv_get_geometry(bs, &nb_sectors);
1457    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
1458    translation = bdrv_get_translation_hint(bs);
1459    if (cylinders != 0) {
1460        *pcyls = cylinders;
1461        *pheads = heads;
1462        *psecs = secs;
1463    } else {
1464        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
1465            if (heads > 16) {
1466                /* if heads > 16, it means that a BIOS LBA
1467                   translation was active, so the default
1468                   hardware geometry is OK */
1469                lba_detected = 1;
1470                goto default_geometry;
1471            } else {
1472                *pcyls = cylinders;
1473                *pheads = heads;
1474                *psecs = secs;
1475                /* disable any translation to be in sync with
1476                   the logical geometry */
1477                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
1478                    bdrv_set_translation_hint(bs,
1479                                              BIOS_ATA_TRANSLATION_NONE);
1480                }
1481            }
1482        } else {
1483        default_geometry:
1484            /* if no geometry, use a standard physical disk geometry */
1485            cylinders = nb_sectors / (16 * 63);
1486
1487            if (cylinders > 16383)
1488                cylinders = 16383;
1489            else if (cylinders < 2)
1490                cylinders = 2;
1491            *pcyls = cylinders;
1492            *pheads = 16;
1493            *psecs = 63;
1494            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
1495                if ((*pcyls * *pheads) <= 131072) {
1496                    bdrv_set_translation_hint(bs,
1497                                              BIOS_ATA_TRANSLATION_LARGE);
1498                } else {
1499                    bdrv_set_translation_hint(bs,
1500                                              BIOS_ATA_TRANSLATION_LBA);
1501                }
1502            }
1503        }
1504        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
1505    }
1506}
1507
1508void bdrv_set_geometry_hint(BlockDriverState *bs,
1509                            int cyls, int heads, int secs)
1510{
1511    bs->cyls = cyls;
1512    bs->heads = heads;
1513    bs->secs = secs;
1514}
1515
1516void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
1517{
1518    bs->translation = translation;
1519}
1520
1521void bdrv_get_geometry_hint(BlockDriverState *bs,
1522                            int *pcyls, int *pheads, int *psecs)
1523{
1524    *pcyls = bs->cyls;
1525    *pheads = bs->heads;
1526    *psecs = bs->secs;
1527}
1528
/* Recognize floppy formats */
/*
 * One known floppy format.  bdrv_get_floppy_geometry_hint() computes the
 * total sector count of a format as
 * (max_head + 1) * max_track * last_sect.
 */
typedef struct FDFormat {
    /* drive type this format applies to */
    FDriveType drive;
    /* sectors per track */
    uint8_t last_sect;
    /* number of tracks */
    uint8_t max_track;
    /* highest head index (number of heads minus one) */
    uint8_t max_head;
} FDFormat;
1536
/*
 * Table of known floppy formats, searched in order by
 * bdrv_get_floppy_geometry_hint(); the first entry whose drive type and
 * total size match wins, so the order of the entries matters.  The table is
 * terminated by an entry with drive type FDRIVE_DRV_NONE.
 */
static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, },
    { FDRIVE_DRV_144, 20, 80, 1, },
    { FDRIVE_DRV_144, 21, 80, 1, },
    { FDRIVE_DRV_144, 21, 82, 1, },
    { FDRIVE_DRV_144, 21, 83, 1, },
    { FDRIVE_DRV_144, 22, 80, 1, },
    { FDRIVE_DRV_144, 23, 80, 1, },
    { FDRIVE_DRV_144, 24, 80, 1, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, },
    { FDRIVE_DRV_288, 39, 80, 1, },
    { FDRIVE_DRV_288, 40, 80, 1, },
    { FDRIVE_DRV_288, 44, 80, 1, },
    { FDRIVE_DRV_288, 48, 80, 1, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, },
    { FDRIVE_DRV_144, 10, 80, 1, },
    { FDRIVE_DRV_144, 10, 82, 1, },
    { FDRIVE_DRV_144, 10, 83, 1, },
    { FDRIVE_DRV_144, 13, 80, 1, },
    { FDRIVE_DRV_144, 14, 80, 1, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, },
    { FDRIVE_DRV_120, 18, 80, 1, },
    { FDRIVE_DRV_120, 18, 82, 1, },
    { FDRIVE_DRV_120, 18, 83, 1, },
    { FDRIVE_DRV_120, 20, 80, 1, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, },
    { FDRIVE_DRV_120, 11, 80, 1, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, },
    { FDRIVE_DRV_120,  9, 40, 0, },
    { FDRIVE_DRV_120, 10, 41, 1, },
    { FDRIVE_DRV_120, 10, 42, 1, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, },
    { FDRIVE_DRV_120,  8, 40, 0, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, },
    /* end */
    /* sentinel: only the FDRIVE_DRV_NONE drive type is tested by the lookup */
    { FDRIVE_DRV_NONE, -1, -1, 0, },
};
1583
1584void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
1585                                   int *max_track, int *last_sect,
1586                                   FDriveType drive_in, FDriveType *drive)
1587{
1588    const FDFormat *parse;
1589    uint64_t nb_sectors, size;
1590    int i, first_match, match;
1591
1592    bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
1593    if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
1594        /* User defined disk */
1595    } else {
1596        bdrv_get_geometry(bs, &nb_sectors);
1597        match = -1;
1598        first_match = -1;
1599        for (i = 0; ; i++) {
1600            parse = &fd_formats[i];
1601            if (parse->drive == FDRIVE_DRV_NONE) {
1602                break;
1603            }
1604            if (drive_in == parse->drive ||
1605                drive_in == FDRIVE_DRV_NONE) {
1606                size = (parse->max_head + 1) * parse->max_track *
1607                    parse->last_sect;
1608                if (nb_sectors == size) {
1609                    match = i;
1610                    break;
1611                }
1612                if (first_match == -1) {
1613                    first_match = i;
1614                }
1615            }
1616        }
1617        if (match == -1) {
1618            if (first_match == -1) {
1619                match = 1;
1620            } else {
1621                match = first_match;
1622            }
1623            parse = &fd_formats[match];
1624        }
1625        *nb_heads = parse->max_head + 1;
1626        *max_track = parse->max_track;
1627        *last_sect = parse->last_sect;
1628        *drive = parse->drive;
1629    }
1630}
1631
1632int bdrv_get_translation_hint(BlockDriverState *bs)
1633{
1634    return bs->translation;
1635}
1636
1637void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
1638                       BlockErrorAction on_write_error)
1639{
1640    bs->on_read_error = on_read_error;
1641    bs->on_write_error = on_write_error;
1642}
1643
1644BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
1645{
1646    return is_read ? bs->on_read_error : bs->on_write_error;
1647}
1648
1649int bdrv_is_read_only(BlockDriverState *bs)
1650{
1651    return bs->read_only;
1652}
1653
1654int bdrv_is_sg(BlockDriverState *bs)
1655{
1656    return bs->sg;
1657}
1658
1659int bdrv_enable_write_cache(BlockDriverState *bs)
1660{
1661    return bs->enable_write_cache;
1662}
1663
1664int bdrv_is_encrypted(BlockDriverState *bs)
1665{
1666    if (bs->backing_hd && bs->backing_hd->encrypted)
1667        return 1;
1668    return bs->encrypted;
1669}
1670
1671int bdrv_key_required(BlockDriverState *bs)
1672{
1673    BlockDriverState *backing_hd = bs->backing_hd;
1674
1675    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
1676        return 1;
1677    return (bs->encrypted && !bs->valid_key);
1678}
1679
1680int bdrv_set_key(BlockDriverState *bs, const char *key)
1681{
1682    int ret;
1683    if (bs->backing_hd && bs->backing_hd->encrypted) {
1684        ret = bdrv_set_key(bs->backing_hd, key);
1685        if (ret < 0)
1686            return ret;
1687        if (!bs->encrypted)
1688            return 0;
1689    }
1690    if (!bs->encrypted) {
1691        return -EINVAL;
1692    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
1693        return -ENOMEDIUM;
1694    }
1695    ret = bs->drv->bdrv_set_key(bs, key);
1696    if (ret < 0) {
1697        bs->valid_key = 0;
1698    } else if (!bs->valid_key) {
1699        bs->valid_key = 1;
1700        /* call the change callback now, we skipped it on open */
1701        bdrv_dev_change_media_cb(bs, true);
1702    }
1703    return ret;
1704}
1705
1706void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
1707{
1708    if (!bs->drv) {
1709        buf[0] = '\0';
1710    } else {
1711        pstrcpy(buf, buf_size, bs->drv->format_name);
1712    }
1713}
1714
1715void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
1716                         void *opaque)
1717{
1718    BlockDriver *drv;
1719
1720    QLIST_FOREACH(drv, &bdrv_drivers, list) {
1721        it(opaque, drv->format_name);
1722    }
1723}
1724
1725BlockDriverState *bdrv_find(const char *name)
1726{
1727    BlockDriverState *bs;
1728
1729    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1730        if (!strcmp(name, bs->device_name)) {
1731            return bs;
1732        }
1733    }
1734    return NULL;
1735}
1736
1737BlockDriverState *bdrv_next(BlockDriverState *bs)
1738{
1739    if (!bs) {
1740        return QTAILQ_FIRST(&bdrv_states);
1741    }
1742    return QTAILQ_NEXT(bs, list);
1743}
1744
1745void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
1746{
1747    BlockDriverState *bs;
1748
1749    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1750        it(opaque, bs);
1751    }
1752}
1753
1754const char *bdrv_get_device_name(BlockDriverState *bs)
1755{
1756    return bs->device_name;
1757}
1758
1759void bdrv_flush_all(void)
1760{
1761    BlockDriverState *bs;
1762
1763    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1764        if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
1765            bdrv_flush(bs);
1766        }
1767    }
1768}
1769
1770int bdrv_has_zero_init(BlockDriverState *bs)
1771{
1772    assert(bs->drv);
1773
1774    if (bs->drv->bdrv_has_zero_init) {
1775        return bs->drv->bdrv_has_zero_init(bs);
1776    }
1777
1778    return 1;
1779}
1780
1781/*
1782 * Returns true iff the specified sector is present in the disk image. Drivers
1783 * not implementing the functionality are assumed to not support backing files,
1784 * hence all their sectors are reported as allocated.
1785 *
1786 * 'pnum' is set to the number of sectors (including and immediately following
1787 * the specified sector) that are known to be in the same
1788 * allocated/unallocated state.
1789 *
1790 * 'nb_sectors' is the max value 'pnum' should be set to.
1791 */
1792int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1793        int *pnum)
1794{
1795    int64_t n;
1796    if (!bs->drv->bdrv_is_allocated) {
1797        if (sector_num >= bs->total_sectors) {
1798            *pnum = 0;
1799            return 0;
1800        }
1801        n = bs->total_sectors - sector_num;
1802        *pnum = (n < nb_sectors) ? (n) : (nb_sectors);
1803        return 1;
1804    }
1805    return bs->drv->bdrv_is_allocated(bs, sector_num, nb_sectors, pnum);
1806}
1807
1808void bdrv_mon_event(const BlockDriverState *bdrv,
1809                    BlockMonEventAction action, int is_read)
1810{
1811    QObject *data;
1812    const char *action_str;
1813
1814    switch (action) {
1815    case BDRV_ACTION_REPORT:
1816        action_str = "report";
1817        break;
1818    case BDRV_ACTION_IGNORE:
1819        action_str = "ignore";
1820        break;
1821    case BDRV_ACTION_STOP:
1822        action_str = "stop";
1823        break;
1824    default:
1825        abort();
1826    }
1827
1828    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1829                              bdrv->device_name,
1830                              action_str,
1831                              is_read ? "read" : "write");
1832    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1833
1834    qobject_decref(data);
1835}
1836
1837BlockInfoList *qmp_query_block(Error **errp)
1838{
1839    BlockInfoList *head = NULL, *cur_item = NULL;
1840    BlockDriverState *bs;
1841
1842    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1843        BlockInfoList *info = g_malloc0(sizeof(*info));
1844
1845        info->value = g_malloc0(sizeof(*info->value));
1846        info->value->device = g_strdup(bs->device_name);
1847        info->value->type = g_strdup("unknown");
1848        info->value->locked = bdrv_dev_is_medium_locked(bs);
1849        info->value->removable = bdrv_dev_has_removable_media(bs);
1850
1851        if (bdrv_dev_has_removable_media(bs)) {
1852            info->value->has_tray_open = true;
1853            info->value->tray_open = bdrv_dev_is_tray_open(bs);
1854        }
1855
1856        if (bdrv_iostatus_is_enabled(bs)) {
1857            info->value->has_io_status = true;
1858            info->value->io_status = bs->iostatus;
1859        }
1860
1861        if (bs->drv) {
1862            info->value->has_inserted = true;
1863            info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
1864            info->value->inserted->file = g_strdup(bs->filename);
1865            info->value->inserted->ro = bs->read_only;
1866            info->value->inserted->drv = g_strdup(bs->drv->format_name);
1867            info->value->inserted->encrypted = bs->encrypted;
1868            if (bs->backing_file[0]) {
1869                info->value->inserted->has_backing_file = true;
1870                info->value->inserted->backing_file = g_strdup(bs->backing_file);
1871            }
1872        }
1873
1874        /* XXX: waiting for the qapi to support GSList */
1875        if (!cur_item) {
1876            head = cur_item = info;
1877        } else {
1878            cur_item->next = info;
1879            cur_item = info;
1880        }
1881    }
1882
1883    return head;
1884}
1885
1886/* Consider exposing this as a full fledged QMP command */
1887static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
1888{
1889    BlockStats *s;
1890
1891    s = g_malloc0(sizeof(*s));
1892
1893    if (bs->device_name[0]) {
1894        s->has_device = true;
1895        s->device = g_strdup(bs->device_name);
1896    }
1897
1898    s->stats = g_malloc0(sizeof(*s->stats));
1899    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
1900    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
1901    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
1902    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
1903    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
1904    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
1905    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
1906    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
1907    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
1908
1909    if (bs->file) {
1910        s->has_parent = true;
1911        s->parent = qmp_query_blockstat(bs->file, NULL);
1912    }
1913
1914    return s;
1915}
1916
1917BlockStatsList *qmp_query_blockstats(Error **errp)
1918{
1919    BlockStatsList *head = NULL, *cur_item = NULL;
1920    BlockDriverState *bs;
1921
1922    QTAILQ_FOREACH(bs, &bdrv_states, list) {
1923        BlockStatsList *info = g_malloc0(sizeof(*info));
1924        info->value = qmp_query_blockstat(bs, NULL);
1925
1926        /* XXX: waiting for the qapi to support GSList */
1927        if (!cur_item) {
1928            head = cur_item = info;
1929        } else {
1930            cur_item->next = info;
1931            cur_item = info;
1932        }
1933    }
1934
1935    return head;
1936}
1937
1938const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
1939{
1940    if (bs->backing_hd && bs->backing_hd->encrypted)
1941        return bs->backing_file;
1942    else if (bs->encrypted)
1943        return bs->filename;
1944    else
1945        return NULL;
1946}
1947
1948void bdrv_get_backing_filename(BlockDriverState *bs,
1949                               char *filename, int filename_size)
1950{
1951    pstrcpy(filename, filename_size, bs->backing_file);
1952}
1953
1954int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1955                          const uint8_t *buf, int nb_sectors)
1956{
1957    BlockDriver *drv = bs->drv;
1958    if (!drv)
1959        return -ENOMEDIUM;
1960    if (!drv->bdrv_write_compressed)
1961        return -ENOTSUP;
1962    if (bdrv_check_request(bs, sector_num, nb_sectors))
1963        return -EIO;
1964
1965    if (bs->dirty_bitmap) {
1966        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1967    }
1968
1969    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
1970}
1971
1972int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1973{
1974    BlockDriver *drv = bs->drv;
1975    if (!drv)
1976        return -ENOMEDIUM;
1977    if (!drv->bdrv_get_info)
1978        return -ENOTSUP;
1979    memset(bdi, 0, sizeof(*bdi));
1980    return drv->bdrv_get_info(bs, bdi);
1981}
1982
1983int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1984                      int64_t pos, int size)
1985{
1986    BlockDriver *drv = bs->drv;
1987    if (!drv)
1988        return -ENOMEDIUM;
1989    if (drv->bdrv_save_vmstate)
1990        return drv->bdrv_save_vmstate(bs, buf, pos, size);
1991    if (bs->file)
1992        return bdrv_save_vmstate(bs->file, buf, pos, size);
1993    return -ENOTSUP;
1994}
1995
1996int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1997                      int64_t pos, int size)
1998{
1999    BlockDriver *drv = bs->drv;
2000    if (!drv)
2001        return -ENOMEDIUM;
2002    if (drv->bdrv_load_vmstate)
2003        return drv->bdrv_load_vmstate(bs, buf, pos, size);
2004    if (bs->file)
2005        return bdrv_load_vmstate(bs->file, buf, pos, size);
2006    return -ENOTSUP;
2007}
2008
2009void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2010{
2011    BlockDriver *drv = bs->drv;
2012
2013    if (!drv || !drv->bdrv_debug_event) {
2014        return;
2015    }
2016
2017    return drv->bdrv_debug_event(bs, event);
2018
2019}
2020
2021/**************************************************************/
2022/* handling of snapshots */
2023
2024int bdrv_can_snapshot(BlockDriverState *bs)
2025{
2026    BlockDriver *drv = bs->drv;
2027    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2028        return 0;
2029    }
2030
2031    if (!drv->bdrv_snapshot_create) {
2032        if (bs->file != NULL) {
2033            return bdrv_can_snapshot(bs->file);
2034        }
2035        return 0;
2036    }
2037
2038    return 1;
2039}
2040
2041int bdrv_is_snapshot(BlockDriverState *bs)
2042{
2043    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2044}
2045
2046BlockDriverState *bdrv_snapshots(void)
2047{
2048    BlockDriverState *bs;
2049
2050    if (bs_snapshots) {
2051        return bs_snapshots;
2052    }
2053
2054    bs = NULL;
2055    while ((bs = bdrv_next(bs))) {
2056        if (bdrv_can_snapshot(bs)) {
2057            bs_snapshots = bs;
2058            return bs;
2059        }
2060    }
2061    return NULL;
2062}
2063
2064int bdrv_snapshot_create(BlockDriverState *bs,
2065                         QEMUSnapshotInfo *sn_info)
2066{
2067    BlockDriver *drv = bs->drv;
2068    if (!drv)
2069        return -ENOMEDIUM;
2070    if (drv->bdrv_snapshot_create)
2071        return drv->bdrv_snapshot_create(bs, sn_info);
2072    if (bs->file)
2073        return bdrv_snapshot_create(bs->file, sn_info);
2074    return -ENOTSUP;
2075}
2076
2077int bdrv_snapshot_goto(BlockDriverState *bs,
2078                       const char *snapshot_id)
2079{
2080    BlockDriver *drv = bs->drv;
2081    int ret, open_ret;
2082
2083    if (!drv)
2084        return -ENOMEDIUM;
2085    if (drv->bdrv_snapshot_goto)
2086        return drv->bdrv_snapshot_goto(bs, snapshot_id);
2087
2088    if (bs->file) {
2089        drv->bdrv_close(bs);
2090        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2091        open_ret = drv->bdrv_open(bs, bs->open_flags);
2092        if (open_ret < 0) {
2093            bdrv_delete(bs->file);
2094            bs->drv = NULL;
2095            return open_ret;
2096        }
2097        return ret;
2098    }
2099
2100    return -ENOTSUP;
2101}
2102
2103int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2104{
2105    BlockDriver *drv = bs->drv;
2106    if (!drv)
2107        return -ENOMEDIUM;
2108    if (drv->bdrv_snapshot_delete)
2109        return drv->bdrv_snapshot_delete(bs, snapshot_id);
2110    if (bs->file)
2111        return bdrv_snapshot_delete(bs->file, snapshot_id);
2112    return -ENOTSUP;
2113}
2114
2115int bdrv_snapshot_list(BlockDriverState *bs,
2116                       QEMUSnapshotInfo **psn_info)
2117{
2118    BlockDriver *drv = bs->drv;
2119    if (!drv)
2120        return -ENOMEDIUM;
2121    if (drv->bdrv_snapshot_list)
2122        return drv->bdrv_snapshot_list(bs, psn_info);
2123    if (bs->file)
2124        return bdrv_snapshot_list(bs->file, psn_info);
2125    return -ENOTSUP;
2126}
2127
2128int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2129        const char *snapshot_name)
2130{
2131    BlockDriver *drv = bs->drv;
2132    if (!drv) {
2133        return -ENOMEDIUM;
2134    }
2135    if (!bs->read_only) {
2136        return -EINVAL;
2137    }
2138    if (drv->bdrv_snapshot_load_tmp) {
2139        return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2140    }
2141    return -ENOTSUP;
2142}
2143
#define NB_SUFFIXES 4

/*
 * Format @size into @buf (at most @buf_size bytes) in a compact form.
 *
 * Values up to 999 are printed as plain integers. Larger values are scaled
 * by powers of 1024 and tagged with K, M, G or T: one decimal place while
 * the scaled value is below 10, a rounded integer otherwise. Anything too
 * large for the other branches is expressed in T.
 *
 * Returns @buf.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t unit = 1024;
    int idx;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (idx = 0; idx < NB_SUFFIXES; idx++, unit *= 1024) {
        if (size < (10 * unit)) {
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / unit, suffixes[idx]);
            return buf;
        }
        /* the last suffix takes everything that is left */
        if (size < (1000 * unit) || idx == (NB_SUFFIXES - 1)) {
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     ((size + (unit >> 1)) / unit), suffixes[idx]);
            return buf;
        }
    }
    return buf;
}
2173
2174char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2175{
2176    char buf1[128], date_buf[128], clock_buf[128];
2177#ifdef _WIN32
2178    struct tm *ptm;
2179#else
2180    struct tm tm;
2181#endif
2182    time_t ti;
2183    int64_t secs;
2184
2185    if (!sn) {
2186        snprintf(buf, buf_size,
2187                 "%-10s%-20s%7s%20s%15s",
2188                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2189    } else {
2190        ti = sn->date_sec;
2191#ifdef _WIN32
2192        ptm = localtime(&ti);
2193        strftime(date_buf, sizeof(date_buf),
2194                 "%Y-%m-%d %H:%M:%S", ptm);
2195#else
2196        localtime_r(&ti, &tm);
2197        strftime(date_buf, sizeof(date_buf),
2198                 "%Y-%m-%d %H:%M:%S", &tm);
2199#endif
2200        secs = sn->vm_clock_nsec / 1000000000;
2201        snprintf(clock_buf, sizeof(clock_buf),
2202                 "%02d:%02d:%02d.%03d",
2203                 (int)(secs / 3600),
2204                 (int)((secs / 60) % 60),
2205                 (int)(secs % 60),
2206                 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2207        snprintf(buf, buf_size,
2208                 "%-10s%-20s%7s%20s%15s",
2209                 sn->id_str, sn->name,
2210                 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2211                 date_buf,
2212                 clock_buf);
2213    }
2214    return buf;
2215}
2216
2217/**************************************************************/
2218/* async I/Os */
2219
2220BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2221                                 QEMUIOVector *qiov, int nb_sectors,
2222                                 BlockDriverCompletionFunc *cb, void *opaque)
2223{
2224    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2225
2226    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2227                                 cb, opaque, false);
2228}
2229
2230BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2231                                  QEMUIOVector *qiov, int nb_sectors,
2232                                  BlockDriverCompletionFunc *cb, void *opaque)
2233{
2234    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2235
2236    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2237                                 cb, opaque, true);
2238}
2239
2240
/*
 * Bookkeeping shared by all requests of one bdrv_aio_multiwrite() call.
 */
typedef struct MultiwriteCB {
    /* first negative completion code seen by multiwrite_cb(), else 0 */
    int error;
    /* completions still outstanding; the user callbacks run when it hits 0 */
    int num_requests;
    /* number of entries in callbacks[] (one per original request) */
    int num_callbacks;
    struct {
        /* completion callback and argument of the original request */
        BlockDriverCompletionFunc *cb;
        void *opaque;
        /* merged I/O vector to destroy and free after completion, or NULL */
        QEMUIOVector *free_qiov;
        /* zero-fill buffer to free after completion, or NULL */
        void *free_buf;
    } callbacks[];
} MultiwriteCB;
2252
2253static void multiwrite_user_cb(MultiwriteCB *mcb)
2254{
2255    int i;
2256
2257    for (i = 0; i < mcb->num_callbacks; i++) {
2258        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2259        if (mcb->callbacks[i].free_qiov) {
2260            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2261        }
2262        g_free(mcb->callbacks[i].free_qiov);
2263        qemu_vfree(mcb->callbacks[i].free_buf);
2264    }
2265}
2266
2267static void multiwrite_cb(void *opaque, int ret)
2268{
2269    MultiwriteCB *mcb = opaque;
2270
2271    trace_multiwrite_cb(mcb, ret);
2272
2273    if (ret < 0 && !mcb->error) {
2274        mcb->error = ret;
2275    }
2276
2277    mcb->num_requests--;
2278    if (mcb->num_requests == 0) {
2279        multiwrite_user_cb(mcb);
2280        g_free(mcb);
2281    }
2282}
2283
2284static int multiwrite_req_compare(const void *a, const void *b)
2285{
2286    const BlockRequest *req1 = a, *req2 = b;
2287
2288    /*
2289     * Note that we can't simply subtract req2->sector from req1->sector
2290     * here as that could overflow the return value.
2291     */
2292    if (req1->sector > req2->sector) {
2293        return 1;
2294    } else if (req1->sector < req2->sector) {
2295        return -1;
2296    } else {
2297        return 0;
2298    }
2299}
2300
/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 *
 * @reqs is sorted by start sector and then compacted in place: the surviving
 * requests occupy reqs[0 .. return value - 1]. For every merge a new
 * QEMUIOVector (and, when a gap is zero-filled, a bounce buffer) is
 * allocated and recorded in @mcb->callbacks[i] so that multiwrite_user_cb()
 * can release it after completion.
 *
 * NOTE(review): when reqs[i] lies entirely inside reqs[outidx], the merged
 * vector ends where reqs[i] ends, so the tail of the first request looks
 * like it is dropped - confirm whether callers can submit such requests.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    // outidx is the request currently being accumulated into; i scans ahead.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        // First sector after the end of the accumulated request
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // This handles the cases that are valid for all block drivers, namely
        // exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        // The block driver may decide that it makes sense to combine requests
        // even if there is a gap of some sectors between them. In this case,
        // the gap is filled with zeros (therefore only applicable for yet
        // unused space in format like qcow2).
        if (!merge && bs->drv->bdrv_merge_requests) {
            merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
        }

        // Never build a vector with more than IOV_MAX elements; the +1
        // accounts for a possible zero-fill element.
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, size);

            // We might need to add some zeros between the two requests
            if (reqs[i].sector > oldreq_last) {
                size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
                uint8_t *buf = qemu_blockalign(bs, zero_bytes);
                memset(buf, 0, zero_bytes);
                qemu_iovec_add(qiov, buf, zero_bytes);
                mcb->callbacks[i].free_buf = buf;
            }

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            // Remember the merged vector so it is freed on completion
            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            // No merge: reqs[i] becomes the next accumulated request
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
2375
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 *
 * With num_reqs == 0 (and a driver attached) nothing is submitted and 0 is
 * returned. Without a driver every request's error field is set to
 * -ENOMEDIUM and -1 is returned.
 *
 * NOTE(review): the early-fail path below only frees mcb; merged vectors and
 * zero-fill buffers recorded by multiwrite_merge() do not appear to be
 * released there - confirm whether that path is reachable.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    BlockDriverAIOCB *acb;
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    // Remember each caller callback before merging rewrites the reqs array
    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /*
     * Run the aio requests. As soon as one request can't be submitted
     * successfully, fail all requests that are not yet submitted (we must
     * return failure for all requests anyway)
     *
     * num_requests cannot be set to the right value immediately: If
     * bdrv_aio_writev fails for some request, num_requests would be too high
     * and therefore multiwrite_cb() would never recognize the multiwrite
     * request as completed. We also cannot use the loop variable i to set it
     * when the first request fails because the callback may already have been
     * called for previously submitted requests. Thus, num_requests must be
     * incremented for each request that is submitted.
     *
     * The problem that callbacks may be called early also means that we need
     * to take care that num_requests doesn't become 0 before all requests are
     * submitted - multiwrite_cb() would consider the multiwrite request
     * completed. A dummy request that is "completed" by a manual call to
     * multiwrite_cb() takes care of this.
     */
    mcb->num_requests = 1;

    // Run the aio requests
    for (i = 0; i < num_reqs; i++) {
        mcb->num_requests++;
        acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);

        if (acb == NULL) {
            // We can only fail the whole thing if no request has been
            // submitted yet. Otherwise we'll wait for the submitted AIOs to
            // complete and report the error in the callback.
            if (i == 0) {
                trace_bdrv_aio_multiwrite_earlyfail(mcb);
                goto fail;
            } else {
                trace_bdrv_aio_multiwrite_latefail(mcb, i);
                // Account for the request that was counted but not submitted
                multiwrite_cb(mcb, -EIO);
                break;
            }
        }
    }

    /* Complete the dummy request */
    multiwrite_cb(mcb, 0);

    return 0;

fail:
    // Nothing was submitted: flag every original request and drop mcb
    for (i = 0; i < mcb->num_callbacks; i++) {
        reqs[i].error = -EIO;
    }
    g_free(mcb);
    return -1;
}
2477
2478void bdrv_aio_cancel(BlockDriverAIOCB *acb)
2479{
2480    acb->pool->cancel(acb);
2481}
2482
2483
2484/**************************************************************/
2485/* async block device emulation */
2486
/*
 * AIOCB used by bdrv_aio_rw_vector(), which performs the I/O through the
 * driver's bdrv_read/bdrv_write callbacks and reports completion from a
 * bottom half.
 */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    /* bottom half that runs bdrv_aio_bh_cb() */
    QEMUBH *bh;
    /* result of the bdrv_read/bdrv_write call */
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    /* linear buffer the I/O is actually done on */
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;
2496
2497static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
2498{
2499    BlockDriverAIOCBSync *acb =
2500        container_of(blockacb, BlockDriverAIOCBSync, common);
2501    qemu_bh_delete(acb->bh);
2502    acb->bh = NULL;
2503    qemu_aio_release(acb);
2504}
2505
/* AIOCB pool for requests created by bdrv_aio_rw_vector(). */
static AIOPool bdrv_em_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};
2510
2511static void bdrv_aio_bh_cb(void *opaque)
2512{
2513    BlockDriverAIOCBSync *acb = opaque;
2514
2515    if (!acb->is_write)
2516        qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
2517    qemu_vfree(acb->bounce);
2518    acb->common.cb(acb->common.opaque, acb->ret);
2519    qemu_bh_delete(acb->bh);
2520    acb->bh = NULL;
2521    qemu_aio_release(acb);
2522}
2523
2524static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
2525                                            int64_t sector_num,
2526                                            QEMUIOVector *qiov,
2527                                            int nb_sectors,
2528                                            BlockDriverCompletionFunc *cb,
2529                                            void *opaque,
2530                                            int is_write)
2531
2532{
2533    BlockDriverAIOCBSync *acb;
2534
2535    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
2536    acb->is_write = is_write;
2537    acb->qiov = qiov;
2538    acb->bounce = qemu_blockalign(bs, qiov->size);
2539
2540    if (!acb->bh)
2541        acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
2542
2543    if (is_write) {
2544        qemu_iovec_to_buffer(acb->qiov, acb->bounce);
2545        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
2546    } else {
2547        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
2548    }
2549
2550    qemu_bh_schedule(acb->bh);
2551
2552    return &acb->common;
2553}
2554
2555static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
2556        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2557        BlockDriverCompletionFunc *cb, void *opaque)
2558{
2559    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
2560}
2561
2562static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
2563        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
2564        BlockDriverCompletionFunc *cb, void *opaque)
2565{
2566    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
2567}
2568
2569
/*
 * AIOCB for requests that are executed inside a coroutine and completed
 * from a bottom half (bdrv_co_em_bh).
 */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    /* request parameters; req.error receives the result */
    BlockRequest req;
    bool is_write;
    /* bottom half that delivers the completion callback */
    QEMUBH* bh;
} BlockDriverAIOCBCoroutine;
2576
2577static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
2578{
2579    qemu_aio_flush();
2580}
2581
/* AIOCB pool for the coroutine-based AIO emulation in this file. */
static AIOPool bdrv_em_co_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};
2586
2587static void bdrv_co_em_bh(void *opaque)
2588{
2589    BlockDriverAIOCBCoroutine *acb = opaque;
2590
2591    acb->common.cb(acb->common.opaque, acb->req.error);
2592    qemu_bh_delete(acb->bh);
2593    qemu_aio_release(acb);
2594}
2595
2596/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
2597static void coroutine_fn bdrv_co_do_rw(void *opaque)
2598{
2599    BlockDriverAIOCBCoroutine *acb = opaque;
2600    BlockDriverState *bs = acb->common.bs;
2601
2602    if (!acb->is_write) {
2603        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
2604            acb->req.nb_sectors, acb->req.qiov);
2605    } else {
2606        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
2607            acb->req.nb_sectors, acb->req.qiov);
2608    }
2609
2610    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
2611    qemu_bh_schedule(acb->bh);
2612}
2613
2614static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
2615                                               int64_t sector_num,
2616                                               QEMUIOVector *qiov,
2617                                               int nb_sectors,
2618                                               BlockDriverCompletionFunc *cb,
2619                                               void *opaque,
2620                                               bool is_write)
2621{
2622    Coroutine *co;
2623    BlockDriverAIOCBCoroutine *acb;
2624
2625    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
2626    acb->req.sector = sector_num;
2627    acb->req.nb_sectors = nb_sectors;
2628    acb->req.qiov = qiov;
2629    acb->is_write = is_write;
2630
2631    co = qemu_coroutine_create(bdrv_co_do_rw);
2632    qemu_coroutine_enter(co, acb);
2633
2634    return &acb->common;
2635}
2636
2637static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
2638{
2639    BlockDriverAIOCBCoroutine *acb = opaque;
2640    BlockDriverState *bs = acb->common.bs;
2641
2642    acb->req.error = bdrv_co_flush(bs);
2643    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
2644    qemu_bh_schedule(acb->bh);
2645}
2646
2647BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2648        BlockDriverCompletionFunc *cb, void *opaque)
2649{
2650    trace_bdrv_aio_flush(bs, opaque);
2651
2652    Coroutine *co;
2653    BlockDriverAIOCBCoroutine *acb;
2654
2655    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
2656    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
2657    qemu_coroutine_enter(co, acb);
2658
2659    return &acb->common;
2660}
2661
2662static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
2663{
2664    BlockDriverAIOCBCoroutine *acb = opaque;
2665    BlockDriverState *bs = acb->common.bs;
2666
2667    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
2668    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
2669    qemu_bh_schedule(acb->bh);
2670}
2671
2672BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
2673        int64_t sector_num, int nb_sectors,
2674        BlockDriverCompletionFunc *cb, void *opaque)
2675{
2676    Coroutine *co;
2677    BlockDriverAIOCBCoroutine *acb;
2678
2679    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
2680
2681    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
2682    acb->req.sector = sector_num;
2683    acb->req.nb_sectors = nb_sectors;
2684    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
2685    qemu_coroutine_enter(co, acb);
2686
2687    return &acb->common;
2688}
2689
/* Run the module initializers registered for MODULE_INIT_BLOCK. */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
2694
/* Same as bdrv_init(), but sets use_bdrv_whitelist first. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
2700
2701void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
2702                   BlockDriverCompletionFunc *cb, void *opaque)
2703{
2704    BlockDriverAIOCB *acb;
2705
2706    if (pool->free_aiocb) {
2707        acb = pool->free_aiocb;
2708        pool->free_aiocb = acb->next;
2709    } else {
2710        acb = g_malloc0(pool->aiocb_size);
2711        acb->pool = pool;
2712    }
2713    acb->bs = bs;
2714    acb->cb = cb;
2715    acb->opaque = opaque;
2716    return acb;
2717}
2718
2719void qemu_aio_release(void *p)
2720{
2721    BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
2722    AIOPool *pool = acb->pool;
2723    acb->next = pool->free_aiocb;
2724    pool->free_aiocb = acb;
2725}
2726
2727/**************************************************************/
2728/* Coroutine block device emulation */
2729
/*
 * Links an AIO completion callback to the coroutine that waits for it; see
 * bdrv_co_io_em_complete().
 */
typedef struct CoroutineIOCompletion {
    /* coroutine to re-enter on completion */
    Coroutine *coroutine;
    /* completion code passed to the callback */
    int ret;
} CoroutineIOCompletion;
2734
2735static void bdrv_co_io_em_complete(void *opaque, int ret)
2736{
2737    CoroutineIOCompletion *co = opaque;
2738
2739    co->ret = ret;
2740    qemu_coroutine_enter(co->coroutine, NULL);
2741}
2742
2743static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
2744                                      int nb_sectors, QEMUIOVector *iov,
2745                                      bool is_write)
2746{
2747    CoroutineIOCompletion co = {
2748        .coroutine = qemu_coroutine_self(),
2749    };
2750    BlockDriverAIOCB *acb;
2751
2752    if (is_write) {
2753        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
2754                                       bdrv_co_io_em_complete, &co);
2755    } else {
2756        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
2757                                      bdrv_co_io_em_complete, &co);
2758    }
2759
2760    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
2761    if (!acb) {
2762        return -EIO;
2763    }
2764    qemu_coroutine_yield();
2765
2766    return co.ret;
2767}
2768
2769static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
2770                                         int64_t sector_num, int nb_sectors,
2771                                         QEMUIOVector *iov)
2772{
2773    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
2774}
2775
2776static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
2777                                         int64_t sector_num, int nb_sectors,
2778                                         QEMUIOVector *iov)
2779{
2780    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
2781}
2782
2783static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2784{
2785    RwCo *rwco = opaque;
2786
2787    rwco->ret = bdrv_co_flush(rwco->bs);
2788}
2789
2790int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2791{
2792    int ret;
2793
2794    if (!bs->drv) {
2795        return 0;
2796    }
2797
2798    /* Write back cached data to the OS even with cache=unsafe */
2799    if (bs->drv->bdrv_co_flush_to_os) {
2800        ret = bs->drv->bdrv_co_flush_to_os(bs);
2801        if (ret < 0) {
2802            return ret;
2803        }
2804    }
2805
2806    /* But don't actually force it to the disk with cache=unsafe */
2807    if (bs->open_flags & BDRV_O_NO_FLUSH) {
2808        return 0;
2809    }
2810
2811    if (bs->drv->bdrv_co_flush_to_disk) {
2812        return bs->drv->bdrv_co_flush_to_disk(bs);
2813    } else if (bs->drv->bdrv_aio_flush) {
2814        BlockDriverAIOCB *acb;
2815        CoroutineIOCompletion co = {
2816            .coroutine = qemu_coroutine_self(),
2817        };
2818
2819        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2820        if (acb == NULL) {
2821            return -EIO;
2822        } else {
2823            qemu_coroutine_yield();
2824            return co.ret;
2825        }
2826    } else {
2827        /*
2828         * Some block drivers always operate in either writethrough or unsafe
2829         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
2830         * know how the server works (because the behaviour is hardcoded or
2831         * depends on server-side configuration), so we can't ensure that
2832         * everything is safe on disk. Returning an error doesn't work because
2833         * that would break guests even if the server operates in writethrough
2834         * mode.
2835         *
2836         * Let's hope the user knows what he's doing.
2837         */
2838        return 0;
2839    }
2840}
2841
2842void bdrv_invalidate_cache(BlockDriverState *bs)
2843{
2844    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
2845        bs->drv->bdrv_invalidate_cache(bs);
2846    }
2847}
2848
2849void bdrv_invalidate_cache_all(void)
2850{
2851    BlockDriverState *bs;
2852
2853    QTAILQ_FOREACH(bs, &bdrv_states, list) {
2854        bdrv_invalidate_cache(bs);
2855    }
2856}
2857
2858int bdrv_flush(BlockDriverState *bs)
2859{
2860    Coroutine *co;
2861    RwCo rwco = {
2862        .bs = bs,
2863        .ret = NOT_DONE,
2864    };
2865
2866    if (qemu_in_coroutine()) {
2867        /* Fast-path if already in coroutine context */
2868        bdrv_flush_co_entry(&rwco);
2869    } else {
2870        co = qemu_coroutine_create(bdrv_flush_co_entry);
2871        qemu_coroutine_enter(co, &rwco);
2872        while (rwco.ret == NOT_DONE) {
2873            qemu_aio_wait();
2874        }
2875    }
2876
2877    return rwco.ret;
2878}
2879
2880static void coroutine_fn bdrv_discard_co_entry(void *opaque)
2881{
2882    RwCo *rwco = opaque;
2883
2884    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
2885}
2886
2887int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
2888                                 int nb_sectors)
2889{
2890    if (!bs->drv) {
2891        return -ENOMEDIUM;
2892    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2893        return -EIO;
2894    } else if (bs->read_only) {
2895        return -EROFS;
2896    } else if (bs->drv->bdrv_co_discard) {
2897        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
2898    } else if (bs->drv->bdrv_aio_discard) {
2899        BlockDriverAIOCB *acb;
2900        CoroutineIOCompletion co = {
2901            .coroutine = qemu_coroutine_self(),
2902        };
2903
2904        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
2905                                        bdrv_co_io_em_complete, &co);
2906        if (acb == NULL) {
2907            return -EIO;
2908        } else {
2909            qemu_coroutine_yield();
2910            return co.ret;
2911        }
2912    } else {
2913        return 0;
2914    }
2915}
2916
2917int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
2918{
2919    Coroutine *co;
2920    RwCo rwco = {
2921        .bs = bs,
2922        .sector_num = sector_num,
2923        .nb_sectors = nb_sectors,
2924        .ret = NOT_DONE,
2925    };
2926
2927    if (qemu_in_coroutine()) {
2928        /* Fast-path if already in coroutine context */
2929        bdrv_discard_co_entry(&rwco);
2930    } else {
2931        co = qemu_coroutine_create(bdrv_discard_co_entry);
2932        qemu_coroutine_enter(co, &rwco);
2933        while (rwco.ret == NOT_DONE) {
2934            qemu_aio_wait();
2935        }
2936    }
2937
2938    return rwco.ret;
2939}
2940
2941/**************************************************************/
2942/* removable device support */
2943
2944/**
2945 * Return TRUE if the media is present
2946 */
2947int bdrv_is_inserted(BlockDriverState *bs)
2948{
2949    BlockDriver *drv = bs->drv;
2950
2951    if (!drv)
2952        return 0;
2953    if (!drv->bdrv_is_inserted)
2954        return 1;
2955    return drv->bdrv_is_inserted(bs);
2956}
2957
2958/**
2959 * Return whether the media changed since the last call to this
2960 * function, or -ENOTSUP if we don't know.  Most drivers don't know.
2961 */
2962int bdrv_media_changed(BlockDriverState *bs)
2963{
2964    BlockDriver *drv = bs->drv;
2965
2966    if (drv && drv->bdrv_media_changed) {
2967        return drv->bdrv_media_changed(bs);
2968    }
2969    return -ENOTSUP;
2970}
2971
2972/**
2973 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
2974 */
2975void bdrv_eject(BlockDriverState *bs, int eject_flag)
2976{
2977    BlockDriver *drv = bs->drv;
2978
2979    if (drv && drv->bdrv_eject) {
2980        drv->bdrv_eject(bs, eject_flag);
2981    }
2982}
2983
2984/**
2985 * Lock or unlock the media (if it is locked, the user won't be able
2986 * to eject it manually).
2987 */
2988void bdrv_lock_medium(BlockDriverState *bs, bool locked)
2989{
2990    BlockDriver *drv = bs->drv;
2991
2992    trace_bdrv_lock_medium(bs, locked);
2993
2994    if (drv && drv->bdrv_lock_medium) {
2995        drv->bdrv_lock_medium(bs, locked);
2996    }
2997}
2998
2999/* needed for generic scsi interface */
3000
3001int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3002{
3003    BlockDriver *drv = bs->drv;
3004
3005    if (drv && drv->bdrv_ioctl)
3006        return drv->bdrv_ioctl(bs, req, buf);
3007    return -ENOTSUP;
3008}
3009
3010BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3011        unsigned long int req, void *buf,
3012        BlockDriverCompletionFunc *cb, void *opaque)
3013{
3014    BlockDriver *drv = bs->drv;
3015
3016    if (drv && drv->bdrv_aio_ioctl)
3017        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3018    return NULL;
3019}
3020
/*
 * Record @align as the buffer alignment of @bs; qemu_blockalign() uses it
 * when it is non-zero.
 */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}
3025
3026void *qemu_blockalign(BlockDriverState *bs, size_t size)
3027{
3028    return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3029}
3030
3031void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3032{
3033    int64_t bitmap_size;
3034
3035    bs->dirty_count = 0;
3036    if (enable) {
3037        if (!bs->dirty_bitmap) {
3038            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3039                    BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3040            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3041
3042            bs->dirty_bitmap = g_malloc0(bitmap_size);
3043        }
3044    } else {
3045        if (bs->dirty_bitmap) {
3046            g_free(bs->dirty_bitmap);
3047            bs->dirty_bitmap = NULL;
3048        }
3049    }
3050}
3051
3052int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3053{
3054    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3055
3056    if (bs->dirty_bitmap &&
3057        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3058        return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3059            (1UL << (chunk % (sizeof(unsigned long) * 8))));
3060    } else {
3061        return 0;
3062    }
3063}
3064
3065void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3066                      int nr_sectors)
3067{
3068    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3069}
3070
3071int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3072{
3073    return bs->dirty_count;
3074}
3075
3076void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3077{
3078    assert(bs->in_use != in_use);
3079    bs->in_use = in_use;
3080}
3081
3082int bdrv_in_use(BlockDriverState *bs)
3083{
3084    return bs->in_use;
3085}
3086
3087void bdrv_iostatus_enable(BlockDriverState *bs)
3088{
3089    bs->iostatus_enabled = true;
3090    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3091}
3092
3093/* The I/O status is only enabled if the drive explicitly
3094 * enables it _and_ the VM is configured to stop on errors */
3095bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3096{
3097    return (bs->iostatus_enabled &&
3098           (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3099            bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
3100            bs->on_read_error == BLOCK_ERR_STOP_ANY));
3101}
3102
3103void bdrv_iostatus_disable(BlockDriverState *bs)
3104{
3105    bs->iostatus_enabled = false;
3106}
3107
3108void bdrv_iostatus_reset(BlockDriverState *bs)
3109{
3110    if (bdrv_iostatus_is_enabled(bs)) {
3111        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3112    }
3113}
3114
3115/* XXX: Today this is set by device models because it makes the implementation
3116   quite simple. However, the block layer knows about the error, so it's
3117   possible to implement this without device models being involved */
3118void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3119{
3120    if (bdrv_iostatus_is_enabled(bs) &&
3121        bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
3122        assert(error >= 0);
3123        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3124                                         BLOCK_DEVICE_IO_STATUS_FAILED;
3125    }
3126}
3127
3128void
3129bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3130        enum BlockAcctType type)
3131{
3132    assert(type < BDRV_MAX_IOTYPE);
3133
3134    cookie->bytes = bytes;
3135    cookie->start_time_ns = get_clock();
3136    cookie->type = type;
3137}
3138
3139void
3140bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3141{
3142    assert(cookie->type < BDRV_MAX_IOTYPE);
3143
3144    bs->nr_bytes[cookie->type] += cookie->bytes;
3145    bs->nr_ops[cookie->type]++;
3146    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
3147}
3148
3149int bdrv_img_create(const char *filename, const char *fmt,
3150                    const char *base_filename, const char *base_fmt,
3151                    char *options, uint64_t img_size, int flags)
3152{
3153    QEMUOptionParameter *param = NULL, *create_options = NULL;
3154    QEMUOptionParameter *backing_fmt, *backing_file, *size;
3155    BlockDriverState *bs = NULL;
3156    BlockDriver *drv, *proto_drv;
3157    BlockDriver *backing_drv = NULL;
3158    int ret = 0;
3159
3160    /* Find driver and parse its options */
3161    drv = bdrv_find_format(fmt);
3162    if (!drv) {
3163        error_report("Unknown file format '%s'", fmt);
3164        ret = -EINVAL;
3165        goto out;
3166    }
3167
3168    proto_drv = bdrv_find_protocol(filename);
3169    if (!proto_drv) {
3170        error_report("Unknown protocol '%s'", filename);
3171        ret = -EINVAL;
3172        goto out;
3173    }
3174
3175    create_options = append_option_parameters(create_options,
3176                                              drv->create_options);
3177    create_options = append_option_parameters(create_options,
3178                                              proto_drv->create_options);
3179
3180    /* Create parameter list with default values */
3181    param = parse_option_parameters("", create_options, param);
3182
3183    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3184
3185    /* Parse -o options */
3186    if (options) {
3187        param = parse_option_parameters(options, create_options, param);
3188        if (param == NULL) {
3189            error_report("Invalid options for file format '%s'.", fmt);
3190            ret = -EINVAL;
3191            goto out;
3192        }
3193    }
3194
3195    if (base_filename) {
3196        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3197                                 base_filename)) {
3198            error_report("Backing file not supported for file format '%s'",
3199                         fmt);
3200            ret = -EINVAL;
3201            goto out;
3202        }
3203    }
3204
3205    if (base_fmt) {
3206        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3207            error_report("Backing file format not supported for file "
3208                         "format '%s'", fmt);
3209            ret = -EINVAL;
3210            goto out;
3211        }
3212    }
3213
3214    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
3215    if (backing_file && backing_file->value.s) {
3216        if (!strcmp(filename, backing_file->value.s)) {
3217            error_report("Error: Trying to create an image with the "
3218                         "same filename as the backing file");
3219            ret = -EINVAL;
3220            goto out;
3221        }
3222    }
3223
3224    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
3225    if (backing_fmt && backing_fmt->value.s) {
3226        backing_drv = bdrv_find_format(backing_fmt->value.s);
3227        if (!backing_drv) {
3228            error_report("Unknown backing file format '%s'",
3229                         backing_fmt->value.s);
3230            ret = -EINVAL;
3231            goto out;
3232        }
3233    }
3234
3235    // The size for the image must always be specified, with one exception:
3236    // If we are using a backing file, we can obtain the size from there
3237    size = get_option_parameter(param, BLOCK_OPT_SIZE);
3238    if (size && size->value.n == -1) {
3239        if (backing_file && backing_file->value.s) {
3240            uint64_t size;
3241            char buf[32];
3242
3243            bs = bdrv_new("");
3244
3245            ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
3246            if (ret < 0) {
3247                error_report("Could not open '%s'", backing_file->value.s);
3248                goto out;
3249            }
3250            bdrv_get_geometry(bs, &size);
3251            size *= 512;
3252
3253            snprintf(buf, sizeof(buf), "%" PRId64, size);
3254            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
3255        } else {
3256            error_report("Image creation needs a size parameter");
3257            ret = -EINVAL;
3258            goto out;
3259        }
3260    }
3261
3262    printf("Formatting '%s', fmt=%s ", filename, fmt);
3263    print_option_parameters(param);
3264    puts("");
3265
3266    ret = bdrv_create(drv, filename, param);
3267
3268    if (ret < 0) {
3269        if (ret == -ENOTSUP) {
3270            error_report("Formatting or formatting option not supported for "
3271                         "file format '%s'", fmt);
3272        } else if (ret == -EFBIG) {
3273            error_report("The image size is too large for file format '%s'",
3274                         fmt);
3275        } else {
3276            error_report("%s: error while creating %s: %s", filename, fmt,
3277                         strerror(-ret));
3278        }
3279    }
3280
3281out:
3282    free_option_parameters(create_options);
3283    free_option_parameters(param);
3284
3285    if (bs) {
3286        bdrv_delete(bs);
3287    }
3288
3289    return ret;
3290}
3291