qemu/block/raw-posix.c
<<
>>
Prefs
   1/*
   2 * Block driver for RAW files (posix)
   3 *
   4 * Copyright (c) 2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24#include "qemu/osdep.h"
  25#include "qapi/error.h"
  26#include "qemu/cutils.h"
  27#include "qemu/error-report.h"
  28#include "qemu/timer.h"
  29#include "qemu/log.h"
  30#include "block/block_int.h"
  31#include "qemu/module.h"
  32#include "trace.h"
  33#include "block/thread-pool.h"
  34#include "qemu/iov.h"
  35#include "raw-aio.h"
  36#include "qapi/util.h"
  37#include "qapi/qmp/qstring.h"
  38
  39#if defined(__APPLE__) && (__MACH__)
  40#include <paths.h>
  41#include <sys/param.h>
  42#include <IOKit/IOKitLib.h>
  43#include <IOKit/IOBSD.h>
  44#include <IOKit/storage/IOMediaBSDClient.h>
  45#include <IOKit/storage/IOMedia.h>
  46#include <IOKit/storage/IOCDMedia.h>
  47//#include <IOKit/storage/IOCDTypes.h>
  48#include <IOKit/storage/IODVDMedia.h>
  49#include <CoreFoundation/CoreFoundation.h>
  50#endif
  51
  52#ifdef __sun__
  53#define _POSIX_PTHREAD_SEMANTICS 1
  54#include <sys/dkio.h>
  55#endif
  56#ifdef __linux__
  57#include <sys/ioctl.h>
  58#include <sys/param.h>
  59#include <linux/cdrom.h>
  60#include <linux/fd.h>
  61#include <linux/fs.h>
  62#include <linux/hdreg.h>
  63#include <scsi/sg.h>
  64#ifdef __s390__
  65#include <asm/dasd.h>
  66#endif
  67#ifndef FS_NOCOW_FL
  68#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
  69#endif
  70#endif
  71#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
  72#include <linux/falloc.h>
  73#endif
  74#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
  75#include <sys/disk.h>
  76#include <sys/cdio.h>
  77#endif
  78
  79#ifdef __OpenBSD__
  80#include <sys/ioctl.h>
  81#include <sys/disklabel.h>
  82#include <sys/dkio.h>
  83#endif
  84
  85#ifdef __NetBSD__
  86#include <sys/ioctl.h>
  87#include <sys/disklabel.h>
  88#include <sys/dkio.h>
  89#include <sys/disk.h>
  90#endif
  91
  92#ifdef __DragonFly__
  93#include <sys/ioctl.h>
  94#include <sys/diskslice.h>
  95#endif
  96
  97#ifdef CONFIG_XFS
  98#include <xfs/xfs.h>
  99#endif
 100
 101//#define DEBUG_BLOCK
 102
/*
 * Debug tracing helper.
 *
 * DEBUG_BLOCK_PRINT is 1 when DEBUG_BLOCK is defined and 0 otherwise.
 * DPRINTF() always expands to a printf() call guarded by that constant, so
 * the format string and arguments are still seen by the compiler even when
 * debugging output is disabled.
 */
#ifdef DEBUG_BLOCK
# define DEBUG_BLOCK_PRINT 1
#else
# define DEBUG_BLOCK_PRINT 0
#endif
#define DPRINTF(fmt, ...) \
do { \
    if (DEBUG_BLOCK_PRINT) { \
        printf(fmt, ## __VA_ARGS__); \
    } \
} while (0)
 114
 115/* OS X does not have O_DSYNC */
 116#ifndef O_DSYNC
 117#ifdef O_SYNC
 118#define O_DSYNC O_SYNC
 119#elif defined(O_FSYNC)
 120#define O_DSYNC O_FSYNC
 121#endif
 122#endif
 123
 124/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
 125#ifndef O_DIRECT
 126#define O_DIRECT O_DSYNC
 127#endif
 128
 129#define FTYPE_FILE   0
 130#define FTYPE_CD     1
 131
 132#define MAX_BLOCKSIZE   4096
 133
/* Per-image driver state, stored in BlockDriverState.opaque. */
typedef struct BDRVRawState {
    /* Descriptor of the opened image; raw_open_common() stores the
     * qemu_open() result here (-1 while no file is open). */
    int fd;
    /* FTYPE_FILE or FTYPE_CD */
    int type;
    /* O_* flags the descriptor was opened with (see raw_parse_flags()) */
    int open_flags;
    /* Memory buffer alignment found by raw_probe_alignment() */
    size_t buf_align;

#ifdef CONFIG_LINUX_AIO
    /* Set by raw_set_aio(): 1 when both BDRV_O_NOCACHE and
     * BDRV_O_NATIVE_AIO were requested, 0 otherwise */
    int use_aio;
    /* Handle returned by laio_init(); NULL until first needed */
    void *aio_ctx;
#endif
#ifdef CONFIG_XFS
    /* Set when platform_test_xfs_fd() succeeds on fd */
    bool is_xfs:1;
#endif
    /* Initialised to true in raw_open_common() */
    bool has_discard:1;
    /* True at open; cleared once a zero-write attempt reports -ENOTSUP, and
     * for Linux block devices opened without BDRV_O_NOCACHE */
    bool has_write_zeroes:1;
    /* Set for regular files and for block devices where BLKDISCARDZEROES
     * reports non-zero; cleared for Linux block devices opened without
     * BDRV_O_NOCACHE */
    bool discard_zeroes:1;
    /* Set for regular files */
    bool has_fallocate;
    /* Set when BDRV_O_NOCACHE is used (and for FreeBSD character devices);
     * makes raw_probe_alignment() actually probe for alignment */
    bool needs_alignment;
} BDRVRawState;
 153
/*
 * Scratch state built by raw_reopen_prepare() and kept in
 * BDRVReopenState.opaque until raw_reopen_commit() or raw_reopen_abort().
 */
typedef struct BDRVRawReopenState {
    /* Newly opened or duplicated descriptor, -1 if none */
    int fd;
    /* O_* flags requested for the new descriptor */
    int open_flags;
#ifdef CONFIG_LINUX_AIO
    /* use_aio value to install on commit */
    int use_aio;
#endif
} BDRVRawReopenState;
 161
 162static int fd_open(BlockDriverState *bs);
 163static int64_t raw_getlength(BlockDriverState *bs);
 164
/* Description of one request processed by the handle_aiocb_*() helpers. */
typedef struct RawPosixAIOData {
    /* Block device the request belongs to */
    BlockDriverState *bs;
    /* File descriptor the operation is issued on */
    int aio_fildes;
    union {
        /* Scatter/gather list for read/write requests */
        struct iovec *aio_iov;
        /* Argument buffer handed to ioctl() by handle_aiocb_ioctl() */
        void *aio_ioctl_buf;
    };
    /* Number of entries in aio_iov */
    int aio_niov;
    /* Request length in bytes; doubles as the ioctl command (see below) */
    uint64_t aio_nbytes;
#define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
    /* Byte offset of the request within the file */
    off_t aio_offset;
    /* Bit mask of QEMU_AIO_* flags (e.g. QEMU_AIO_WRITE) */
    int aio_type;
} RawPosixAIOData;
 178
 179#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 180static int cdrom_reopen(BlockDriverState *bs);
 181#endif
 182
 183#if defined(__NetBSD__)
 184static int raw_normalize_devicepath(const char **filename)
 185{
 186    static char namebuf[PATH_MAX];
 187    const char *dp, *fname;
 188    struct stat sb;
 189
 190    fname = *filename;
 191    dp = strrchr(fname, '/');
 192    if (lstat(fname, &sb) < 0) {
 193        fprintf(stderr, "%s: stat failed: %s\n",
 194            fname, strerror(errno));
 195        return -errno;
 196    }
 197
 198    if (!S_ISBLK(sb.st_mode)) {
 199        return 0;
 200    }
 201
 202    if (dp == NULL) {
 203        snprintf(namebuf, PATH_MAX, "r%s", fname);
 204    } else {
 205        snprintf(namebuf, PATH_MAX, "%.*s/r%s",
 206            (int)(dp - fname), fname, dp + 1);
 207    }
 208    fprintf(stderr, "%s is a block device", fname);
 209    *filename = namebuf;
 210    fprintf(stderr, ", using %s\n", *filename);
 211
 212    return 0;
 213}
 214#else
 215static int raw_normalize_devicepath(const char **filename)
 216{
 217    return 0;
 218}
 219#endif
 220
 221/*
 222 * Get logical block size via ioctl. On success store it in @sector_size_p.
 223 */
 224static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
 225{
 226    unsigned int sector_size;
 227    bool success = false;
 228
 229    errno = ENOTSUP;
 230
 231    /* Try a few ioctls to get the right size */
 232#ifdef BLKSSZGET
 233    if (ioctl(fd, BLKSSZGET, &sector_size) >= 0) {
 234        *sector_size_p = sector_size;
 235        success = true;
 236    }
 237#endif
 238#ifdef DKIOCGETBLOCKSIZE
 239    if (ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) >= 0) {
 240        *sector_size_p = sector_size;
 241        success = true;
 242    }
 243#endif
 244#ifdef DIOCGSECTORSIZE
 245    if (ioctl(fd, DIOCGSECTORSIZE, &sector_size) >= 0) {
 246        *sector_size_p = sector_size;
 247        success = true;
 248    }
 249#endif
 250
 251    return success ? 0 : -errno;
 252}
 253
 254/**
 255 * Get physical block size of @fd.
 256 * On success, store it in @blk_size and return 0.
 257 * On failure, return -errno.
 258 */
 259static int probe_physical_blocksize(int fd, unsigned int *blk_size)
 260{
 261#ifdef BLKPBSZGET
 262    if (ioctl(fd, BLKPBSZGET, blk_size) < 0) {
 263        return -errno;
 264    }
 265    return 0;
 266#else
 267    return -ENOTSUP;
 268#endif
 269}
 270
 271/* Check if read is allowed with given memory buffer and length.
 272 *
 273 * This function is used to check O_DIRECT memory buffer and request alignment.
 274 */
 275static bool raw_is_io_aligned(int fd, void *buf, size_t len)
 276{
 277    ssize_t ret = pread(fd, buf, len, 0);
 278
 279    if (ret >= 0) {
 280        return true;
 281    }
 282
 283#ifdef __linux__
 284    /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads.  Ignore
 285     * other errors (e.g. real I/O error), which could happen on a failed
 286     * drive, since we only care about probing alignment.
 287     */
 288    if (errno != EINVAL) {
 289        return true;
 290    }
 291#endif
 292
 293    return false;
 294}
 295
 296static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
 297{
 298    BDRVRawState *s = bs->opaque;
 299    char *buf;
 300    size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
 301
 302    /* For SCSI generic devices the alignment is not really used.
 303       With buffered I/O, we don't have any restrictions. */
 304    if (bdrv_is_sg(bs) || !s->needs_alignment) {
 305        bs->request_alignment = 1;
 306        s->buf_align = 1;
 307        return;
 308    }
 309
 310    bs->request_alignment = 0;
 311    s->buf_align = 0;
 312    /* Let's try to use the logical blocksize for the alignment. */
 313    if (probe_logical_blocksize(fd, &bs->request_alignment) < 0) {
 314        bs->request_alignment = 0;
 315    }
 316#ifdef CONFIG_XFS
 317    if (s->is_xfs) {
 318        struct dioattr da;
 319        if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
 320            bs->request_alignment = da.d_miniosz;
 321            /* The kernel returns wrong information for d_mem */
 322            /* s->buf_align = da.d_mem; */
 323        }
 324    }
 325#endif
 326
 327    /* If we could not get the sizes so far, we can only guess them */
 328    if (!s->buf_align) {
 329        size_t align;
 330        buf = qemu_memalign(max_align, 2 * max_align);
 331        for (align = 512; align <= max_align; align <<= 1) {
 332            if (raw_is_io_aligned(fd, buf + align, max_align)) {
 333                s->buf_align = align;
 334                break;
 335            }
 336        }
 337        qemu_vfree(buf);
 338    }
 339
 340    if (!bs->request_alignment) {
 341        size_t align;
 342        buf = qemu_memalign(s->buf_align, max_align);
 343        for (align = 512; align <= max_align; align <<= 1) {
 344            if (raw_is_io_aligned(fd, buf, align)) {
 345                bs->request_alignment = align;
 346                break;
 347            }
 348        }
 349        qemu_vfree(buf);
 350    }
 351
 352    if (!s->buf_align || !bs->request_alignment) {
 353        error_setg(errp, "Could not find working O_DIRECT alignment. "
 354                         "Try cache.direct=off.");
 355    }
 356}
 357
 358static void raw_parse_flags(int bdrv_flags, int *open_flags)
 359{
 360    assert(open_flags != NULL);
 361
 362    *open_flags |= O_BINARY;
 363    *open_flags &= ~O_ACCMODE;
 364    if (bdrv_flags & BDRV_O_RDWR) {
 365        *open_flags |= O_RDWR;
 366    } else {
 367        *open_flags |= O_RDONLY;
 368    }
 369
 370    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
 371     * and O_DIRECT for no caching. */
 372    if ((bdrv_flags & BDRV_O_NOCACHE)) {
 373        *open_flags |= O_DIRECT;
 374    }
 375}
 376
 377static void raw_detach_aio_context(BlockDriverState *bs)
 378{
 379#ifdef CONFIG_LINUX_AIO
 380    BDRVRawState *s = bs->opaque;
 381
 382    if (s->use_aio) {
 383        laio_detach_aio_context(s->aio_ctx, bdrv_get_aio_context(bs));
 384    }
 385#endif
 386}
 387
 388static void raw_attach_aio_context(BlockDriverState *bs,
 389                                   AioContext *new_context)
 390{
 391#ifdef CONFIG_LINUX_AIO
 392    BDRVRawState *s = bs->opaque;
 393
 394    if (s->use_aio) {
 395        laio_attach_aio_context(s->aio_ctx, new_context);
 396    }
 397#endif
 398}
 399
 400#ifdef CONFIG_LINUX_AIO
 401static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags)
 402{
 403    int ret = -1;
 404    assert(aio_ctx != NULL);
 405    assert(use_aio != NULL);
 406    /*
 407     * Currently Linux do AIO only for files opened with O_DIRECT
 408     * specified so check NOCACHE flag too
 409     */
 410    if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
 411                      (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
 412
 413        /* if non-NULL, laio_init() has already been run */
 414        if (*aio_ctx == NULL) {
 415            *aio_ctx = laio_init();
 416            if (!*aio_ctx) {
 417                goto error;
 418            }
 419        }
 420        *use_aio = 1;
 421    } else {
 422        *use_aio = 0;
 423    }
 424
 425    ret = 0;
 426
 427error:
 428    return ret;
 429}
 430#endif
 431
 432static void raw_parse_filename(const char *filename, QDict *options,
 433                               Error **errp)
 434{
 435    /* The filename does not have to be prefixed by the protocol name, since
 436     * "file" is the default protocol; therefore, the return value of this
 437     * function call can be ignored. */
 438    strstart(filename, "file:", &filename);
 439
 440    qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename)));
 441}
 442
/*
 * Runtime options understood by this driver.  raw_open_common() absorbs
 * them from the options QDict; currently only "filename" is defined.
 */
static QemuOptsList raw_runtime_opts = {
    .name = "raw",
    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
    .desc = {
        {
            .name = "filename",
            .type = QEMU_OPT_STRING,
            .help = "File name of the image",
        },
        { /* end of list */ }
    },
};
 455
 456static int raw_open_common(BlockDriverState *bs, QDict *options,
 457                           int bdrv_flags, int open_flags, Error **errp)
 458{
 459    BDRVRawState *s = bs->opaque;
 460    QemuOpts *opts;
 461    Error *local_err = NULL;
 462    const char *filename = NULL;
 463    int fd, ret;
 464    struct stat st;
 465
 466    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
 467    qemu_opts_absorb_qdict(opts, options, &local_err);
 468    if (local_err) {
 469        error_propagate(errp, local_err);
 470        ret = -EINVAL;
 471        goto fail;
 472    }
 473
 474    filename = qemu_opt_get(opts, "filename");
 475
 476    ret = raw_normalize_devicepath(&filename);
 477    if (ret != 0) {
 478        error_setg_errno(errp, -ret, "Could not normalize device path");
 479        goto fail;
 480    }
 481
 482    s->open_flags = open_flags;
 483    raw_parse_flags(bdrv_flags, &s->open_flags);
 484
 485    s->fd = -1;
 486    fd = qemu_open(filename, s->open_flags, 0644);
 487    if (fd < 0) {
 488        ret = -errno;
 489        if (ret == -EROFS) {
 490            ret = -EACCES;
 491        }
 492        goto fail;
 493    }
 494    s->fd = fd;
 495
 496#ifdef CONFIG_LINUX_AIO
 497    if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) {
 498        qemu_close(fd);
 499        ret = -errno;
 500        error_setg_errno(errp, -ret, "Could not set AIO state");
 501        goto fail;
 502    }
 503    if (!s->use_aio && (bdrv_flags & BDRV_O_NATIVE_AIO)) {
 504        error_setg(errp, "aio=native was specified, but it requires "
 505                         "cache.direct=on, which was not specified.");
 506        ret = -EINVAL;
 507        goto fail;
 508    }
 509#else
 510    if (bdrv_flags & BDRV_O_NATIVE_AIO) {
 511        error_setg(errp, "aio=native was specified, but is not supported "
 512                         "in this build.");
 513        ret = -EINVAL;
 514        goto fail;
 515    }
 516#endif /* !defined(CONFIG_LINUX_AIO) */
 517
 518    s->has_discard = true;
 519    s->has_write_zeroes = true;
 520    if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
 521        s->needs_alignment = true;
 522    }
 523
 524    if (fstat(s->fd, &st) < 0) {
 525        ret = -errno;
 526        error_setg_errno(errp, errno, "Could not stat file");
 527        goto fail;
 528    }
 529    if (S_ISREG(st.st_mode)) {
 530        s->discard_zeroes = true;
 531        s->has_fallocate = true;
 532    }
 533    if (S_ISBLK(st.st_mode)) {
 534#ifdef BLKDISCARDZEROES
 535        unsigned int arg;
 536        if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
 537            s->discard_zeroes = true;
 538        }
 539#endif
 540#ifdef __linux__
 541        /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
 542         * not rely on the contents of discarded blocks unless using O_DIRECT.
 543         * Same for BLKZEROOUT.
 544         */
 545        if (!(bs->open_flags & BDRV_O_NOCACHE)) {
 546            s->discard_zeroes = false;
 547            s->has_write_zeroes = false;
 548        }
 549#endif
 550    }
 551#ifdef __FreeBSD__
 552    if (S_ISCHR(st.st_mode)) {
 553        /*
 554         * The file is a char device (disk), which on FreeBSD isn't behind
 555         * a pager, so force all requests to be aligned. This is needed
 556         * so QEMU makes sure all IO operations on the device are aligned
 557         * to sector size, or else FreeBSD will reject them with EINVAL.
 558         */
 559        s->needs_alignment = true;
 560    }
 561#endif
 562
 563#ifdef CONFIG_XFS
 564    if (platform_test_xfs_fd(s->fd)) {
 565        s->is_xfs = true;
 566    }
 567#endif
 568
 569    raw_attach_aio_context(bs, bdrv_get_aio_context(bs));
 570
 571    ret = 0;
 572fail:
 573    if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
 574        unlink(filename);
 575    }
 576    qemu_opts_del(opts);
 577    return ret;
 578}
 579
 580static int raw_open(BlockDriverState *bs, QDict *options, int flags,
 581                    Error **errp)
 582{
 583    BDRVRawState *s = bs->opaque;
 584    Error *local_err = NULL;
 585    int ret;
 586
 587    s->type = FTYPE_FILE;
 588    ret = raw_open_common(bs, options, flags, 0, &local_err);
 589    if (local_err) {
 590        error_propagate(errp, local_err);
 591    }
 592    return ret;
 593}
 594
 595static int raw_reopen_prepare(BDRVReopenState *state,
 596                              BlockReopenQueue *queue, Error **errp)
 597{
 598    BDRVRawState *s;
 599    BDRVRawReopenState *raw_s;
 600    int ret = 0;
 601    Error *local_err = NULL;
 602
 603    assert(state != NULL);
 604    assert(state->bs != NULL);
 605
 606    s = state->bs->opaque;
 607
 608    state->opaque = g_new0(BDRVRawReopenState, 1);
 609    raw_s = state->opaque;
 610
 611#ifdef CONFIG_LINUX_AIO
 612    raw_s->use_aio = s->use_aio;
 613
 614    /* we can use s->aio_ctx instead of a copy, because the use_aio flag is
 615     * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio()
 616     * won't override aio_ctx if aio_ctx is non-NULL */
 617    if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) {
 618        error_setg(errp, "Could not set AIO state");
 619        return -1;
 620    }
 621#endif
 622
 623    if (s->type == FTYPE_CD) {
 624        raw_s->open_flags |= O_NONBLOCK;
 625    }
 626
 627    raw_parse_flags(state->flags, &raw_s->open_flags);
 628
 629    raw_s->fd = -1;
 630
 631    int fcntl_flags = O_APPEND | O_NONBLOCK;
 632#ifdef O_NOATIME
 633    fcntl_flags |= O_NOATIME;
 634#endif
 635
 636#ifdef O_ASYNC
 637    /* Not all operating systems have O_ASYNC, and those that don't
 638     * will not let us track the state into raw_s->open_flags (typically
 639     * you achieve the same effect with an ioctl, for example I_SETSIG
 640     * on Solaris). But we do not use O_ASYNC, so that's fine.
 641     */
 642    assert((s->open_flags & O_ASYNC) == 0);
 643#endif
 644
 645    if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
 646        /* dup the original fd */
 647        /* TODO: use qemu fcntl wrapper */
 648#ifdef F_DUPFD_CLOEXEC
 649        raw_s->fd = fcntl(s->fd, F_DUPFD_CLOEXEC, 0);
 650#else
 651        raw_s->fd = dup(s->fd);
 652        if (raw_s->fd != -1) {
 653            qemu_set_cloexec(raw_s->fd);
 654        }
 655#endif
 656        if (raw_s->fd >= 0) {
 657            ret = fcntl_setfl(raw_s->fd, raw_s->open_flags);
 658            if (ret) {
 659                qemu_close(raw_s->fd);
 660                raw_s->fd = -1;
 661            }
 662        }
 663    }
 664
 665    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
 666    if (raw_s->fd == -1) {
 667        const char *normalized_filename = state->bs->filename;
 668        ret = raw_normalize_devicepath(&normalized_filename);
 669        if (ret < 0) {
 670            error_setg_errno(errp, -ret, "Could not normalize device path");
 671        } else {
 672            assert(!(raw_s->open_flags & O_CREAT));
 673            raw_s->fd = qemu_open(normalized_filename, raw_s->open_flags);
 674            if (raw_s->fd == -1) {
 675                error_setg_errno(errp, errno, "Could not reopen file");
 676                ret = -1;
 677            }
 678        }
 679    }
 680
 681    /* Fail already reopen_prepare() if we can't get a working O_DIRECT
 682     * alignment with the new fd. */
 683    if (raw_s->fd != -1) {
 684        raw_probe_alignment(state->bs, raw_s->fd, &local_err);
 685        if (local_err) {
 686            qemu_close(raw_s->fd);
 687            raw_s->fd = -1;
 688            error_propagate(errp, local_err);
 689            ret = -EINVAL;
 690        }
 691    }
 692
 693    return ret;
 694}
 695
 696static void raw_reopen_commit(BDRVReopenState *state)
 697{
 698    BDRVRawReopenState *raw_s = state->opaque;
 699    BDRVRawState *s = state->bs->opaque;
 700
 701    s->open_flags = raw_s->open_flags;
 702
 703    qemu_close(s->fd);
 704    s->fd = raw_s->fd;
 705#ifdef CONFIG_LINUX_AIO
 706    s->use_aio = raw_s->use_aio;
 707#endif
 708
 709    g_free(state->opaque);
 710    state->opaque = NULL;
 711}
 712
 713
 714static void raw_reopen_abort(BDRVReopenState *state)
 715{
 716    BDRVRawReopenState *raw_s = state->opaque;
 717
 718     /* nothing to do if NULL, we didn't get far enough */
 719    if (raw_s == NULL) {
 720        return;
 721    }
 722
 723    if (raw_s->fd >= 0) {
 724        qemu_close(raw_s->fd);
 725        raw_s->fd = -1;
 726    }
 727    g_free(state->opaque);
 728    state->opaque = NULL;
 729}
 730
 731static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 732{
 733    BDRVRawState *s = bs->opaque;
 734
 735    raw_probe_alignment(bs, s->fd, errp);
 736    bs->bl.min_mem_alignment = s->buf_align;
 737    bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize());
 738}
 739
 740static int check_for_dasd(int fd)
 741{
 742#ifdef BIODASDINFO2
 743    struct dasd_information2_t info = {0};
 744
 745    return ioctl(fd, BIODASDINFO2, &info);
 746#else
 747    return -1;
 748#endif
 749}
 750
 751/**
 752 * Try to get @bs's logical and physical block size.
 753 * On success, store them in @bsz and return zero.
 754 * On failure, return negative errno.
 755 */
 756static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
 757{
 758    BDRVRawState *s = bs->opaque;
 759    int ret;
 760
 761    /* If DASD, get blocksizes */
 762    if (check_for_dasd(s->fd) < 0) {
 763        return -ENOTSUP;
 764    }
 765    ret = probe_logical_blocksize(s->fd, &bsz->log);
 766    if (ret < 0) {
 767        return ret;
 768    }
 769    return probe_physical_blocksize(s->fd, &bsz->phys);
 770}
 771
 772/**
 773 * Try to get @bs's geometry: cyls, heads, sectors.
 774 * On success, store them in @geo and return 0.
 775 * On failure return -errno.
 776 * (Allows block driver to assign default geometry values that guest sees)
 777 */
 778#ifdef __linux__
 779static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
 780{
 781    BDRVRawState *s = bs->opaque;
 782    struct hd_geometry ioctl_geo = {0};
 783
 784    /* If DASD, get its geometry */
 785    if (check_for_dasd(s->fd) < 0) {
 786        return -ENOTSUP;
 787    }
 788    if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
 789        return -errno;
 790    }
 791    /* HDIO_GETGEO may return success even though geo contains zeros
 792       (e.g. certain multipath setups) */
 793    if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
 794        return -ENOTSUP;
 795    }
 796    /* Do not return a geometry for partition */
 797    if (ioctl_geo.start != 0) {
 798        return -ENOTSUP;
 799    }
 800    geo->heads = ioctl_geo.heads;
 801    geo->sectors = ioctl_geo.sectors;
 802    geo->cylinders = ioctl_geo.cylinders;
 803
 804    return 0;
 805}
 806#else /* __linux__ */
 807static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
 808{
 809    return -ENOTSUP;
 810}
 811#endif
 812
 813static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
 814{
 815    int ret;
 816
 817    ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
 818    if (ret == -1) {
 819        return -errno;
 820    }
 821
 822    return 0;
 823}
 824
 825static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
 826{
 827    int ret;
 828
 829    ret = qemu_fdatasync(aiocb->aio_fildes);
 830    if (ret == -1) {
 831        return -errno;
 832    }
 833    return 0;
 834}
 835
 836#ifdef CONFIG_PREADV
 837
 838static bool preadv_present = true;
 839
 840static ssize_t
 841qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 842{
 843    return preadv(fd, iov, nr_iov, offset);
 844}
 845
 846static ssize_t
 847qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 848{
 849    return pwritev(fd, iov, nr_iov, offset);
 850}
 851
 852#else
 853
 854static bool preadv_present = false;
 855
 856static ssize_t
 857qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 858{
 859    return -ENOSYS;
 860}
 861
 862static ssize_t
 863qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 864{
 865    return -ENOSYS;
 866}
 867
 868#endif
 869
 870static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
 871{
 872    ssize_t len;
 873
 874    do {
 875        if (aiocb->aio_type & QEMU_AIO_WRITE)
 876            len = qemu_pwritev(aiocb->aio_fildes,
 877                               aiocb->aio_iov,
 878                               aiocb->aio_niov,
 879                               aiocb->aio_offset);
 880         else
 881            len = qemu_preadv(aiocb->aio_fildes,
 882                              aiocb->aio_iov,
 883                              aiocb->aio_niov,
 884                              aiocb->aio_offset);
 885    } while (len == -1 && errno == EINTR);
 886
 887    if (len == -1) {
 888        return -errno;
 889    }
 890    return len;
 891}
 892
 893/*
 894 * Read/writes the data to/from a given linear buffer.
 895 *
 896 * Returns the number of bytes handles or -errno in case of an error. Short
 897 * reads are only returned if the end of the file is reached.
 898 */
 899static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
 900{
 901    ssize_t offset = 0;
 902    ssize_t len;
 903
 904    while (offset < aiocb->aio_nbytes) {
 905        if (aiocb->aio_type & QEMU_AIO_WRITE) {
 906            len = pwrite(aiocb->aio_fildes,
 907                         (const char *)buf + offset,
 908                         aiocb->aio_nbytes - offset,
 909                         aiocb->aio_offset + offset);
 910        } else {
 911            len = pread(aiocb->aio_fildes,
 912                        buf + offset,
 913                        aiocb->aio_nbytes - offset,
 914                        aiocb->aio_offset + offset);
 915        }
 916        if (len == -1 && errno == EINTR) {
 917            continue;
 918        } else if (len == -1 && errno == EINVAL &&
 919                   (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
 920                   !(aiocb->aio_type & QEMU_AIO_WRITE) &&
 921                   offset > 0) {
 922            /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
 923             * after a short read.  Assume that O_DIRECT short reads only occur
 924             * at EOF.  Therefore this is a short read, not an I/O error.
 925             */
 926            break;
 927        } else if (len == -1) {
 928            offset = -errno;
 929            break;
 930        } else if (len == 0) {
 931            break;
 932        }
 933        offset += len;
 934    }
 935
 936    return offset;
 937}
 938
 939static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
 940{
 941    ssize_t nbytes;
 942    char *buf;
 943
 944    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
 945        /*
 946         * If there is just a single buffer, and it is properly aligned
 947         * we can just use plain pread/pwrite without any problems.
 948         */
 949        if (aiocb->aio_niov == 1) {
 950             return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
 951        }
 952        /*
 953         * We have more than one iovec, and all are properly aligned.
 954         *
 955         * Try preadv/pwritev first and fall back to linearizing the
 956         * buffer if it's not supported.
 957         */
 958        if (preadv_present) {
 959            nbytes = handle_aiocb_rw_vector(aiocb);
 960            if (nbytes == aiocb->aio_nbytes ||
 961                (nbytes < 0 && nbytes != -ENOSYS)) {
 962                return nbytes;
 963            }
 964            preadv_present = false;
 965        }
 966
 967        /*
 968         * XXX(hch): short read/write.  no easy way to handle the reminder
 969         * using these interfaces.  For now retry using plain
 970         * pread/pwrite?
 971         */
 972    }
 973
 974    /*
 975     * Ok, we have to do it the hard way, copy all segments into
 976     * a single aligned buffer.
 977     */
 978    buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
 979    if (buf == NULL) {
 980        return -ENOMEM;
 981    }
 982
 983    if (aiocb->aio_type & QEMU_AIO_WRITE) {
 984        char *p = buf;
 985        int i;
 986
 987        for (i = 0; i < aiocb->aio_niov; ++i) {
 988            memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
 989            p += aiocb->aio_iov[i].iov_len;
 990        }
 991        assert(p - buf == aiocb->aio_nbytes);
 992    }
 993
 994    nbytes = handle_aiocb_rw_linear(aiocb, buf);
 995    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
 996        char *p = buf;
 997        size_t count = aiocb->aio_nbytes, copy;
 998        int i;
 999
1000        for (i = 0; i < aiocb->aio_niov && count; ++i) {
1001            copy = count;
1002            if (copy > aiocb->aio_iov[i].iov_len) {
1003                copy = aiocb->aio_iov[i].iov_len;
1004            }
1005            memcpy(aiocb->aio_iov[i].iov_base, p, copy);
1006            assert(count >= copy);
1007            p     += copy;
1008            count -= copy;
1009        }
1010        assert(count == 0);
1011    }
1012    qemu_vfree(buf);
1013
1014    return nbytes;
1015}
1016
#ifdef CONFIG_XFS
/*
 * Zero @bytes bytes starting at @offset with the XFS_IOC_ZERO_RANGE ioctl.
 * Returns 0 on success, -errno on failure.
 */
static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
{
    struct xfs_flock64 range;

    memset(&range, 0, sizeof(range));
    range.l_len = bytes;
    range.l_start = offset;
    range.l_whence = SEEK_SET;

    if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &range) >= 0) {
        return 0;
    }

    /* Keep errno safe from whatever the debug print may do to it */
    int saved_errno = errno;
    DPRINTF("cannot write zero range (%s)\n", strerror(errno));
    return -saved_errno;
}

/*
 * Punch a hole of @bytes bytes at @offset with the XFS_IOC_UNRESVSP64 ioctl.
 * Returns 0 on success, -errno on failure.
 */
static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
{
    struct xfs_flock64 range;

    memset(&range, 0, sizeof(range));
    range.l_len = bytes;
    range.l_start = offset;
    range.l_whence = SEEK_SET;

    if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &range) >= 0) {
        return 0;
    }

    /* Keep errno safe from whatever the debug print may do to it */
    int saved_errno = errno;
    DPRINTF("cannot punch hole (%s)\n", strerror(errno));
    return -saved_errno;
}
#endif
1056
/*
 * Fold the various "operation not available" error codes (-ENODEV, -ENOSYS,
 * -EOPNOTSUPP, -ENOTTY) into -ENOTSUP; every other value is returned
 * unchanged.
 */
static int translate_err(int err)
{
    switch (err) {
    case -ENODEV:
    case -ENOSYS:
    case -EOPNOTSUPP:
    case -ENOTTY:
        return -ENOTSUP;
    default:
        return err;
    }
}
1065
#ifdef CONFIG_FALLOCATE
/*
 * Call fallocate(), retrying while it is interrupted (EINTR).
 * Returns 0 on success, otherwise the negative errno value after it has
 * been passed through translate_err().
 */
static int do_fallocate(int fd, int mode, off_t offset, off_t len)
{
    while (fallocate(fd, mode, offset, len) != 0) {
        if (errno != EINTR) {
            return translate_err(-errno);
        }
    }
    return 0;
}
#endif
1077
1078static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
1079{
1080    int ret = -ENOTSUP;
1081    BDRVRawState *s = aiocb->bs->opaque;
1082
1083    if (!s->has_write_zeroes) {
1084        return -ENOTSUP;
1085    }
1086
1087#ifdef BLKZEROOUT
1088    do {
1089        uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1090        if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
1091            return 0;
1092        }
1093    } while (errno == EINTR);
1094
1095    ret = translate_err(-errno);
1096#endif
1097
1098    if (ret == -ENOTSUP) {
1099        s->has_write_zeroes = false;
1100    }
1101    return ret;
1102}
1103
1104static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
1105{
1106#if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS)
1107    BDRVRawState *s = aiocb->bs->opaque;
1108#endif
1109
1110    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1111        return handle_aiocb_write_zeroes_block(aiocb);
1112    }
1113
1114#ifdef CONFIG_XFS
1115    if (s->is_xfs) {
1116        return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
1117    }
1118#endif
1119
1120#ifdef CONFIG_FALLOCATE_ZERO_RANGE
1121    if (s->has_write_zeroes) {
1122        int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
1123                               aiocb->aio_offset, aiocb->aio_nbytes);
1124        if (ret == 0 || ret != -ENOTSUP) {
1125            return ret;
1126        }
1127        s->has_write_zeroes = false;
1128    }
1129#endif
1130
1131#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1132    if (s->has_discard && s->has_fallocate) {
1133        int ret = do_fallocate(s->fd,
1134                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1135                               aiocb->aio_offset, aiocb->aio_nbytes);
1136        if (ret == 0) {
1137            ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1138            if (ret == 0 || ret != -ENOTSUP) {
1139                return ret;
1140            }
1141            s->has_fallocate = false;
1142        } else if (ret != -ENOTSUP) {
1143            return ret;
1144        } else {
1145            s->has_discard = false;
1146        }
1147    }
1148#endif
1149
1150#ifdef CONFIG_FALLOCATE
1151    if (s->has_fallocate && aiocb->aio_offset >= bdrv_getlength(aiocb->bs)) {
1152        int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1153        if (ret == 0 || ret != -ENOTSUP) {
1154            return ret;
1155        }
1156        s->has_fallocate = false;
1157    }
1158#endif
1159
1160    return -ENOTSUP;
1161}
1162
1163static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
1164{
1165    int ret = -EOPNOTSUPP;
1166    BDRVRawState *s = aiocb->bs->opaque;
1167
1168    if (!s->has_discard) {
1169        return -ENOTSUP;
1170    }
1171
1172    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1173#ifdef BLKDISCARD
1174        do {
1175            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1176            if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
1177                return 0;
1178            }
1179        } while (errno == EINTR);
1180
1181        ret = -errno;
1182#endif
1183    } else {
1184#ifdef CONFIG_XFS
1185        if (s->is_xfs) {
1186            return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
1187        }
1188#endif
1189
1190#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1191        ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1192                           aiocb->aio_offset, aiocb->aio_nbytes);
1193#endif
1194    }
1195
1196    ret = translate_err(ret);
1197    if (ret == -ENOTSUP) {
1198        s->has_discard = false;
1199    }
1200    return ret;
1201}
1202
1203static int aio_worker(void *arg)
1204{
1205    RawPosixAIOData *aiocb = arg;
1206    ssize_t ret = 0;
1207
1208    switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
1209    case QEMU_AIO_READ:
1210        ret = handle_aiocb_rw(aiocb);
1211        if (ret >= 0 && ret < aiocb->aio_nbytes) {
1212            iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret,
1213                      0, aiocb->aio_nbytes - ret);
1214
1215            ret = aiocb->aio_nbytes;
1216        }
1217        if (ret == aiocb->aio_nbytes) {
1218            ret = 0;
1219        } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
1220            ret = -EINVAL;
1221        }
1222        break;
1223    case QEMU_AIO_WRITE:
1224        ret = handle_aiocb_rw(aiocb);
1225        if (ret == aiocb->aio_nbytes) {
1226            ret = 0;
1227        } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
1228            ret = -EINVAL;
1229        }
1230        break;
1231    case QEMU_AIO_FLUSH:
1232        ret = handle_aiocb_flush(aiocb);
1233        break;
1234    case QEMU_AIO_IOCTL:
1235        ret = handle_aiocb_ioctl(aiocb);
1236        break;
1237    case QEMU_AIO_DISCARD:
1238        ret = handle_aiocb_discard(aiocb);
1239        break;
1240    case QEMU_AIO_WRITE_ZEROES:
1241        ret = handle_aiocb_write_zeroes(aiocb);
1242        break;
1243    default:
1244        fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
1245        ret = -EINVAL;
1246        break;
1247    }
1248
1249    g_free(aiocb);
1250    return ret;
1251}
1252
1253static int paio_submit_co(BlockDriverState *bs, int fd,
1254        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
1255        int type)
1256{
1257    RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
1258    ThreadPool *pool;
1259
1260    acb->bs = bs;
1261    acb->aio_type = type;
1262    acb->aio_fildes = fd;
1263
1264    acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE;
1265    acb->aio_offset = sector_num * BDRV_SECTOR_SIZE;
1266
1267    if (qiov) {
1268        acb->aio_iov = qiov->iov;
1269        acb->aio_niov = qiov->niov;
1270        assert(qiov->size == acb->aio_nbytes);
1271    }
1272
1273    trace_paio_submit_co(sector_num, nb_sectors, type);
1274    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1275    return thread_pool_submit_co(pool, aio_worker, acb);
1276}
1277
1278static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
1279        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
1280        BlockCompletionFunc *cb, void *opaque, int type)
1281{
1282    RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
1283    ThreadPool *pool;
1284
1285    acb->bs = bs;
1286    acb->aio_type = type;
1287    acb->aio_fildes = fd;
1288
1289    acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE;
1290    acb->aio_offset = sector_num * BDRV_SECTOR_SIZE;
1291
1292    if (qiov) {
1293        acb->aio_iov = qiov->iov;
1294        acb->aio_niov = qiov->niov;
1295        assert(qiov->size == acb->aio_nbytes);
1296    }
1297
1298    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
1299    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1300    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
1301}
1302
1303static BlockAIOCB *raw_aio_submit(BlockDriverState *bs,
1304        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
1305        BlockCompletionFunc *cb, void *opaque, int type)
1306{
1307    BDRVRawState *s = bs->opaque;
1308
1309    if (fd_open(bs) < 0)
1310        return NULL;
1311
1312    /*
1313     * Check if the underlying device requires requests to be aligned,
1314     * and if the request we are trying to submit is aligned or not.
1315     * If this is the case tell the low-level driver that it needs
1316     * to copy the buffer.
1317     */
1318    if (s->needs_alignment) {
1319        if (!bdrv_qiov_is_aligned(bs, qiov)) {
1320            type |= QEMU_AIO_MISALIGNED;
1321#ifdef CONFIG_LINUX_AIO
1322        } else if (s->use_aio) {
1323            return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov,
1324                               nb_sectors, cb, opaque, type);
1325#endif
1326        }
1327    }
1328
1329    return paio_submit(bs, s->fd, sector_num, qiov, nb_sectors,
1330                       cb, opaque, type);
1331}
1332
1333static void raw_aio_plug(BlockDriverState *bs)
1334{
1335#ifdef CONFIG_LINUX_AIO
1336    BDRVRawState *s = bs->opaque;
1337    if (s->use_aio) {
1338        laio_io_plug(bs, s->aio_ctx);
1339    }
1340#endif
1341}
1342
1343static void raw_aio_unplug(BlockDriverState *bs)
1344{
1345#ifdef CONFIG_LINUX_AIO
1346    BDRVRawState *s = bs->opaque;
1347    if (s->use_aio) {
1348        laio_io_unplug(bs, s->aio_ctx, true);
1349    }
1350#endif
1351}
1352
1353static void raw_aio_flush_io_queue(BlockDriverState *bs)
1354{
1355#ifdef CONFIG_LINUX_AIO
1356    BDRVRawState *s = bs->opaque;
1357    if (s->use_aio) {
1358        laio_io_unplug(bs, s->aio_ctx, false);
1359    }
1360#endif
1361}
1362
1363static BlockAIOCB *raw_aio_readv(BlockDriverState *bs,
1364        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
1365        BlockCompletionFunc *cb, void *opaque)
1366{
1367    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
1368                          cb, opaque, QEMU_AIO_READ);
1369}
1370
1371static BlockAIOCB *raw_aio_writev(BlockDriverState *bs,
1372        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
1373        BlockCompletionFunc *cb, void *opaque)
1374{
1375    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
1376                          cb, opaque, QEMU_AIO_WRITE);
1377}
1378
1379static BlockAIOCB *raw_aio_flush(BlockDriverState *bs,
1380        BlockCompletionFunc *cb, void *opaque)
1381{
1382    BDRVRawState *s = bs->opaque;
1383
1384    if (fd_open(bs) < 0)
1385        return NULL;
1386
1387    return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH);
1388}
1389
1390static void raw_close(BlockDriverState *bs)
1391{
1392    BDRVRawState *s = bs->opaque;
1393
1394    raw_detach_aio_context(bs);
1395
1396#ifdef CONFIG_LINUX_AIO
1397    if (s->use_aio) {
1398        laio_cleanup(s->aio_ctx);
1399    }
1400#endif
1401    if (s->fd >= 0) {
1402        qemu_close(s->fd);
1403        s->fd = -1;
1404    }
1405}
1406
1407static int raw_truncate(BlockDriverState *bs, int64_t offset)
1408{
1409    BDRVRawState *s = bs->opaque;
1410    struct stat st;
1411
1412    if (fstat(s->fd, &st)) {
1413        return -errno;
1414    }
1415
1416    if (S_ISREG(st.st_mode)) {
1417        if (ftruncate(s->fd, offset) < 0) {
1418            return -errno;
1419        }
1420    } else if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
1421       if (offset > raw_getlength(bs)) {
1422           return -EINVAL;
1423       }
1424    } else {
1425        return -ENOTSUP;
1426    }
1427
1428    return 0;
1429}
1430
1431#ifdef __OpenBSD__
1432static int64_t raw_getlength(BlockDriverState *bs)
1433{
1434    BDRVRawState *s = bs->opaque;
1435    int fd = s->fd;
1436    struct stat st;
1437
1438    if (fstat(fd, &st))
1439        return -errno;
1440    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
1441        struct disklabel dl;
1442
1443        if (ioctl(fd, DIOCGDINFO, &dl))
1444            return -errno;
1445        return (uint64_t)dl.d_secsize *
1446            dl.d_partitions[DISKPART(st.st_rdev)].p_size;
1447    } else
1448        return st.st_size;
1449}
1450#elif defined(__NetBSD__)
1451static int64_t raw_getlength(BlockDriverState *bs)
1452{
1453    BDRVRawState *s = bs->opaque;
1454    int fd = s->fd;
1455    struct stat st;
1456
1457    if (fstat(fd, &st))
1458        return -errno;
1459    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
1460        struct dkwedge_info dkw;
1461
1462        if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
1463            return dkw.dkw_size * 512;
1464        } else {
1465            struct disklabel dl;
1466
1467            if (ioctl(fd, DIOCGDINFO, &dl))
1468                return -errno;
1469            return (uint64_t)dl.d_secsize *
1470                dl.d_partitions[DISKPART(st.st_rdev)].p_size;
1471        }
1472    } else
1473        return st.st_size;
1474}
1475#elif defined(__sun__)
1476static int64_t raw_getlength(BlockDriverState *bs)
1477{
1478    BDRVRawState *s = bs->opaque;
1479    struct dk_minfo minfo;
1480    int ret;
1481    int64_t size;
1482
1483    ret = fd_open(bs);
1484    if (ret < 0) {
1485        return ret;
1486    }
1487
1488    /*
1489     * Use the DKIOCGMEDIAINFO ioctl to read the size.
1490     */
1491    ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
1492    if (ret != -1) {
1493        return minfo.dki_lbsize * minfo.dki_capacity;
1494    }
1495
1496    /*
1497     * There are reports that lseek on some devices fails, but
1498     * irc discussion said that contingency on contingency was overkill.
1499     */
1500    size = lseek(s->fd, 0, SEEK_END);
1501    if (size < 0) {
1502        return -errno;
1503    }
1504    return size;
1505}
1506#elif defined(CONFIG_BSD)
/*
 * Generic BSD variant (also covers macOS and FreeBSD): return the length in
 * bytes of the image behind @bs.
 *
 * For files that pass the S_IFCHR mode test the size is taken from a
 * platform ioctl where one is available (DIOCGMEDIASIZE, DIOCGPART, or the
 * DKIOCGETBLOCKCOUNT/DKIOCGETBLOCKSIZE pair on macOS), falling back to
 * seeking to the end of the file.  All other files are sized with lseek().
 * On FreeBSD an empty CD drive triggers one cdrom_reopen() and a retry.
 *
 * Returns the fd_open() error, or -errno if a required lseek() fails.
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    /*
     * NOTE(review): fd is sampled once here; if cdrom_reopen() below
     * replaces s->fd, the retry still uses this copy -- confirm.
     */
    int fd = s->fd;
    int64_t size;
    struct stat sb;
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
    int reopened = 0;
#endif
    int ret;

    ret = fd_open(bs);
    if (ret < 0)
        return ret;

#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
again:
#endif
    /*
     * NOTE(review): this is a bit test against S_IFCHR rather than
     * S_ISCHR(); presumably intended to catch device nodes -- confirm.
     */
    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
        /*
         * The preprocessor branches below chain into one statement: the
         * size ioctl (if any) guards the fallback that follows it.
         */
#ifdef DIOCGMEDIASIZE
        /* A failing ioctl falls through to the fallback statement below. */
        if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
#elif defined(DIOCGPART)
        {
                struct partinfo pi;
                if (ioctl(fd, DIOCGPART, &pi) == 0)
                        size = pi.media_size;
                else
                        size = 0;
        }
        /* A zero size falls through to the fallback statement below. */
        if (size == 0)
#endif
#if defined(__APPLE__) && defined(__MACH__)
        {
            uint64_t sectors = 0;
            uint32_t sector_size = 0;

            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
                size = sectors * sector_size;
            } else {
                size = lseek(fd, 0LL, SEEK_END);
                if (size < 0) {
                    return -errno;
                }
            }
        }
#else
        size = lseek(fd, 0LL, SEEK_END);
        if (size < 0) {
            return -errno;
        }
#endif
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
        switch(s->type) {
        case FTYPE_CD:
            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
            if (size == 2048LL * (unsigned)-1)
                size = 0;
            /* XXX no disc?  maybe we need to reopen... */
            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
                /* Retry at most once after a successful reopen. */
                reopened = 1;
                goto again;
            }
        }
#endif
    } else {
        /* Not a device node (or fstat failed): size is the end offset. */
        size = lseek(fd, 0, SEEK_END);
        if (size < 0) {
            return -errno;
        }
    }
    return size;
}
1580#else
1581static int64_t raw_getlength(BlockDriverState *bs)
1582{
1583    BDRVRawState *s = bs->opaque;
1584    int ret;
1585    int64_t size;
1586
1587    ret = fd_open(bs);
1588    if (ret < 0) {
1589        return ret;
1590    }
1591
1592    size = lseek(s->fd, 0, SEEK_END);
1593    if (size < 0) {
1594        return -errno;
1595    }
1596    return size;
1597}
1598#endif
1599
1600static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
1601{
1602    struct stat st;
1603    BDRVRawState *s = bs->opaque;
1604
1605    if (fstat(s->fd, &st) < 0) {
1606        return -errno;
1607    }
1608    return (int64_t)st.st_blocks * 512;
1609}
1610
1611static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
1612{
1613    int fd;
1614    int result = 0;
1615    int64_t total_size = 0;
1616    bool nocow = false;
1617    PreallocMode prealloc;
1618    char *buf = NULL;
1619    Error *local_err = NULL;
1620
1621    strstart(filename, "file:", &filename);
1622
1623    /* Read out options */
1624    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
1625                          BDRV_SECTOR_SIZE);
1626    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
1627    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
1628    prealloc = qapi_enum_parse(PreallocMode_lookup, buf,
1629                               PREALLOC_MODE__MAX, PREALLOC_MODE_OFF,
1630                               &local_err);
1631    g_free(buf);
1632    if (local_err) {
1633        error_propagate(errp, local_err);
1634        result = -EINVAL;
1635        goto out;
1636    }
1637
1638    fd = qemu_open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY,
1639                   0644);
1640    if (fd < 0) {
1641        result = -errno;
1642        error_setg_errno(errp, -result, "Could not create file");
1643        goto out;
1644    }
1645
1646    if (nocow) {
1647#ifdef __linux__
1648        /* Set NOCOW flag to solve performance issue on fs like btrfs.
1649         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
1650         * will be ignored since any failure of this operation should not
1651         * block the left work.
1652         */
1653        int attr;
1654        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
1655            attr |= FS_NOCOW_FL;
1656            ioctl(fd, FS_IOC_SETFLAGS, &attr);
1657        }
1658#endif
1659    }
1660
1661    if (ftruncate(fd, total_size) != 0) {
1662        result = -errno;
1663        error_setg_errno(errp, -result, "Could not resize file");
1664        goto out_close;
1665    }
1666
1667    switch (prealloc) {
1668#ifdef CONFIG_POSIX_FALLOCATE
1669    case PREALLOC_MODE_FALLOC:
1670        /* posix_fallocate() doesn't set errno. */
1671        result = -posix_fallocate(fd, 0, total_size);
1672        if (result != 0) {
1673            error_setg_errno(errp, -result,
1674                             "Could not preallocate data for the new file");
1675        }
1676        break;
1677#endif
1678    case PREALLOC_MODE_FULL:
1679    {
1680        int64_t num = 0, left = total_size;
1681        buf = g_malloc0(65536);
1682
1683        while (left > 0) {
1684            num = MIN(left, 65536);
1685            result = write(fd, buf, num);
1686            if (result < 0) {
1687                result = -errno;
1688                error_setg_errno(errp, -result,
1689                                 "Could not write to the new file");
1690                break;
1691            }
1692            left -= result;
1693        }
1694        if (result >= 0) {
1695            result = fsync(fd);
1696            if (result < 0) {
1697                result = -errno;
1698                error_setg_errno(errp, -result,
1699                                 "Could not flush new file to disk");
1700            }
1701        }
1702        g_free(buf);
1703        break;
1704    }
1705    case PREALLOC_MODE_OFF:
1706        break;
1707    default:
1708        result = -EINVAL;
1709        error_setg(errp, "Unsupported preallocation mode: %s",
1710                   PreallocMode_lookup[prealloc]);
1711        break;
1712    }
1713
1714out_close:
1715    if (qemu_close(fd) != 0 && result == 0) {
1716        result = -errno;
1717        error_setg_errno(errp, -result, "Could not close the new file");
1718    }
1719out:
1720    return result;
1721}
1722
1723/*
1724 * Find allocation range in @bs around offset @start.
1725 * May change underlying file descriptor's file offset.
1726 * If @start is not in a hole, store @start in @data, and the
1727 * beginning of the next hole in @hole, and return 0.
1728 * If @start is in a non-trailing hole, store @start in @hole and the
1729 * beginning of the next non-hole in @data, and return 0.
1730 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
1731 * If we can't find out, return a negative errno other than -ENXIO.
1732 */
1733static int find_allocation(BlockDriverState *bs, off_t start,
1734                           off_t *data, off_t *hole)
1735{
1736#if defined SEEK_HOLE && defined SEEK_DATA
1737    BDRVRawState *s = bs->opaque;
1738    off_t offs;
1739
1740    /*
1741     * SEEK_DATA cases:
1742     * D1. offs == start: start is in data
1743     * D2. offs > start: start is in a hole, next data at offs
1744     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
1745     *                              or start is beyond EOF
1746     *     If the latter happens, the file has been truncated behind
1747     *     our back since we opened it.  All bets are off then.
1748     *     Treating like a trailing hole is simplest.
1749     * D4. offs < 0, errno != ENXIO: we learned nothing
1750     */
1751    offs = lseek(s->fd, start, SEEK_DATA);
1752    if (offs < 0) {
1753        return -errno;          /* D3 or D4 */
1754    }
1755    assert(offs >= start);
1756
1757    if (offs > start) {
1758        /* D2: in hole, next data at offs */
1759        *hole = start;
1760        *data = offs;
1761        return 0;
1762    }
1763
1764    /* D1: in data, end not yet known */
1765
1766    /*
1767     * SEEK_HOLE cases:
1768     * H1. offs == start: start is in a hole
1769     *     If this happens here, a hole has been dug behind our back
1770     *     since the previous lseek().
1771     * H2. offs > start: either start is in data, next hole at offs,
1772     *                   or start is in trailing hole, EOF at offs
1773     *     Linux treats trailing holes like any other hole: offs ==
1774     *     start.  Solaris seeks to EOF instead: offs > start (blech).
1775     *     If that happens here, a hole has been dug behind our back
1776     *     since the previous lseek().
1777     * H3. offs < 0, errno = ENXIO: start is beyond EOF
1778     *     If this happens, the file has been truncated behind our
1779     *     back since we opened it.  Treat it like a trailing hole.
1780     * H4. offs < 0, errno != ENXIO: we learned nothing
1781     *     Pretend we know nothing at all, i.e. "forget" about D1.
1782     */
1783    offs = lseek(s->fd, start, SEEK_HOLE);
1784    if (offs < 0) {
1785        return -errno;          /* D1 and (H3 or H4) */
1786    }
1787    assert(offs >= start);
1788
1789    if (offs > start) {
1790        /*
1791         * D1 and H2: either in data, next hole at offs, or it was in
1792         * data but is now in a trailing hole.  In the latter case,
1793         * all bets are off.  Treating it as if it there was data all
1794         * the way to EOF is safe, so simply do that.
1795         */
1796        *data = start;
1797        *hole = offs;
1798        return 0;
1799    }
1800
1801    /* D1 and H1 */
1802    return -EBUSY;
1803#else
1804    return -ENOTSUP;
1805#endif
1806}
1807
1808/*
1809 * Returns the allocation status of the specified sectors.
1810 *
1811 * If 'sector_num' is beyond the end of the disk image the return value is 0
1812 * and 'pnum' is set to 0.
1813 *
1814 * 'pnum' is set to the number of sectors (including and immediately following
1815 * the specified sector) that are known to be in the same
1816 * allocated/unallocated state.
1817 *
1818 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1819 * beyond the end of the disk image it will be clamped.
1820 */
1821static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
1822                                                    int64_t sector_num,
1823                                                    int nb_sectors, int *pnum,
1824                                                    BlockDriverState **file)
1825{
1826    off_t start, data = 0, hole = 0;
1827    int64_t total_size;
1828    int ret;
1829
1830    ret = fd_open(bs);
1831    if (ret < 0) {
1832        return ret;
1833    }
1834
1835    start = sector_num * BDRV_SECTOR_SIZE;
1836    total_size = bdrv_getlength(bs);
1837    if (total_size < 0) {
1838        return total_size;
1839    } else if (start >= total_size) {
1840        *pnum = 0;
1841        return 0;
1842    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
1843        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
1844    }
1845
1846    ret = find_allocation(bs, start, &data, &hole);
1847    if (ret == -ENXIO) {
1848        /* Trailing hole */
1849        *pnum = nb_sectors;
1850        ret = BDRV_BLOCK_ZERO;
1851    } else if (ret < 0) {
1852        /* No info available, so pretend there are no holes */
1853        *pnum = nb_sectors;
1854        ret = BDRV_BLOCK_DATA;
1855    } else if (data == start) {
1856        /* On a data extent, compute sectors to the end of the extent,
1857         * possibly including a partial sector at EOF. */
1858        *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
1859        ret = BDRV_BLOCK_DATA;
1860    } else {
1861        /* On a hole, compute sectors to the beginning of the next extent.  */
1862        assert(hole == start);
1863        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
1864        ret = BDRV_BLOCK_ZERO;
1865    }
1866    *file = bs;
1867    return ret | BDRV_BLOCK_OFFSET_VALID | start;
1868}
1869
1870static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs,
1871    int64_t sector_num, int nb_sectors,
1872    BlockCompletionFunc *cb, void *opaque)
1873{
1874    BDRVRawState *s = bs->opaque;
1875
1876    return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors,
1877                       cb, opaque, QEMU_AIO_DISCARD);
1878}
1879
1880static int coroutine_fn raw_co_write_zeroes(
1881    BlockDriverState *bs, int64_t sector_num,
1882    int nb_sectors, BdrvRequestFlags flags)
1883{
1884    BDRVRawState *s = bs->opaque;
1885
1886    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
1887        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
1888                              QEMU_AIO_WRITE_ZEROES);
1889    } else if (s->discard_zeroes) {
1890        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
1891                              QEMU_AIO_DISCARD);
1892    }
1893    return -ENOTSUP;
1894}
1895
1896static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1897{
1898    BDRVRawState *s = bs->opaque;
1899
1900    bdi->unallocated_blocks_are_zero = s->discard_zeroes;
1901    bdi->can_write_zeroes_with_unmap = s->discard_zeroes;
1902    return 0;
1903}
1904
/*
 * Creation options for this driver.  The list is referenced from
 * bdrv_file.create_opts, and raw_create() reads each of the three keys
 * declared here.
 */
static QemuOptsList raw_create_opts = {
    .name = "raw-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
    .desc = {
        {
            /* raw_create() rounds this up to a multiple of BDRV_SECTOR_SIZE */
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            /* raw_create() only acts on this when built for Linux */
            .name = BLOCK_OPT_NOCOW,
            .type = QEMU_OPT_BOOL,
            .help = "Turn off copy-on-write (valid only on btrfs)"
        },
        {
            /* Parsed by raw_create() into a PreallocMode value */
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off, falloc, full)"
        },
        { /* end of list */ }
    }
};
1927
/*
 * Block driver definition for the "file" protocol: wires the raw_*()
 * functions of this file into the generic block layer callbacks.
 */
BlockDriver bdrv_file = {
    .format_name = "file",
    .protocol_name = "file",
    .instance_size = sizeof(BDRVRawState),
    .bdrv_needs_filename = true,
    .bdrv_probe = NULL, /* no probe for protocols */
    .bdrv_parse_filename = raw_parse_filename,
    /* open / reopen / close / create lifecycle */
    .bdrv_file_open = raw_open,
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit = raw_reopen_commit,
    .bdrv_reopen_abort = raw_reopen_abort,
    .bdrv_close = raw_close,
    .bdrv_create = raw_create,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
    .bdrv_co_get_block_status = raw_co_get_block_status,
    .bdrv_co_write_zeroes = raw_co_write_zeroes,

    /* asynchronous I/O entry points and request queue plugging */
    .bdrv_aio_readv = raw_aio_readv,
    .bdrv_aio_writev = raw_aio_writev,
    .bdrv_aio_flush = raw_aio_flush,
    .bdrv_aio_discard = raw_aio_discard,
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,
    .bdrv_flush_io_queue = raw_aio_flush_io_queue,

    /* size management and image information */
    .bdrv_truncate = raw_truncate,
    .bdrv_getlength = raw_getlength,
    .bdrv_get_info = raw_get_info,
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,

    /* AioContext attachment */
    .bdrv_detach_aio_context = raw_detach_aio_context,
    .bdrv_attach_aio_context = raw_attach_aio_context,

    /* options understood by raw_create() */
    .create_opts = &raw_create_opts,
};
1965
1966/***********************************************/
1967/* host device */
1968
1969#if defined(__APPLE__) && defined(__MACH__)
1970static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
1971                                CFIndex maxPathSize, int flags);
1972static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
1973{
1974    kern_return_t kernResult = KERN_FAILURE;
1975    mach_port_t     masterPort;
1976    CFMutableDictionaryRef  classesToMatch;
1977    const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
1978    char *mediaType = NULL;
1979
1980    kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
1981    if ( KERN_SUCCESS != kernResult ) {
1982        printf( "IOMasterPort returned %d\n", kernResult );
1983    }
1984
1985    int index;
1986    for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
1987        classesToMatch = IOServiceMatching(matching_array[index]);
1988        if (classesToMatch == NULL) {
1989            error_report("IOServiceMatching returned NULL for %s",
1990                         matching_array[index]);
1991            continue;
1992        }
1993        CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
1994                             kCFBooleanTrue);
1995        kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
1996                                                  mediaIterator);
1997        if (kernResult != KERN_SUCCESS) {
1998            error_report("Note: IOServiceGetMatchingServices returned %d",
1999                         kernResult);
2000            continue;
2001        }
2002
2003        /* If a match was found, leave the loop */
2004        if (*mediaIterator != 0) {
2005            DPRINTF("Matching using %s\n", matching_array[index]);
2006            mediaType = g_strdup(matching_array[index]);
2007            break;
2008        }
2009    }
2010    return mediaType;
2011}
2012
2013kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
2014                         CFIndex maxPathSize, int flags)
2015{
2016    io_object_t     nextMedia;
2017    kern_return_t   kernResult = KERN_FAILURE;
2018    *bsdPath = '\0';
2019    nextMedia = IOIteratorNext( mediaIterator );
2020    if ( nextMedia )
2021    {
2022        CFTypeRef   bsdPathAsCFString;
2023    bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
2024        if ( bsdPathAsCFString ) {
2025            size_t devPathLength;
2026            strcpy( bsdPath, _PATH_DEV );
2027            if (flags & BDRV_O_NOCACHE) {
2028                strcat(bsdPath, "r");
2029            }
2030            devPathLength = strlen( bsdPath );
2031            if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
2032                kernResult = KERN_SUCCESS;
2033            }
2034            CFRelease( bsdPathAsCFString );
2035        }
2036        IOObjectRelease( nextMedia );
2037    }
2038
2039    return kernResult;
2040}
2041
2042/* Sets up a real cdrom for use in QEMU */
2043static bool setup_cdrom(char *bsd_path, Error **errp)
2044{
2045    int index, num_of_test_partitions = 2, fd;
2046    char test_partition[MAXPATHLEN];
2047    bool partition_found = false;
2048
2049    /* look for a working partition */
2050    for (index = 0; index < num_of_test_partitions; index++) {
2051        snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
2052                 index);
2053        fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE);
2054        if (fd >= 0) {
2055            partition_found = true;
2056            qemu_close(fd);
2057            break;
2058        }
2059    }
2060
2061    /* if a working partition on the device was not found */
2062    if (partition_found == false) {
2063        error_setg(errp, "Failed to find a working partition on disc");
2064    } else {
2065        DPRINTF("Using %s as optical disc\n", test_partition);
2066        pstrcpy(bsd_path, MAXPATHLEN, test_partition);
2067    }
2068    return partition_found;
2069}
2070
/*
 * Reports, via error_report(), a hint that the device may be mounted on the
 * desktop together with the diskutil commands to unmount and mount it.
 */
static void print_unmounting_directions(const char *file_name)
{
    error_report("If device %s is mounted on the desktop, unmount it first "
                 "before using it in QEMU",
                 file_name);
    error_report("Command to unmount device: diskutil unmountDisk %s",
                 file_name);
    error_report("Command to mount device: diskutil mountDisk %s",
                 file_name);
}
2080
2081#endif /* defined(__APPLE__) && defined(__MACH__) */
2082
2083static int hdev_probe_device(const char *filename)
2084{
2085    struct stat st;
2086
2087    /* allow a dedicated CD-ROM driver to match with a higher priority */
2088    if (strstart(filename, "/dev/cdrom", NULL))
2089        return 50;
2090
2091    if (stat(filename, &st) >= 0 &&
2092            (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
2093        return 100;
2094    }
2095
2096    return 0;
2097}
2098
2099static int check_hdev_writable(BDRVRawState *s)
2100{
2101#if defined(BLKROGET)
2102    /* Linux block devices can be configured "read-only" using blockdev(8).
2103     * This is independent of device node permissions and therefore open(2)
2104     * with O_RDWR succeeds.  Actual writes fail with EPERM.
2105     *
2106     * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
2107     * check for read-only block devices so that Linux block devices behave
2108     * properly.
2109     */
2110    struct stat st;
2111    int readonly = 0;
2112
2113    if (fstat(s->fd, &st)) {
2114        return -errno;
2115    }
2116
2117    if (!S_ISBLK(st.st_mode)) {
2118        return 0;
2119    }
2120
2121    if (ioctl(s->fd, BLKROGET, &readonly) < 0) {
2122        return -errno;
2123    }
2124
2125    if (readonly) {
2126        return -EACCES;
2127    }
2128#endif /* defined(BLKROGET) */
2129    return 0;
2130}
2131
2132static void hdev_parse_filename(const char *filename, QDict *options,
2133                                Error **errp)
2134{
2135    /* The prefix is optional, just as for "file". */
2136    strstart(filename, "host_device:", &filename);
2137
2138    qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename)));
2139}
2140
2141static bool hdev_is_sg(BlockDriverState *bs)
2142{
2143
2144#if defined(__linux__)
2145
2146    struct stat st;
2147    struct sg_scsi_id scsiid;
2148    int sg_version;
2149
2150    if (stat(bs->filename, &st) >= 0 && S_ISCHR(st.st_mode) &&
2151        !bdrv_ioctl(bs, SG_GET_VERSION_NUM, &sg_version) &&
2152        !bdrv_ioctl(bs, SG_GET_SCSI_ID, &scsiid)) {
2153        DPRINTF("SG device found: type=%d, version=%d\n",
2154            scsiid.scsi_type, sg_version);
2155        return true;
2156    }
2157
2158#endif
2159
2160    return false;
2161}
2162
/*
 * Open a host device for the host_device driver.
 *
 * bs:      block driver state; bs->opaque is the BDRVRawState to fill in
 * options: open options; the "filename" entry names the device
 * flags:   BDRV_O_* open flags
 * errp:    receives a description of the failure
 *
 * On Mac OS X the literal name "/dev/cdrom" is first resolved to the BSD path
 * of an ejectable optical medium and the "filename" entry in options is
 * replaced with that path; -ENOENT is returned when the resolution fails.
 *
 * The device is then opened through raw_open_common().  When BDRV_O_RDWR is
 * requested and check_hdev_writable() reports an error, the device is closed
 * again and that error is returned.
 *
 * Returns the non-negative result of raw_open_common() on success and a
 * negative errno value on failure.
 */
static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
{
    BDRVRawState *s = bs->opaque;
    Error *local_err = NULL;
    int ret;

#if defined(__APPLE__) && defined(__MACH__)
    const char *filename = qdict_get_str(options, "filename");
    /* Stays empty unless "/dev/cdrom" gets resolved to a real device below */
    char bsd_path[MAXPATHLEN] = "";
    bool error_occurred = false;

    /* If using a real cdrom */
    if (strcmp(filename, "/dev/cdrom") == 0) {
        char *mediaType = NULL;
        kern_return_t ret_val;
        io_iterator_t mediaIterator = 0;

        /* mediaType is released with g_free() at hdev_open_Mac_error */
        mediaType = FindEjectableOpticalMedia(&mediaIterator);
        if (mediaType == NULL) {
            error_setg(errp, "Please make sure your CD/DVD is in the optical"
                       " drive");
            error_occurred = true;
            goto hdev_open_Mac_error;
        }

        ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
        if (ret_val != KERN_SUCCESS) {
            error_setg(errp, "Could not get BSD path for optical drive");
            error_occurred = true;
            goto hdev_open_Mac_error;
        }

        /* If a real optical drive was not found */
        if (bsd_path[0] == '\0') {
            error_setg(errp, "Failed to obtain bsd path for optical drive");
            error_occurred = true;
            goto hdev_open_Mac_error;
        }

        /*
         * If using a cdrom disc and finding a partition on the disc failed.
         * NOTE(review): 9 is presumably the length of kIOCDMediaClass -
         * confirm.
         */
        if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
            setup_cdrom(bsd_path, errp) == false) {
            print_unmounting_directions(bsd_path);
            error_occurred = true;
            goto hdev_open_Mac_error;
        }

        /* From here on the resolved path is what gets opened */
        qdict_put(options, "filename", qstring_from_str(bsd_path));

        /* Cleanup shared by the success path and every failure above */
hdev_open_Mac_error:
        g_free(mediaType);
        if (mediaIterator) {
            IOObjectRelease(mediaIterator);
        }
        if (error_occurred) {
            return -ENOENT;
        }
    }
#endif /* defined(__APPLE__) && defined(__MACH__) */

    s->type = FTYPE_FILE;

    ret = raw_open_common(bs, options, flags, 0, &local_err);
    if (ret < 0) {
        if (local_err) {
            error_propagate(errp, local_err);
        }
#if defined(__APPLE__) && defined(__MACH__)
        /* Prefer the resolved optical drive path for the hint below */
        if (*bsd_path) {
            filename = bsd_path;
        }
        /* if a physical device experienced an error while being opened */
        if (strncmp(filename, "/dev/", 5) == 0) {
            print_unmounting_directions(filename);
        }
#endif /* defined(__APPLE__) && defined(__MACH__) */
        return ret;
    }

    /* Since this does ioctl the device must be already opened */
    bs->sg = hdev_is_sg(bs);

    /* A read-write open must fail when the device itself rejects writes */
    if (flags & BDRV_O_RDWR) {
        ret = check_hdev_writable(s);
        if (ret < 0) {
            raw_close(bs);
            error_setg_errno(errp, -ret, "The device is not writable");
            return ret;
        }
    }

    return ret;
}
2257
2258#if defined(__linux__)
2259
2260static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
2261        unsigned long int req, void *buf,
2262        BlockCompletionFunc *cb, void *opaque)
2263{
2264    BDRVRawState *s = bs->opaque;
2265    RawPosixAIOData *acb;
2266    ThreadPool *pool;
2267
2268    if (fd_open(bs) < 0)
2269        return NULL;
2270
2271    acb = g_new(RawPosixAIOData, 1);
2272    acb->bs = bs;
2273    acb->aio_type = QEMU_AIO_IOCTL;
2274    acb->aio_fildes = s->fd;
2275    acb->aio_offset = 0;
2276    acb->aio_ioctl_buf = buf;
2277    acb->aio_ioctl_cmd = req;
2278    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
2279    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
2280}
2281#endif /* linux */
2282
2283static int fd_open(BlockDriverState *bs)
2284{
2285    BDRVRawState *s = bs->opaque;
2286
2287    /* this is just to ensure s->fd is sane (its called by io ops) */
2288    if (s->fd >= 0)
2289        return 0;
2290    return -EIO;
2291}
2292
2293static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs,
2294    int64_t sector_num, int nb_sectors,
2295    BlockCompletionFunc *cb, void *opaque)
2296{
2297    BDRVRawState *s = bs->opaque;
2298
2299    if (fd_open(bs) < 0) {
2300        return NULL;
2301    }
2302    return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors,
2303                       cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
2304}
2305
2306static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs,
2307    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
2308{
2309    BDRVRawState *s = bs->opaque;
2310    int rc;
2311
2312    rc = fd_open(bs);
2313    if (rc < 0) {
2314        return rc;
2315    }
2316    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
2317        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
2318                              QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV);
2319    } else if (s->discard_zeroes) {
2320        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
2321                              QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
2322    }
2323    return -ENOTSUP;
2324}
2325
2326static int hdev_create(const char *filename, QemuOpts *opts,
2327                       Error **errp)
2328{
2329    int fd;
2330    int ret = 0;
2331    struct stat stat_buf;
2332    int64_t total_size = 0;
2333    bool has_prefix;
2334
2335    /* This function is used by both protocol block drivers and therefore either
2336     * of these prefixes may be given.
2337     * The return value has to be stored somewhere, otherwise this is an error
2338     * due to -Werror=unused-value. */
2339    has_prefix =
2340        strstart(filename, "host_device:", &filename) ||
2341        strstart(filename, "host_cdrom:" , &filename);
2342
2343    (void)has_prefix;
2344
2345    ret = raw_normalize_devicepath(&filename);
2346    if (ret < 0) {
2347        error_setg_errno(errp, -ret, "Could not normalize device path");
2348        return ret;
2349    }
2350
2351    /* Read out options */
2352    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2353                          BDRV_SECTOR_SIZE);
2354
2355    fd = qemu_open(filename, O_WRONLY | O_BINARY);
2356    if (fd < 0) {
2357        ret = -errno;
2358        error_setg_errno(errp, -ret, "Could not open device");
2359        return ret;
2360    }
2361
2362    if (fstat(fd, &stat_buf) < 0) {
2363        ret = -errno;
2364        error_setg_errno(errp, -ret, "Could not stat device");
2365    } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) {
2366        error_setg(errp,
2367                   "The given file is neither a block nor a character device");
2368        ret = -ENODEV;
2369    } else if (lseek(fd, 0, SEEK_END) < total_size) {
2370        error_setg(errp, "Device is too small");
2371        ret = -ENOSPC;
2372    }
2373
2374    qemu_close(fd);
2375    return ret;
2376}
2377
2378static BlockDriver bdrv_host_device = {
2379    .format_name        = "host_device",
2380    .protocol_name        = "host_device",
2381    .instance_size      = sizeof(BDRVRawState),
2382    .bdrv_needs_filename = true,
2383    .bdrv_probe_device  = hdev_probe_device,
2384    .bdrv_parse_filename = hdev_parse_filename,
2385    .bdrv_file_open     = hdev_open,
2386    .bdrv_close         = raw_close,
2387    .bdrv_reopen_prepare = raw_reopen_prepare,
2388    .bdrv_reopen_commit  = raw_reopen_commit,
2389    .bdrv_reopen_abort   = raw_reopen_abort,
2390    .bdrv_create         = hdev_create,
2391    .create_opts         = &raw_create_opts,
2392    .bdrv_co_write_zeroes = hdev_co_write_zeroes,
2393
2394    .bdrv_aio_readv     = raw_aio_readv,
2395    .bdrv_aio_writev    = raw_aio_writev,
2396    .bdrv_aio_flush     = raw_aio_flush,
2397    .bdrv_aio_discard   = hdev_aio_discard,
2398    .bdrv_refresh_limits = raw_refresh_limits,
2399    .bdrv_io_plug = raw_aio_plug,
2400    .bdrv_io_unplug = raw_aio_unplug,
2401    .bdrv_flush_io_queue = raw_aio_flush_io_queue,
2402
2403    .bdrv_truncate      = raw_truncate,
2404    .bdrv_getlength     = raw_getlength,
2405    .bdrv_get_info = raw_get_info,
2406    .bdrv_get_allocated_file_size
2407                        = raw_get_allocated_file_size,
2408    .bdrv_probe_blocksizes = hdev_probe_blocksizes,
2409    .bdrv_probe_geometry = hdev_probe_geometry,
2410
2411    .bdrv_detach_aio_context = raw_detach_aio_context,
2412    .bdrv_attach_aio_context = raw_attach_aio_context,
2413
2414    /* generic scsi device */
2415#ifdef __linux__
2416    .bdrv_aio_ioctl     = hdev_aio_ioctl,
2417#endif
2418};
2419
2420#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2421static void cdrom_parse_filename(const char *filename, QDict *options,
2422                                 Error **errp)
2423{
2424    /* The prefix is optional, just as for "file". */
2425    strstart(filename, "host_cdrom:", &filename);
2426
2427    qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename)));
2428}
2429#endif
2430
2431#ifdef __linux__
2432static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
2433                      Error **errp)
2434{
2435    BDRVRawState *s = bs->opaque;
2436    Error *local_err = NULL;
2437    int ret;
2438
2439    s->type = FTYPE_CD;
2440
2441    /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
2442    ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err);
2443    if (local_err) {
2444        error_propagate(errp, local_err);
2445    }
2446    return ret;
2447}
2448
2449static int cdrom_probe_device(const char *filename)
2450{
2451    int fd, ret;
2452    int prio = 0;
2453    struct stat st;
2454
2455    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
2456    if (fd < 0) {
2457        goto out;
2458    }
2459    ret = fstat(fd, &st);
2460    if (ret == -1 || !S_ISBLK(st.st_mode)) {
2461        goto outc;
2462    }
2463
2464    /* Attempt to detect via a CDROM specific ioctl */
2465    ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
2466    if (ret >= 0)
2467        prio = 100;
2468
2469outc:
2470    qemu_close(fd);
2471out:
2472    return prio;
2473}
2474
2475static bool cdrom_is_inserted(BlockDriverState *bs)
2476{
2477    BDRVRawState *s = bs->opaque;
2478    int ret;
2479
2480    ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
2481    return ret == CDS_DISC_OK;
2482}
2483
2484static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
2485{
2486    BDRVRawState *s = bs->opaque;
2487
2488    if (eject_flag) {
2489        if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
2490            perror("CDROMEJECT");
2491    } else {
2492        if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
2493            perror("CDROMEJECT");
2494    }
2495}
2496
2497static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
2498{
2499    BDRVRawState *s = bs->opaque;
2500
2501    if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
2502        /*
2503         * Note: an error can happen if the distribution automatically
2504         * mounts the CD-ROM
2505         */
2506        /* perror("CDROM_LOCKDOOR"); */
2507    }
2508}
2509
2510static BlockDriver bdrv_host_cdrom = {
2511    .format_name        = "host_cdrom",
2512    .protocol_name      = "host_cdrom",
2513    .instance_size      = sizeof(BDRVRawState),
2514    .bdrv_needs_filename = true,
2515    .bdrv_probe_device  = cdrom_probe_device,
2516    .bdrv_parse_filename = cdrom_parse_filename,
2517    .bdrv_file_open     = cdrom_open,
2518    .bdrv_close         = raw_close,
2519    .bdrv_reopen_prepare = raw_reopen_prepare,
2520    .bdrv_reopen_commit  = raw_reopen_commit,
2521    .bdrv_reopen_abort   = raw_reopen_abort,
2522    .bdrv_create         = hdev_create,
2523    .create_opts         = &raw_create_opts,
2524
2525    .bdrv_aio_readv     = raw_aio_readv,
2526    .bdrv_aio_writev    = raw_aio_writev,
2527    .bdrv_aio_flush     = raw_aio_flush,
2528    .bdrv_refresh_limits = raw_refresh_limits,
2529    .bdrv_io_plug = raw_aio_plug,
2530    .bdrv_io_unplug = raw_aio_unplug,
2531    .bdrv_flush_io_queue = raw_aio_flush_io_queue,
2532
2533    .bdrv_truncate      = raw_truncate,
2534    .bdrv_getlength      = raw_getlength,
2535    .has_variable_length = true,
2536    .bdrv_get_allocated_file_size
2537                        = raw_get_allocated_file_size,
2538
2539    .bdrv_detach_aio_context = raw_detach_aio_context,
2540    .bdrv_attach_aio_context = raw_attach_aio_context,
2541
2542    /* removable device support */
2543    .bdrv_is_inserted   = cdrom_is_inserted,
2544    .bdrv_eject         = cdrom_eject,
2545    .bdrv_lock_medium   = cdrom_lock_medium,
2546
2547    /* generic scsi device */
2548    .bdrv_aio_ioctl     = hdev_aio_ioctl,
2549};
2550#endif /* __linux__ */
2551
2552#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2553static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
2554                      Error **errp)
2555{
2556    BDRVRawState *s = bs->opaque;
2557    Error *local_err = NULL;
2558    int ret;
2559
2560    s->type = FTYPE_CD;
2561
2562    ret = raw_open_common(bs, options, flags, 0, &local_err);
2563    if (ret) {
2564        if (local_err) {
2565            error_propagate(errp, local_err);
2566        }
2567        return ret;
2568    }
2569
2570    /* make sure the door isn't locked at this time */
2571    ioctl(s->fd, CDIOCALLOW);
2572    return 0;
2573}
2574
/*
 * Score how well the host_cdrom driver matches a file name (FreeBSD):
 * 100 for names starting with "/dev/cd" or "/dev/acd", 0 otherwise.
 */
static int cdrom_probe_device(const char *filename)
{
    if (strstart(filename, "/dev/cd", NULL)) {
        return 100;
    }
    if (strstart(filename, "/dev/acd", NULL)) {
        return 100;
    }
    return 0;
}
2582
2583static int cdrom_reopen(BlockDriverState *bs)
2584{
2585    BDRVRawState *s = bs->opaque;
2586    int fd;
2587
2588    /*
2589     * Force reread of possibly changed/newly loaded disc,
2590     * FreeBSD seems to not notice sometimes...
2591     */
2592    if (s->fd >= 0)
2593        qemu_close(s->fd);
2594    fd = qemu_open(bs->filename, s->open_flags, 0644);
2595    if (fd < 0) {
2596        s->fd = -1;
2597        return -EIO;
2598    }
2599    s->fd = fd;
2600
2601    /* make sure the door isn't locked at this time */
2602    ioctl(s->fd, CDIOCALLOW);
2603    return 0;
2604}
2605
2606static bool cdrom_is_inserted(BlockDriverState *bs)
2607{
2608    return raw_getlength(bs) > 0;
2609}
2610
2611static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
2612{
2613    BDRVRawState *s = bs->opaque;
2614
2615    if (s->fd < 0)
2616        return;
2617
2618    (void) ioctl(s->fd, CDIOCALLOW);
2619
2620    if (eject_flag) {
2621        if (ioctl(s->fd, CDIOCEJECT) < 0)
2622            perror("CDIOCEJECT");
2623    } else {
2624        if (ioctl(s->fd, CDIOCCLOSE) < 0)
2625            perror("CDIOCCLOSE");
2626    }
2627
2628    cdrom_reopen(bs);
2629}
2630
2631static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
2632{
2633    BDRVRawState *s = bs->opaque;
2634
2635    if (s->fd < 0)
2636        return;
2637    if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
2638        /*
2639         * Note: an error can happen if the distribution automatically
2640         * mounts the CD-ROM
2641         */
2642        /* perror("CDROM_LOCKDOOR"); */
2643    }
2644}
2645
2646static BlockDriver bdrv_host_cdrom = {
2647    .format_name        = "host_cdrom",
2648    .protocol_name      = "host_cdrom",
2649    .instance_size      = sizeof(BDRVRawState),
2650    .bdrv_needs_filename = true,
2651    .bdrv_probe_device  = cdrom_probe_device,
2652    .bdrv_parse_filename = cdrom_parse_filename,
2653    .bdrv_file_open     = cdrom_open,
2654    .bdrv_close         = raw_close,
2655    .bdrv_reopen_prepare = raw_reopen_prepare,
2656    .bdrv_reopen_commit  = raw_reopen_commit,
2657    .bdrv_reopen_abort   = raw_reopen_abort,
2658    .bdrv_create        = hdev_create,
2659    .create_opts        = &raw_create_opts,
2660
2661    .bdrv_aio_readv     = raw_aio_readv,
2662    .bdrv_aio_writev    = raw_aio_writev,
2663    .bdrv_aio_flush     = raw_aio_flush,
2664    .bdrv_refresh_limits = raw_refresh_limits,
2665    .bdrv_io_plug = raw_aio_plug,
2666    .bdrv_io_unplug = raw_aio_unplug,
2667    .bdrv_flush_io_queue = raw_aio_flush_io_queue,
2668
2669    .bdrv_truncate      = raw_truncate,
2670    .bdrv_getlength      = raw_getlength,
2671    .has_variable_length = true,
2672    .bdrv_get_allocated_file_size
2673                        = raw_get_allocated_file_size,
2674
2675    .bdrv_detach_aio_context = raw_detach_aio_context,
2676    .bdrv_attach_aio_context = raw_attach_aio_context,
2677
2678    /* removable device support */
2679    .bdrv_is_inserted   = cdrom_is_inserted,
2680    .bdrv_eject         = cdrom_eject,
2681    .bdrv_lock_medium   = cdrom_lock_medium,
2682};
2683#endif /* __FreeBSD__ */
2684
2685static void bdrv_file_init(void)
2686{
2687    /*
2688     * Register all the drivers.  Note that order is important, the driver
2689     * registered last will get probed first.
2690     */
2691    bdrv_register(&bdrv_file);
2692    bdrv_register(&bdrv_host_device);
2693#ifdef __linux__
2694    bdrv_register(&bdrv_host_cdrom);
2695#endif
2696#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2697    bdrv_register(&bdrv_host_cdrom);
2698#endif
2699}
2700
2701block_init(bdrv_file_init);
2702