qemu/block/raw-posix.c
<<
>>
Prefs
   1/*
   2 * Block driver for RAW files (posix)
   3 *
   4 * Copyright (c) 2006 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24#include "qemu/osdep.h"
  25#include "qapi/error.h"
  26#include "qemu/cutils.h"
  27#include "qemu/error-report.h"
  28#include "qemu/timer.h"
  29#include "qemu/log.h"
  30#include "block/block_int.h"
  31#include "qemu/module.h"
  32#include "trace.h"
  33#include "block/thread-pool.h"
  34#include "qemu/iov.h"
  35#include "block/raw-aio.h"
  36#include "qapi/util.h"
  37#include "qapi/qmp/qstring.h"
  38
  39#if defined(__APPLE__) && (__MACH__)
  40#include <paths.h>
  41#include <sys/param.h>
  42#include <IOKit/IOKitLib.h>
  43#include <IOKit/IOBSD.h>
  44#include <IOKit/storage/IOMediaBSDClient.h>
  45#include <IOKit/storage/IOMedia.h>
  46#include <IOKit/storage/IOCDMedia.h>
  47//#include <IOKit/storage/IOCDTypes.h>
  48#include <IOKit/storage/IODVDMedia.h>
  49#include <CoreFoundation/CoreFoundation.h>
  50#endif
  51
  52#ifdef __sun__
  53#define _POSIX_PTHREAD_SEMANTICS 1
  54#include <sys/dkio.h>
  55#endif
  56#ifdef __linux__
  57#include <sys/ioctl.h>
  58#include <sys/param.h>
  59#include <linux/cdrom.h>
  60#include <linux/fd.h>
  61#include <linux/fs.h>
  62#include <linux/hdreg.h>
  63#include <scsi/sg.h>
  64#ifdef __s390__
  65#include <asm/dasd.h>
  66#endif
  67#ifndef FS_NOCOW_FL
  68#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
  69#endif
  70#endif
  71#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
  72#include <linux/falloc.h>
  73#endif
  74#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
  75#include <sys/disk.h>
  76#include <sys/cdio.h>
  77#endif
  78
  79#ifdef __OpenBSD__
  80#include <sys/ioctl.h>
  81#include <sys/disklabel.h>
  82#include <sys/dkio.h>
  83#endif
  84
  85#ifdef __NetBSD__
  86#include <sys/ioctl.h>
  87#include <sys/disklabel.h>
  88#include <sys/dkio.h>
  89#include <sys/disk.h>
  90#endif
  91
  92#ifdef __DragonFly__
  93#include <sys/ioctl.h>
  94#include <sys/diskslice.h>
  95#endif
  96
  97#ifdef CONFIG_XFS
  98#include <xfs/xfs.h>
  99#endif
 100
 101//#define DEBUG_BLOCK
 102
/*
 * Debug tracing for this driver.
 *
 * DEBUG_BLOCK_PRINT is 1 when DEBUG_BLOCK is defined and 0 otherwise.
 * DPRINTF() always expands to a printf() call guarded by that constant, so
 * the format string and arguments are still seen by the compiler when
 * tracing is disabled.
 */
#ifdef DEBUG_BLOCK
# define DEBUG_BLOCK_PRINT 1
#else
# define DEBUG_BLOCK_PRINT 0
#endif
#define DPRINTF(fmt, ...) \
do { \
    if (DEBUG_BLOCK_PRINT) { \
        printf(fmt, ## __VA_ARGS__); \
    } \
} while (0)
 114
 115/* OS X does not have O_DSYNC */
 116#ifndef O_DSYNC
 117#ifdef O_SYNC
 118#define O_DSYNC O_SYNC
 119#elif defined(O_FSYNC)
 120#define O_DSYNC O_FSYNC
 121#endif
 122#endif
 123
 124/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
 125#ifndef O_DIRECT
 126#define O_DIRECT O_DSYNC
 127#endif
 128
 129#define FTYPE_FILE   0
 130#define FTYPE_CD     1
 131
 132#define MAX_BLOCKSIZE   4096
 133
/* Per-BlockDriverState driver state, reached through bs->opaque. */
typedef struct BDRVRawState {
    int fd;             /* open file descriptor; -1 while no file is open */
    int type;           /* FTYPE_FILE or FTYPE_CD */
    int open_flags;     /* open(2) flags the descriptor was opened with */
    size_t buf_align;   /* memory buffer alignment set by
                         * raw_probe_alignment(); 1 if unrestricted */

#ifdef CONFIG_XFS
    bool is_xfs:1;      /* set when platform_test_xfs_fd() accepts the fd */
#endif
    bool has_discard:1;         /* cleared once hole punching returns
                                 * -ENOTSUP */
    bool has_write_zeroes:1;    /* cleared once the zeroing primitive
                                 * returns -ENOTSUP */
    bool discard_zeroes:1;      /* set for regular files, or when
                                 * BLKDISCARDZEROES reports non-zero */
    bool has_fallocate;         /* set for regular files; cleared once
                                 * fallocate() returns -ENOTSUP */
    bool needs_alignment;       /* requests must honour the probed
                                 * alignment (e.g. BDRV_O_NOCACHE) */
} BDRVRawState;
 149
/*
 * State built by raw_reopen_prepare() and stored in BDRVReopenState.opaque
 * until raw_reopen_commit() adopts it or raw_reopen_abort() discards it.
 */
typedef struct BDRVRawReopenState {
    int fd;             /* descriptor opened with the new flags, or -1 */
    int open_flags;     /* open(2) flags derived from the new BDRV flags */
} BDRVRawReopenState;
 154
 155static int fd_open(BlockDriverState *bs);
 156static int64_t raw_getlength(BlockDriverState *bs);
 157
/* Description of one request processed by the handle_aiocb_*() helpers. */
typedef struct RawPosixAIOData {
    BlockDriverState *bs;       /* device the request belongs to */
    int aio_fildes;             /* file descriptor to operate on */
    union {
        struct iovec *aio_iov;  /* scatter/gather list for read/write */
        void *aio_ioctl_buf;    /* argument buffer for ioctl requests */
    };
    int aio_niov;               /* number of entries in aio_iov */
    uint64_t aio_nbytes;        /* request length in bytes */
#define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
    off_t aio_offset;           /* byte offset of the request in the file */
    int aio_type;               /* QEMU_AIO_* request type and flag bits */
} RawPosixAIOData;
 171
 172#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 173static int cdrom_reopen(BlockDriverState *bs);
 174#endif
 175
 176#if defined(__NetBSD__)
 177static int raw_normalize_devicepath(const char **filename)
 178{
 179    static char namebuf[PATH_MAX];
 180    const char *dp, *fname;
 181    struct stat sb;
 182
 183    fname = *filename;
 184    dp = strrchr(fname, '/');
 185    if (lstat(fname, &sb) < 0) {
 186        fprintf(stderr, "%s: stat failed: %s\n",
 187            fname, strerror(errno));
 188        return -errno;
 189    }
 190
 191    if (!S_ISBLK(sb.st_mode)) {
 192        return 0;
 193    }
 194
 195    if (dp == NULL) {
 196        snprintf(namebuf, PATH_MAX, "r%s", fname);
 197    } else {
 198        snprintf(namebuf, PATH_MAX, "%.*s/r%s",
 199            (int)(dp - fname), fname, dp + 1);
 200    }
 201    fprintf(stderr, "%s is a block device", fname);
 202    *filename = namebuf;
 203    fprintf(stderr, ", using %s\n", *filename);
 204
 205    return 0;
 206}
 207#else
 208static int raw_normalize_devicepath(const char **filename)
 209{
 210    return 0;
 211}
 212#endif
 213
 214/*
 215 * Get logical block size via ioctl. On success store it in @sector_size_p.
 216 */
 217static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
 218{
 219    unsigned int sector_size;
 220    bool success = false;
 221
 222    errno = ENOTSUP;
 223
 224    /* Try a few ioctls to get the right size */
 225#ifdef BLKSSZGET
 226    if (ioctl(fd, BLKSSZGET, &sector_size) >= 0) {
 227        *sector_size_p = sector_size;
 228        success = true;
 229    }
 230#endif
 231#ifdef DKIOCGETBLOCKSIZE
 232    if (ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) >= 0) {
 233        *sector_size_p = sector_size;
 234        success = true;
 235    }
 236#endif
 237#ifdef DIOCGSECTORSIZE
 238    if (ioctl(fd, DIOCGSECTORSIZE, &sector_size) >= 0) {
 239        *sector_size_p = sector_size;
 240        success = true;
 241    }
 242#endif
 243
 244    return success ? 0 : -errno;
 245}
 246
 247/**
 248 * Get physical block size of @fd.
 249 * On success, store it in @blk_size and return 0.
 250 * On failure, return -errno.
 251 */
 252static int probe_physical_blocksize(int fd, unsigned int *blk_size)
 253{
 254#ifdef BLKPBSZGET
 255    if (ioctl(fd, BLKPBSZGET, blk_size) < 0) {
 256        return -errno;
 257    }
 258    return 0;
 259#else
 260    return -ENOTSUP;
 261#endif
 262}
 263
 264/* Check if read is allowed with given memory buffer and length.
 265 *
 266 * This function is used to check O_DIRECT memory buffer and request alignment.
 267 */
 268static bool raw_is_io_aligned(int fd, void *buf, size_t len)
 269{
 270    ssize_t ret = pread(fd, buf, len, 0);
 271
 272    if (ret >= 0) {
 273        return true;
 274    }
 275
 276#ifdef __linux__
 277    /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads.  Ignore
 278     * other errors (e.g. real I/O error), which could happen on a failed
 279     * drive, since we only care about probing alignment.
 280     */
 281    if (errno != EINVAL) {
 282        return true;
 283    }
 284#endif
 285
 286    return false;
 287}
 288
 289static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
 290{
 291    BDRVRawState *s = bs->opaque;
 292    char *buf;
 293    size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
 294
 295    /* For SCSI generic devices the alignment is not really used.
 296       With buffered I/O, we don't have any restrictions. */
 297    if (bdrv_is_sg(bs) || !s->needs_alignment) {
 298        bs->bl.request_alignment = 1;
 299        s->buf_align = 1;
 300        return;
 301    }
 302
 303    bs->bl.request_alignment = 0;
 304    s->buf_align = 0;
 305    /* Let's try to use the logical blocksize for the alignment. */
 306    if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
 307        bs->bl.request_alignment = 0;
 308    }
 309#ifdef CONFIG_XFS
 310    if (s->is_xfs) {
 311        struct dioattr da;
 312        if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
 313            bs->bl.request_alignment = da.d_miniosz;
 314            /* The kernel returns wrong information for d_mem */
 315            /* s->buf_align = da.d_mem; */
 316        }
 317    }
 318#endif
 319
 320    /* If we could not get the sizes so far, we can only guess them */
 321    if (!s->buf_align) {
 322        size_t align;
 323        buf = qemu_memalign(max_align, 2 * max_align);
 324        for (align = 512; align <= max_align; align <<= 1) {
 325            if (raw_is_io_aligned(fd, buf + align, max_align)) {
 326                s->buf_align = align;
 327                break;
 328            }
 329        }
 330        qemu_vfree(buf);
 331    }
 332
 333    if (!bs->bl.request_alignment) {
 334        size_t align;
 335        buf = qemu_memalign(s->buf_align, max_align);
 336        for (align = 512; align <= max_align; align <<= 1) {
 337            if (raw_is_io_aligned(fd, buf, align)) {
 338                bs->bl.request_alignment = align;
 339                break;
 340            }
 341        }
 342        qemu_vfree(buf);
 343    }
 344
 345    if (!s->buf_align || !bs->bl.request_alignment) {
 346        error_setg(errp, "Could not find working O_DIRECT alignment");
 347        error_append_hint(errp, "Try cache.direct=off\n");
 348    }
 349}
 350
 351static void raw_parse_flags(int bdrv_flags, int *open_flags)
 352{
 353    assert(open_flags != NULL);
 354
 355    *open_flags |= O_BINARY;
 356    *open_flags &= ~O_ACCMODE;
 357    if (bdrv_flags & BDRV_O_RDWR) {
 358        *open_flags |= O_RDWR;
 359    } else {
 360        *open_flags |= O_RDONLY;
 361    }
 362
 363    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
 364     * and O_DIRECT for no caching. */
 365    if ((bdrv_flags & BDRV_O_NOCACHE)) {
 366        *open_flags |= O_DIRECT;
 367    }
 368}
 369
 370#ifdef CONFIG_LINUX_AIO
 371static bool raw_use_aio(int bdrv_flags)
 372{
 373    /*
 374     * Currently Linux do AIO only for files opened with O_DIRECT
 375     * specified so check NOCACHE flag too
 376     */
 377    return (bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
 378                         (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO);
 379}
 380#endif
 381
 382static void raw_parse_filename(const char *filename, QDict *options,
 383                               Error **errp)
 384{
 385    /* The filename does not have to be prefixed by the protocol name, since
 386     * "file" is the default protocol; therefore, the return value of this
 387     * function call can be ignored. */
 388    strstart(filename, "file:", &filename);
 389
 390    qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename)));
 391}
 392
/*
 * Runtime options understood by this driver.  raw_open_common() absorbs
 * them from the options QDict; only "filename" is defined.
 */
static QemuOptsList raw_runtime_opts = {
    .name = "raw",
    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
    .desc = {
        {
            .name = "filename",
            .type = QEMU_OPT_STRING,
            .help = "File name of the image",
        },
        { /* end of list */ }
    },
};
 405
 406static int raw_open_common(BlockDriverState *bs, QDict *options,
 407                           int bdrv_flags, int open_flags, Error **errp)
 408{
 409    BDRVRawState *s = bs->opaque;
 410    QemuOpts *opts;
 411    Error *local_err = NULL;
 412    const char *filename = NULL;
 413    int fd, ret;
 414    struct stat st;
 415
 416    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
 417    qemu_opts_absorb_qdict(opts, options, &local_err);
 418    if (local_err) {
 419        error_propagate(errp, local_err);
 420        ret = -EINVAL;
 421        goto fail;
 422    }
 423
 424    filename = qemu_opt_get(opts, "filename");
 425
 426    ret = raw_normalize_devicepath(&filename);
 427    if (ret != 0) {
 428        error_setg_errno(errp, -ret, "Could not normalize device path");
 429        goto fail;
 430    }
 431
 432    s->open_flags = open_flags;
 433    raw_parse_flags(bdrv_flags, &s->open_flags);
 434
 435    s->fd = -1;
 436    fd = qemu_open(filename, s->open_flags, 0644);
 437    if (fd < 0) {
 438        ret = -errno;
 439        if (ret == -EROFS) {
 440            ret = -EACCES;
 441        }
 442        goto fail;
 443    }
 444    s->fd = fd;
 445
 446#ifdef CONFIG_LINUX_AIO
 447    if (!raw_use_aio(bdrv_flags) && (bdrv_flags & BDRV_O_NATIVE_AIO)) {
 448        error_setg(errp, "aio=native was specified, but it requires "
 449                         "cache.direct=on, which was not specified.");
 450        ret = -EINVAL;
 451        goto fail;
 452    }
 453#else
 454    if (bdrv_flags & BDRV_O_NATIVE_AIO) {
 455        error_setg(errp, "aio=native was specified, but is not supported "
 456                         "in this build.");
 457        ret = -EINVAL;
 458        goto fail;
 459    }
 460#endif /* !defined(CONFIG_LINUX_AIO) */
 461
 462    s->has_discard = true;
 463    s->has_write_zeroes = true;
 464    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
 465    if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
 466        s->needs_alignment = true;
 467    }
 468
 469    if (fstat(s->fd, &st) < 0) {
 470        ret = -errno;
 471        error_setg_errno(errp, errno, "Could not stat file");
 472        goto fail;
 473    }
 474    if (S_ISREG(st.st_mode)) {
 475        s->discard_zeroes = true;
 476        s->has_fallocate = true;
 477    }
 478    if (S_ISBLK(st.st_mode)) {
 479#ifdef BLKDISCARDZEROES
 480        unsigned int arg;
 481        if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
 482            s->discard_zeroes = true;
 483        }
 484#endif
 485#ifdef __linux__
 486        /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
 487         * not rely on the contents of discarded blocks unless using O_DIRECT.
 488         * Same for BLKZEROOUT.
 489         */
 490        if (!(bs->open_flags & BDRV_O_NOCACHE)) {
 491            s->discard_zeroes = false;
 492            s->has_write_zeroes = false;
 493        }
 494#endif
 495    }
 496#ifdef __FreeBSD__
 497    if (S_ISCHR(st.st_mode)) {
 498        /*
 499         * The file is a char device (disk), which on FreeBSD isn't behind
 500         * a pager, so force all requests to be aligned. This is needed
 501         * so QEMU makes sure all IO operations on the device are aligned
 502         * to sector size, or else FreeBSD will reject them with EINVAL.
 503         */
 504        s->needs_alignment = true;
 505    }
 506#endif
 507
 508#ifdef CONFIG_XFS
 509    if (platform_test_xfs_fd(s->fd)) {
 510        s->is_xfs = true;
 511    }
 512#endif
 513
 514    ret = 0;
 515fail:
 516    if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
 517        unlink(filename);
 518    }
 519    qemu_opts_del(opts);
 520    return ret;
 521}
 522
 523static int raw_open(BlockDriverState *bs, QDict *options, int flags,
 524                    Error **errp)
 525{
 526    BDRVRawState *s = bs->opaque;
 527
 528    s->type = FTYPE_FILE;
 529    return raw_open_common(bs, options, flags, 0, errp);
 530}
 531
 532static int raw_reopen_prepare(BDRVReopenState *state,
 533                              BlockReopenQueue *queue, Error **errp)
 534{
 535    BDRVRawState *s;
 536    BDRVRawReopenState *raw_s;
 537    int ret = 0;
 538    Error *local_err = NULL;
 539
 540    assert(state != NULL);
 541    assert(state->bs != NULL);
 542
 543    s = state->bs->opaque;
 544
 545    state->opaque = g_new0(BDRVRawReopenState, 1);
 546    raw_s = state->opaque;
 547
 548    if (s->type == FTYPE_CD) {
 549        raw_s->open_flags |= O_NONBLOCK;
 550    }
 551
 552    raw_parse_flags(state->flags, &raw_s->open_flags);
 553
 554    raw_s->fd = -1;
 555
 556    int fcntl_flags = O_APPEND | O_NONBLOCK;
 557#ifdef O_NOATIME
 558    fcntl_flags |= O_NOATIME;
 559#endif
 560
 561#ifdef O_ASYNC
 562    /* Not all operating systems have O_ASYNC, and those that don't
 563     * will not let us track the state into raw_s->open_flags (typically
 564     * you achieve the same effect with an ioctl, for example I_SETSIG
 565     * on Solaris). But we do not use O_ASYNC, so that's fine.
 566     */
 567    assert((s->open_flags & O_ASYNC) == 0);
 568#endif
 569
 570    if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
 571        /* dup the original fd */
 572        raw_s->fd = qemu_dup(s->fd);
 573        if (raw_s->fd >= 0) {
 574            ret = fcntl_setfl(raw_s->fd, raw_s->open_flags);
 575            if (ret) {
 576                qemu_close(raw_s->fd);
 577                raw_s->fd = -1;
 578            }
 579        }
 580    }
 581
 582    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
 583    if (raw_s->fd == -1) {
 584        const char *normalized_filename = state->bs->filename;
 585        ret = raw_normalize_devicepath(&normalized_filename);
 586        if (ret < 0) {
 587            error_setg_errno(errp, -ret, "Could not normalize device path");
 588        } else {
 589            assert(!(raw_s->open_flags & O_CREAT));
 590            raw_s->fd = qemu_open(normalized_filename, raw_s->open_flags);
 591            if (raw_s->fd == -1) {
 592                error_setg_errno(errp, errno, "Could not reopen file");
 593                ret = -1;
 594            }
 595        }
 596    }
 597
 598    /* Fail already reopen_prepare() if we can't get a working O_DIRECT
 599     * alignment with the new fd. */
 600    if (raw_s->fd != -1) {
 601        raw_probe_alignment(state->bs, raw_s->fd, &local_err);
 602        if (local_err) {
 603            qemu_close(raw_s->fd);
 604            raw_s->fd = -1;
 605            error_propagate(errp, local_err);
 606            ret = -EINVAL;
 607        }
 608    }
 609
 610    return ret;
 611}
 612
 613static void raw_reopen_commit(BDRVReopenState *state)
 614{
 615    BDRVRawReopenState *raw_s = state->opaque;
 616    BDRVRawState *s = state->bs->opaque;
 617
 618    s->open_flags = raw_s->open_flags;
 619
 620    qemu_close(s->fd);
 621    s->fd = raw_s->fd;
 622
 623    g_free(state->opaque);
 624    state->opaque = NULL;
 625}
 626
 627
 628static void raw_reopen_abort(BDRVReopenState *state)
 629{
 630    BDRVRawReopenState *raw_s = state->opaque;
 631
 632     /* nothing to do if NULL, we didn't get far enough */
 633    if (raw_s == NULL) {
 634        return;
 635    }
 636
 637    if (raw_s->fd >= 0) {
 638        qemu_close(raw_s->fd);
 639        raw_s->fd = -1;
 640    }
 641    g_free(state->opaque);
 642    state->opaque = NULL;
 643}
 644
 645static int hdev_get_max_transfer_length(int fd)
 646{
 647#ifdef BLKSECTGET
 648    int max_sectors = 0;
 649    if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
 650        return max_sectors;
 651    } else {
 652        return -errno;
 653    }
 654#else
 655    return -ENOSYS;
 656#endif
 657}
 658
 659static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 660{
 661    BDRVRawState *s = bs->opaque;
 662    struct stat st;
 663
 664    if (!fstat(s->fd, &st)) {
 665        if (S_ISBLK(st.st_mode)) {
 666            int ret = hdev_get_max_transfer_length(s->fd);
 667            if (ret > 0 && ret <= BDRV_REQUEST_MAX_SECTORS) {
 668                bs->bl.max_transfer = pow2floor(ret << BDRV_SECTOR_BITS);
 669            }
 670        }
 671    }
 672
 673    raw_probe_alignment(bs, s->fd, errp);
 674    bs->bl.min_mem_alignment = s->buf_align;
 675    bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize());
 676}
 677
 678static int check_for_dasd(int fd)
 679{
 680#ifdef BIODASDINFO2
 681    struct dasd_information2_t info = {0};
 682
 683    return ioctl(fd, BIODASDINFO2, &info);
 684#else
 685    return -1;
 686#endif
 687}
 688
 689/**
 690 * Try to get @bs's logical and physical block size.
 691 * On success, store them in @bsz and return zero.
 692 * On failure, return negative errno.
 693 */
 694static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
 695{
 696    BDRVRawState *s = bs->opaque;
 697    int ret;
 698
 699    /* If DASD, get blocksizes */
 700    if (check_for_dasd(s->fd) < 0) {
 701        return -ENOTSUP;
 702    }
 703    ret = probe_logical_blocksize(s->fd, &bsz->log);
 704    if (ret < 0) {
 705        return ret;
 706    }
 707    return probe_physical_blocksize(s->fd, &bsz->phys);
 708}
 709
 710/**
 711 * Try to get @bs's geometry: cyls, heads, sectors.
 712 * On success, store them in @geo and return 0.
 713 * On failure return -errno.
 714 * (Allows block driver to assign default geometry values that guest sees)
 715 */
 716#ifdef __linux__
 717static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
 718{
 719    BDRVRawState *s = bs->opaque;
 720    struct hd_geometry ioctl_geo = {0};
 721
 722    /* If DASD, get its geometry */
 723    if (check_for_dasd(s->fd) < 0) {
 724        return -ENOTSUP;
 725    }
 726    if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
 727        return -errno;
 728    }
 729    /* HDIO_GETGEO may return success even though geo contains zeros
 730       (e.g. certain multipath setups) */
 731    if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
 732        return -ENOTSUP;
 733    }
 734    /* Do not return a geometry for partition */
 735    if (ioctl_geo.start != 0) {
 736        return -ENOTSUP;
 737    }
 738    geo->heads = ioctl_geo.heads;
 739    geo->sectors = ioctl_geo.sectors;
 740    geo->cylinders = ioctl_geo.cylinders;
 741
 742    return 0;
 743}
 744#else /* __linux__ */
 745static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
 746{
 747    return -ENOTSUP;
 748}
 749#endif
 750
 751static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
 752{
 753    int ret;
 754
 755    ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
 756    if (ret == -1) {
 757        return -errno;
 758    }
 759
 760    return 0;
 761}
 762
 763static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
 764{
 765    int ret;
 766
 767    ret = qemu_fdatasync(aiocb->aio_fildes);
 768    if (ret == -1) {
 769        return -errno;
 770    }
 771    return 0;
 772}
 773
 774#ifdef CONFIG_PREADV
 775
 776static bool preadv_present = true;
 777
 778static ssize_t
 779qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 780{
 781    return preadv(fd, iov, nr_iov, offset);
 782}
 783
 784static ssize_t
 785qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 786{
 787    return pwritev(fd, iov, nr_iov, offset);
 788}
 789
 790#else
 791
 792static bool preadv_present = false;
 793
 794static ssize_t
 795qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 796{
 797    return -ENOSYS;
 798}
 799
 800static ssize_t
 801qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 802{
 803    return -ENOSYS;
 804}
 805
 806#endif
 807
 808static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
 809{
 810    ssize_t len;
 811
 812    do {
 813        if (aiocb->aio_type & QEMU_AIO_WRITE)
 814            len = qemu_pwritev(aiocb->aio_fildes,
 815                               aiocb->aio_iov,
 816                               aiocb->aio_niov,
 817                               aiocb->aio_offset);
 818         else
 819            len = qemu_preadv(aiocb->aio_fildes,
 820                              aiocb->aio_iov,
 821                              aiocb->aio_niov,
 822                              aiocb->aio_offset);
 823    } while (len == -1 && errno == EINTR);
 824
 825    if (len == -1) {
 826        return -errno;
 827    }
 828    return len;
 829}
 830
 831/*
 832 * Read/writes the data to/from a given linear buffer.
 833 *
 834 * Returns the number of bytes handles or -errno in case of an error. Short
 835 * reads are only returned if the end of the file is reached.
 836 */
 837static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
 838{
 839    ssize_t offset = 0;
 840    ssize_t len;
 841
 842    while (offset < aiocb->aio_nbytes) {
 843        if (aiocb->aio_type & QEMU_AIO_WRITE) {
 844            len = pwrite(aiocb->aio_fildes,
 845                         (const char *)buf + offset,
 846                         aiocb->aio_nbytes - offset,
 847                         aiocb->aio_offset + offset);
 848        } else {
 849            len = pread(aiocb->aio_fildes,
 850                        buf + offset,
 851                        aiocb->aio_nbytes - offset,
 852                        aiocb->aio_offset + offset);
 853        }
 854        if (len == -1 && errno == EINTR) {
 855            continue;
 856        } else if (len == -1 && errno == EINVAL &&
 857                   (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
 858                   !(aiocb->aio_type & QEMU_AIO_WRITE) &&
 859                   offset > 0) {
 860            /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
 861             * after a short read.  Assume that O_DIRECT short reads only occur
 862             * at EOF.  Therefore this is a short read, not an I/O error.
 863             */
 864            break;
 865        } else if (len == -1) {
 866            offset = -errno;
 867            break;
 868        } else if (len == 0) {
 869            break;
 870        }
 871        offset += len;
 872    }
 873
 874    return offset;
 875}
 876
 877static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
 878{
 879    ssize_t nbytes;
 880    char *buf;
 881
 882    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
 883        /*
 884         * If there is just a single buffer, and it is properly aligned
 885         * we can just use plain pread/pwrite without any problems.
 886         */
 887        if (aiocb->aio_niov == 1) {
 888             return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
 889        }
 890        /*
 891         * We have more than one iovec, and all are properly aligned.
 892         *
 893         * Try preadv/pwritev first and fall back to linearizing the
 894         * buffer if it's not supported.
 895         */
 896        if (preadv_present) {
 897            nbytes = handle_aiocb_rw_vector(aiocb);
 898            if (nbytes == aiocb->aio_nbytes ||
 899                (nbytes < 0 && nbytes != -ENOSYS)) {
 900                return nbytes;
 901            }
 902            preadv_present = false;
 903        }
 904
 905        /*
 906         * XXX(hch): short read/write.  no easy way to handle the reminder
 907         * using these interfaces.  For now retry using plain
 908         * pread/pwrite?
 909         */
 910    }
 911
 912    /*
 913     * Ok, we have to do it the hard way, copy all segments into
 914     * a single aligned buffer.
 915     */
 916    buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
 917    if (buf == NULL) {
 918        return -ENOMEM;
 919    }
 920
 921    if (aiocb->aio_type & QEMU_AIO_WRITE) {
 922        char *p = buf;
 923        int i;
 924
 925        for (i = 0; i < aiocb->aio_niov; ++i) {
 926            memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
 927            p += aiocb->aio_iov[i].iov_len;
 928        }
 929        assert(p - buf == aiocb->aio_nbytes);
 930    }
 931
 932    nbytes = handle_aiocb_rw_linear(aiocb, buf);
 933    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
 934        char *p = buf;
 935        size_t count = aiocb->aio_nbytes, copy;
 936        int i;
 937
 938        for (i = 0; i < aiocb->aio_niov && count; ++i) {
 939            copy = count;
 940            if (copy > aiocb->aio_iov[i].iov_len) {
 941                copy = aiocb->aio_iov[i].iov_len;
 942            }
 943            memcpy(aiocb->aio_iov[i].iov_base, p, copy);
 944            assert(count >= copy);
 945            p     += copy;
 946            count -= copy;
 947        }
 948        assert(count == 0);
 949    }
 950    qemu_vfree(buf);
 951
 952    return nbytes;
 953}
 954
 955#ifdef CONFIG_XFS
 956static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
 957{
 958    struct xfs_flock64 fl;
 959    int err;
 960
 961    memset(&fl, 0, sizeof(fl));
 962    fl.l_whence = SEEK_SET;
 963    fl.l_start = offset;
 964    fl.l_len = bytes;
 965
 966    if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) {
 967        err = errno;
 968        DPRINTF("cannot write zero range (%s)\n", strerror(errno));
 969        return -err;
 970    }
 971
 972    return 0;
 973}
 974
 975static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
 976{
 977    struct xfs_flock64 fl;
 978    int err;
 979
 980    memset(&fl, 0, sizeof(fl));
 981    fl.l_whence = SEEK_SET;
 982    fl.l_start = offset;
 983    fl.l_len = bytes;
 984
 985    if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
 986        err = errno;
 987        DPRINTF("cannot punch hole (%s)\n", strerror(errno));
 988        return -err;
 989    }
 990
 991    return 0;
 992}
 993#endif
 994
 995static int translate_err(int err)
 996{
 997    if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
 998        err == -ENOTTY) {
 999        err = -ENOTSUP;
1000    }
1001    return err;
1002}
1003
1004#ifdef CONFIG_FALLOCATE
/*
 * Call fallocate(@fd, @mode, @offset, @len), retrying while it is
 * interrupted (EINTR).
 *
 * Returns 0 on success, otherwise the negative errno value passed through
 * translate_err().
 */
static int do_fallocate(int fd, int mode, off_t offset, off_t len)
{
    int rc;

    do {
        rc = fallocate(fd, mode, offset, len);
    } while (rc != 0 && errno == EINTR);

    return rc == 0 ? 0 : translate_err(-errno);
}
1014#endif
1015
1016static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
1017{
1018    int ret = -ENOTSUP;
1019    BDRVRawState *s = aiocb->bs->opaque;
1020
1021    if (!s->has_write_zeroes) {
1022        return -ENOTSUP;
1023    }
1024
1025#ifdef BLKZEROOUT
1026    do {
1027        uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1028        if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
1029            return 0;
1030        }
1031    } while (errno == EINTR);
1032
1033    ret = translate_err(-errno);
1034#endif
1035
1036    if (ret == -ENOTSUP) {
1037        s->has_write_zeroes = false;
1038    }
1039    return ret;
1040}
1041
1042static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
1043{
1044#if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS)
1045    BDRVRawState *s = aiocb->bs->opaque;
1046#endif
1047
1048    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1049        return handle_aiocb_write_zeroes_block(aiocb);
1050    }
1051
1052#ifdef CONFIG_XFS
1053    if (s->is_xfs) {
1054        return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
1055    }
1056#endif
1057
1058#ifdef CONFIG_FALLOCATE_ZERO_RANGE
1059    if (s->has_write_zeroes) {
1060        int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
1061                               aiocb->aio_offset, aiocb->aio_nbytes);
1062        if (ret == 0 || ret != -ENOTSUP) {
1063            return ret;
1064        }
1065        s->has_write_zeroes = false;
1066    }
1067#endif
1068
1069#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1070    if (s->has_discard && s->has_fallocate) {
1071        int ret = do_fallocate(s->fd,
1072                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1073                               aiocb->aio_offset, aiocb->aio_nbytes);
1074        if (ret == 0) {
1075            ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1076            if (ret == 0 || ret != -ENOTSUP) {
1077                return ret;
1078            }
1079            s->has_fallocate = false;
1080        } else if (ret != -ENOTSUP) {
1081            return ret;
1082        } else {
1083            s->has_discard = false;
1084        }
1085    }
1086#endif
1087
1088#ifdef CONFIG_FALLOCATE
1089    if (s->has_fallocate && aiocb->aio_offset >= bdrv_getlength(aiocb->bs)) {
1090        int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1091        if (ret == 0 || ret != -ENOTSUP) {
1092            return ret;
1093        }
1094        s->has_fallocate = false;
1095    }
1096#endif
1097
1098    return -ENOTSUP;
1099}
1100
1101static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
1102{
1103    int ret = -EOPNOTSUPP;
1104    BDRVRawState *s = aiocb->bs->opaque;
1105
1106    if (!s->has_discard) {
1107        return -ENOTSUP;
1108    }
1109
1110    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1111#ifdef BLKDISCARD
1112        do {
1113            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
1114            if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
1115                return 0;
1116            }
1117        } while (errno == EINTR);
1118
1119        ret = -errno;
1120#endif
1121    } else {
1122#ifdef CONFIG_XFS
1123        if (s->is_xfs) {
1124            return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
1125        }
1126#endif
1127
1128#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1129        ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1130                           aiocb->aio_offset, aiocb->aio_nbytes);
1131#endif
1132    }
1133
1134    ret = translate_err(ret);
1135    if (ret == -ENOTSUP) {
1136        s->has_discard = false;
1137    }
1138    return ret;
1139}
1140
1141static int aio_worker(void *arg)
1142{
1143    RawPosixAIOData *aiocb = arg;
1144    ssize_t ret = 0;
1145
1146    switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
1147    case QEMU_AIO_READ:
1148        ret = handle_aiocb_rw(aiocb);
1149        if (ret >= 0 && ret < aiocb->aio_nbytes) {
1150            iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret,
1151                      0, aiocb->aio_nbytes - ret);
1152
1153            ret = aiocb->aio_nbytes;
1154        }
1155        if (ret == aiocb->aio_nbytes) {
1156            ret = 0;
1157        } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
1158            ret = -EINVAL;
1159        }
1160        break;
1161    case QEMU_AIO_WRITE:
1162        ret = handle_aiocb_rw(aiocb);
1163        if (ret == aiocb->aio_nbytes) {
1164            ret = 0;
1165        } else if (ret >= 0 && ret < aiocb->aio_nbytes) {
1166            ret = -EINVAL;
1167        }
1168        break;
1169    case QEMU_AIO_FLUSH:
1170        ret = handle_aiocb_flush(aiocb);
1171        break;
1172    case QEMU_AIO_IOCTL:
1173        ret = handle_aiocb_ioctl(aiocb);
1174        break;
1175    case QEMU_AIO_DISCARD:
1176        ret = handle_aiocb_discard(aiocb);
1177        break;
1178    case QEMU_AIO_WRITE_ZEROES:
1179        ret = handle_aiocb_write_zeroes(aiocb);
1180        break;
1181    default:
1182        fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
1183        ret = -EINVAL;
1184        break;
1185    }
1186
1187    g_free(aiocb);
1188    return ret;
1189}
1190
1191static int paio_submit_co(BlockDriverState *bs, int fd,
1192                          int64_t offset, QEMUIOVector *qiov,
1193                          int count, int type)
1194{
1195    RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
1196    ThreadPool *pool;
1197
1198    acb->bs = bs;
1199    acb->aio_type = type;
1200    acb->aio_fildes = fd;
1201
1202    acb->aio_nbytes = count;
1203    acb->aio_offset = offset;
1204
1205    if (qiov) {
1206        acb->aio_iov = qiov->iov;
1207        acb->aio_niov = qiov->niov;
1208        assert(qiov->size == count);
1209    }
1210
1211    trace_paio_submit_co(offset, count, type);
1212    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1213    return thread_pool_submit_co(pool, aio_worker, acb);
1214}
1215
1216static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
1217        int64_t offset, QEMUIOVector *qiov, int count,
1218        BlockCompletionFunc *cb, void *opaque, int type)
1219{
1220    RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
1221    ThreadPool *pool;
1222
1223    acb->bs = bs;
1224    acb->aio_type = type;
1225    acb->aio_fildes = fd;
1226
1227    acb->aio_nbytes = count;
1228    acb->aio_offset = offset;
1229
1230    if (qiov) {
1231        acb->aio_iov = qiov->iov;
1232        acb->aio_niov = qiov->niov;
1233        assert(qiov->size == acb->aio_nbytes);
1234    }
1235
1236    trace_paio_submit(acb, opaque, offset, count, type);
1237    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
1238    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
1239}
1240
1241static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
1242                                   uint64_t bytes, QEMUIOVector *qiov, int type)
1243{
1244    BDRVRawState *s = bs->opaque;
1245
1246    if (fd_open(bs) < 0)
1247        return -EIO;
1248
1249    /*
1250     * Check if the underlying device requires requests to be aligned,
1251     * and if the request we are trying to submit is aligned or not.
1252     * If this is the case tell the low-level driver that it needs
1253     * to copy the buffer.
1254     */
1255    if (s->needs_alignment) {
1256        if (!bdrv_qiov_is_aligned(bs, qiov)) {
1257            type |= QEMU_AIO_MISALIGNED;
1258#ifdef CONFIG_LINUX_AIO
1259        } else if (bs->open_flags & BDRV_O_NATIVE_AIO) {
1260            LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1261            assert(qiov->size == bytes);
1262            return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
1263#endif
1264        }
1265    }
1266
1267    return paio_submit_co(bs, s->fd, offset, qiov, bytes, type);
1268}
1269
/*
 * Read @bytes bytes at @offset into @qiov by forwarding to raw_co_prw()
 * with QEMU_AIO_READ.  @flags is not used.
 */
static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
                                      uint64_t bytes, QEMUIOVector *qiov,
                                      int flags)
{
    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
}
1276
/*
 * Write @bytes bytes from @qiov at @offset by forwarding to raw_co_prw()
 * with QEMU_AIO_WRITE.  No request flags are accepted: @flags must be 0.
 */
static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
                                       uint64_t bytes, QEMUIOVector *qiov,
                                       int flags)
{
    assert(flags == 0);
    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
}
1284
1285static void raw_aio_plug(BlockDriverState *bs)
1286{
1287#ifdef CONFIG_LINUX_AIO
1288    if (bs->open_flags & BDRV_O_NATIVE_AIO) {
1289        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1290        laio_io_plug(bs, aio);
1291    }
1292#endif
1293}
1294
1295static void raw_aio_unplug(BlockDriverState *bs)
1296{
1297#ifdef CONFIG_LINUX_AIO
1298    if (bs->open_flags & BDRV_O_NATIVE_AIO) {
1299        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
1300        laio_io_unplug(bs, aio);
1301    }
1302#endif
1303}
1304
1305static BlockAIOCB *raw_aio_flush(BlockDriverState *bs,
1306        BlockCompletionFunc *cb, void *opaque)
1307{
1308    BDRVRawState *s = bs->opaque;
1309
1310    if (fd_open(bs) < 0)
1311        return NULL;
1312
1313    return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH);
1314}
1315
1316static void raw_close(BlockDriverState *bs)
1317{
1318    BDRVRawState *s = bs->opaque;
1319
1320    if (s->fd >= 0) {
1321        qemu_close(s->fd);
1322        s->fd = -1;
1323    }
1324}
1325
1326static int raw_truncate(BlockDriverState *bs, int64_t offset)
1327{
1328    BDRVRawState *s = bs->opaque;
1329    struct stat st;
1330
1331    if (fstat(s->fd, &st)) {
1332        return -errno;
1333    }
1334
1335    if (S_ISREG(st.st_mode)) {
1336        if (ftruncate(s->fd, offset) < 0) {
1337            return -errno;
1338        }
1339    } else if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
1340       if (offset > raw_getlength(bs)) {
1341           return -EINVAL;
1342       }
1343    } else {
1344        return -ENOTSUP;
1345    }
1346
1347    return 0;
1348}
1349
1350#ifdef __OpenBSD__
1351static int64_t raw_getlength(BlockDriverState *bs)
1352{
1353    BDRVRawState *s = bs->opaque;
1354    int fd = s->fd;
1355    struct stat st;
1356
1357    if (fstat(fd, &st))
1358        return -errno;
1359    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
1360        struct disklabel dl;
1361
1362        if (ioctl(fd, DIOCGDINFO, &dl))
1363            return -errno;
1364        return (uint64_t)dl.d_secsize *
1365            dl.d_partitions[DISKPART(st.st_rdev)].p_size;
1366    } else
1367        return st.st_size;
1368}
1369#elif defined(__NetBSD__)
1370static int64_t raw_getlength(BlockDriverState *bs)
1371{
1372    BDRVRawState *s = bs->opaque;
1373    int fd = s->fd;
1374    struct stat st;
1375
1376    if (fstat(fd, &st))
1377        return -errno;
1378    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
1379        struct dkwedge_info dkw;
1380
1381        if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
1382            return dkw.dkw_size * 512;
1383        } else {
1384            struct disklabel dl;
1385
1386            if (ioctl(fd, DIOCGDINFO, &dl))
1387                return -errno;
1388            return (uint64_t)dl.d_secsize *
1389                dl.d_partitions[DISKPART(st.st_rdev)].p_size;
1390        }
1391    } else
1392        return st.st_size;
1393}
1394#elif defined(__sun__)
1395static int64_t raw_getlength(BlockDriverState *bs)
1396{
1397    BDRVRawState *s = bs->opaque;
1398    struct dk_minfo minfo;
1399    int ret;
1400    int64_t size;
1401
1402    ret = fd_open(bs);
1403    if (ret < 0) {
1404        return ret;
1405    }
1406
1407    /*
1408     * Use the DKIOCGMEDIAINFO ioctl to read the size.
1409     */
1410    ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
1411    if (ret != -1) {
1412        return minfo.dki_lbsize * minfo.dki_capacity;
1413    }
1414
1415    /*
1416     * There are reports that lseek on some devices fails, but
1417     * irc discussion said that contingency on contingency was overkill.
1418     */
1419    size = lseek(s->fd, 0, SEEK_END);
1420    if (size < 0) {
1421        return -errno;
1422    }
1423    return size;
1424}
1425#elif defined(CONFIG_BSD)
/*
 * Generic BSD variant (the body also has macOS and FreeBSD specific
 * sections): return the image length in bytes, or a negative errno.
 *
 * If fstat() succeeds and the S_IFCHR bit is set in st_mode, the size is
 * obtained from whichever device ioctl the platform defines, with
 * lseek(SEEK_END) as the fallback.  In every other case the size comes
 * directly from lseek(SEEK_END).
 */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    /*
     * NOTE(review): the descriptor is copied before fd_open() and is not
     * refreshed after cdrom_reopen() below - confirm that neither of them
     * can replace s->fd.
     */
    int fd = s->fd;
    int64_t size;
    struct stat sb;
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
    int reopened = 0;   /* set once cdrom_reopen() was tried; limits the retry */
#endif
    int ret;

    ret = fd_open(bs);
    if (ret < 0)
        return ret;

#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
again:
#endif
    /*
     * NOTE(review): this tests the S_IFCHR bit instead of using S_ISCHR(),
     * so it may match more file types than character devices - confirm
     * that this is intended.
     */
    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
        /*
         * The "if" lines in this #if chain have no body of their own: the
         * statement selected by the next #if block (the lseek() fallback,
         * or the macOS block) becomes their body.  It therefore only runs
         * when the ioctl failed or reported a size of 0.
         */
#ifdef DIOCGMEDIASIZE
        if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
#elif defined(DIOCGPART)
        {
                struct partinfo pi;
                if (ioctl(fd, DIOCGPART, &pi) == 0)
                        size = pi.media_size;
                else
                        size = 0;
        }
        if (size == 0)
#endif
#if defined(__APPLE__) && defined(__MACH__)
        {
            uint64_t sectors = 0;
            uint32_t sector_size = 0;

            /* macOS: size = block count * block size, else lseek() */
            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
                size = sectors * sector_size;
            } else {
                size = lseek(fd, 0LL, SEEK_END);
                if (size < 0) {
                    return -errno;
                }
            }
        }
#else
        size = lseek(fd, 0LL, SEEK_END);
        if (size < 0) {
            return -errno;
        }
#endif
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
        switch(s->type) {
        case FTYPE_CD:
            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
            if (size == 2048LL * (unsigned)-1)
                size = 0;
            /* XXX no disc?  maybe we need to reopen... */
            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
                reopened = 1;
                goto again;
            }
        }
#endif
    } else {
        size = lseek(fd, 0, SEEK_END);
        if (size < 0) {
            return -errno;
        }
    }
    return size;
}
1499#else
1500static int64_t raw_getlength(BlockDriverState *bs)
1501{
1502    BDRVRawState *s = bs->opaque;
1503    int ret;
1504    int64_t size;
1505
1506    ret = fd_open(bs);
1507    if (ret < 0) {
1508        return ret;
1509    }
1510
1511    size = lseek(s->fd, 0, SEEK_END);
1512    if (size < 0) {
1513        return -errno;
1514    }
1515    return size;
1516}
1517#endif
1518
1519static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
1520{
1521    struct stat st;
1522    BDRVRawState *s = bs->opaque;
1523
1524    if (fstat(s->fd, &st) < 0) {
1525        return -errno;
1526    }
1527    return (int64_t)st.st_blocks * 512;
1528}
1529
1530static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
1531{
1532    int fd;
1533    int result = 0;
1534    int64_t total_size = 0;
1535    bool nocow = false;
1536    PreallocMode prealloc;
1537    char *buf = NULL;
1538    Error *local_err = NULL;
1539
1540    strstart(filename, "file:", &filename);
1541
1542    /* Read out options */
1543    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
1544                          BDRV_SECTOR_SIZE);
1545    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
1546    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
1547    prealloc = qapi_enum_parse(PreallocMode_lookup, buf,
1548                               PREALLOC_MODE__MAX, PREALLOC_MODE_OFF,
1549                               &local_err);
1550    g_free(buf);
1551    if (local_err) {
1552        error_propagate(errp, local_err);
1553        result = -EINVAL;
1554        goto out;
1555    }
1556
1557    fd = qemu_open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY,
1558                   0644);
1559    if (fd < 0) {
1560        result = -errno;
1561        error_setg_errno(errp, -result, "Could not create file");
1562        goto out;
1563    }
1564
1565    if (nocow) {
1566#ifdef __linux__
1567        /* Set NOCOW flag to solve performance issue on fs like btrfs.
1568         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
1569         * will be ignored since any failure of this operation should not
1570         * block the left work.
1571         */
1572        int attr;
1573        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
1574            attr |= FS_NOCOW_FL;
1575            ioctl(fd, FS_IOC_SETFLAGS, &attr);
1576        }
1577#endif
1578    }
1579
1580    if (ftruncate(fd, total_size) != 0) {
1581        result = -errno;
1582        error_setg_errno(errp, -result, "Could not resize file");
1583        goto out_close;
1584    }
1585
1586    switch (prealloc) {
1587#ifdef CONFIG_POSIX_FALLOCATE
1588    case PREALLOC_MODE_FALLOC:
1589        /* posix_fallocate() doesn't set errno. */
1590        result = -posix_fallocate(fd, 0, total_size);
1591        if (result != 0) {
1592            error_setg_errno(errp, -result,
1593                             "Could not preallocate data for the new file");
1594        }
1595        break;
1596#endif
1597    case PREALLOC_MODE_FULL:
1598    {
1599        int64_t num = 0, left = total_size;
1600        buf = g_malloc0(65536);
1601
1602        while (left > 0) {
1603            num = MIN(left, 65536);
1604            result = write(fd, buf, num);
1605            if (result < 0) {
1606                result = -errno;
1607                error_setg_errno(errp, -result,
1608                                 "Could not write to the new file");
1609                break;
1610            }
1611            left -= result;
1612        }
1613        if (result >= 0) {
1614            result = fsync(fd);
1615            if (result < 0) {
1616                result = -errno;
1617                error_setg_errno(errp, -result,
1618                                 "Could not flush new file to disk");
1619            }
1620        }
1621        g_free(buf);
1622        break;
1623    }
1624    case PREALLOC_MODE_OFF:
1625        break;
1626    default:
1627        result = -EINVAL;
1628        error_setg(errp, "Unsupported preallocation mode: %s",
1629                   PreallocMode_lookup[prealloc]);
1630        break;
1631    }
1632
1633out_close:
1634    if (qemu_close(fd) != 0 && result == 0) {
1635        result = -errno;
1636        error_setg_errno(errp, -result, "Could not close the new file");
1637    }
1638out:
1639    return result;
1640}
1641
1642/*
1643 * Find allocation range in @bs around offset @start.
1644 * May change underlying file descriptor's file offset.
1645 * If @start is not in a hole, store @start in @data, and the
1646 * beginning of the next hole in @hole, and return 0.
1647 * If @start is in a non-trailing hole, store @start in @hole and the
1648 * beginning of the next non-hole in @data, and return 0.
1649 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
1650 * If we can't find out, return a negative errno other than -ENXIO.
1651 */
1652static int find_allocation(BlockDriverState *bs, off_t start,
1653                           off_t *data, off_t *hole)
1654{
1655#if defined SEEK_HOLE && defined SEEK_DATA
1656    BDRVRawState *s = bs->opaque;
1657    off_t offs;
1658
1659    /*
1660     * SEEK_DATA cases:
1661     * D1. offs == start: start is in data
1662     * D2. offs > start: start is in a hole, next data at offs
1663     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
1664     *                              or start is beyond EOF
1665     *     If the latter happens, the file has been truncated behind
1666     *     our back since we opened it.  All bets are off then.
1667     *     Treating like a trailing hole is simplest.
1668     * D4. offs < 0, errno != ENXIO: we learned nothing
1669     */
1670    offs = lseek(s->fd, start, SEEK_DATA);
1671    if (offs < 0) {
1672        return -errno;          /* D3 or D4 */
1673    }
1674    assert(offs >= start);
1675
1676    if (offs > start) {
1677        /* D2: in hole, next data at offs */
1678        *hole = start;
1679        *data = offs;
1680        return 0;
1681    }
1682
1683    /* D1: in data, end not yet known */
1684
1685    /*
1686     * SEEK_HOLE cases:
1687     * H1. offs == start: start is in a hole
1688     *     If this happens here, a hole has been dug behind our back
1689     *     since the previous lseek().
1690     * H2. offs > start: either start is in data, next hole at offs,
1691     *                   or start is in trailing hole, EOF at offs
1692     *     Linux treats trailing holes like any other hole: offs ==
1693     *     start.  Solaris seeks to EOF instead: offs > start (blech).
1694     *     If that happens here, a hole has been dug behind our back
1695     *     since the previous lseek().
1696     * H3. offs < 0, errno = ENXIO: start is beyond EOF
1697     *     If this happens, the file has been truncated behind our
1698     *     back since we opened it.  Treat it like a trailing hole.
1699     * H4. offs < 0, errno != ENXIO: we learned nothing
1700     *     Pretend we know nothing at all, i.e. "forget" about D1.
1701     */
1702    offs = lseek(s->fd, start, SEEK_HOLE);
1703    if (offs < 0) {
1704        return -errno;          /* D1 and (H3 or H4) */
1705    }
1706    assert(offs >= start);
1707
1708    if (offs > start) {
1709        /*
1710         * D1 and H2: either in data, next hole at offs, or it was in
1711         * data but is now in a trailing hole.  In the latter case,
1712         * all bets are off.  Treating it as if it there was data all
1713         * the way to EOF is safe, so simply do that.
1714         */
1715        *data = start;
1716        *hole = offs;
1717        return 0;
1718    }
1719
1720    /* D1 and H1 */
1721    return -EBUSY;
1722#else
1723    return -ENOTSUP;
1724#endif
1725}
1726
1727/*
1728 * Returns the allocation status of the specified sectors.
1729 *
1730 * If 'sector_num' is beyond the end of the disk image the return value is 0
1731 * and 'pnum' is set to 0.
1732 *
1733 * 'pnum' is set to the number of sectors (including and immediately following
1734 * the specified sector) that are known to be in the same
1735 * allocated/unallocated state.
1736 *
1737 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1738 * beyond the end of the disk image it will be clamped.
1739 */
1740static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
1741                                                    int64_t sector_num,
1742                                                    int nb_sectors, int *pnum,
1743                                                    BlockDriverState **file)
1744{
1745    off_t start, data = 0, hole = 0;
1746    int64_t total_size;
1747    int ret;
1748
1749    ret = fd_open(bs);
1750    if (ret < 0) {
1751        return ret;
1752    }
1753
1754    start = sector_num * BDRV_SECTOR_SIZE;
1755    total_size = bdrv_getlength(bs);
1756    if (total_size < 0) {
1757        return total_size;
1758    } else if (start >= total_size) {
1759        *pnum = 0;
1760        return 0;
1761    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
1762        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
1763    }
1764
1765    ret = find_allocation(bs, start, &data, &hole);
1766    if (ret == -ENXIO) {
1767        /* Trailing hole */
1768        *pnum = nb_sectors;
1769        ret = BDRV_BLOCK_ZERO;
1770    } else if (ret < 0) {
1771        /* No info available, so pretend there are no holes */
1772        *pnum = nb_sectors;
1773        ret = BDRV_BLOCK_DATA;
1774    } else if (data == start) {
1775        /* On a data extent, compute sectors to the end of the extent,
1776         * possibly including a partial sector at EOF. */
1777        *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
1778        ret = BDRV_BLOCK_DATA;
1779    } else {
1780        /* On a hole, compute sectors to the beginning of the next extent.  */
1781        assert(hole == start);
1782        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
1783        ret = BDRV_BLOCK_ZERO;
1784    }
1785    *file = bs;
1786    return ret | BDRV_BLOCK_OFFSET_VALID | start;
1787}
1788
/*
 * Queue a QEMU_AIO_DISCARD request for @count bytes at @offset on the
 * thread pool; @cb and @opaque are passed on to paio_submit().
 */
static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs,
    int64_t offset, int count,
    BlockCompletionFunc *cb, void *opaque)
{
    BDRVRawState *s = bs->opaque;

    return paio_submit(bs, s->fd, offset, NULL, count,
                       cb, opaque, QEMU_AIO_DISCARD);
}
1798
1799static int coroutine_fn raw_co_pwrite_zeroes(
1800    BlockDriverState *bs, int64_t offset,
1801    int count, BdrvRequestFlags flags)
1802{
1803    BDRVRawState *s = bs->opaque;
1804
1805    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
1806        return paio_submit_co(bs, s->fd, offset, NULL, count,
1807                              QEMU_AIO_WRITE_ZEROES);
1808    } else if (s->discard_zeroes) {
1809        return paio_submit_co(bs, s->fd, offset, NULL, count,
1810                              QEMU_AIO_DISCARD);
1811    }
1812    return -ENOTSUP;
1813}
1814
1815static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1816{
1817    BDRVRawState *s = bs->opaque;
1818
1819    bdi->unallocated_blocks_are_zero = s->discard_zeroes;
1820    bdi->can_write_zeroes_with_unmap = s->discard_zeroes;
1821    return 0;
1822}
1823
/*
 * Image creation options; raw_create() reads the size, nocow and
 * preallocation options listed here.  Referenced from bdrv_file's
 * .create_opts.
 */
static QemuOptsList raw_create_opts = {
    .name = "raw-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_NOCOW,
            .type = QEMU_OPT_BOOL,
            .help = "Turn off copy-on-write (valid only on btrfs)"
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off, falloc, full)"
        },
        { /* end of list */ }
    }
};
1846
/*
 * Driver definition for the "file" protocol: connects the raw_* callbacks
 * to the BlockDriver interface.
 */
BlockDriver bdrv_file = {
    .format_name = "file",
    .protocol_name = "file",
    .instance_size = sizeof(BDRVRawState),
    .bdrv_needs_filename = true,
    .bdrv_probe = NULL, /* no probe for protocols */
    .bdrv_parse_filename = raw_parse_filename,
    .bdrv_file_open = raw_open,
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit = raw_reopen_commit,
    .bdrv_reopen_abort = raw_reopen_abort,
    .bdrv_close = raw_close,
    .bdrv_create = raw_create,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
    .bdrv_co_get_block_status = raw_co_get_block_status,
    .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,

    /* data path: coroutine read/write, callback-based flush/discard */
    .bdrv_co_preadv         = raw_co_preadv,
    .bdrv_co_pwritev        = raw_co_pwritev,
    .bdrv_aio_flush = raw_aio_flush,
    .bdrv_aio_pdiscard = raw_aio_pdiscard,
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,

    /* size handling and image information */
    .bdrv_truncate = raw_truncate,
    .bdrv_getlength = raw_getlength,
    .bdrv_get_info = raw_get_info,
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,

    .create_opts = &raw_create_opts,
};
1880
1881/***********************************************/
1882/* host device */
1883
1884#if defined(__APPLE__) && defined(__MACH__)
1885static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
1886                                CFIndex maxPathSize, int flags);
1887static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
1888{
1889    kern_return_t kernResult = KERN_FAILURE;
1890    mach_port_t     masterPort;
1891    CFMutableDictionaryRef  classesToMatch;
1892    const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
1893    char *mediaType = NULL;
1894
1895    kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
1896    if ( KERN_SUCCESS != kernResult ) {
1897        printf( "IOMasterPort returned %d\n", kernResult );
1898    }
1899
1900    int index;
1901    for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
1902        classesToMatch = IOServiceMatching(matching_array[index]);
1903        if (classesToMatch == NULL) {
1904            error_report("IOServiceMatching returned NULL for %s",
1905                         matching_array[index]);
1906            continue;
1907        }
1908        CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
1909                             kCFBooleanTrue);
1910        kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
1911                                                  mediaIterator);
1912        if (kernResult != KERN_SUCCESS) {
1913            error_report("Note: IOServiceGetMatchingServices returned %d",
1914                         kernResult);
1915            continue;
1916        }
1917
1918        /* If a match was found, leave the loop */
1919        if (*mediaIterator != 0) {
1920            DPRINTF("Matching using %s\n", matching_array[index]);
1921            mediaType = g_strdup(matching_array[index]);
1922            break;
1923        }
1924    }
1925    return mediaType;
1926}
1927
1928kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
1929                         CFIndex maxPathSize, int flags)
1930{
1931    io_object_t     nextMedia;
1932    kern_return_t   kernResult = KERN_FAILURE;
1933    *bsdPath = '\0';
1934    nextMedia = IOIteratorNext( mediaIterator );
1935    if ( nextMedia )
1936    {
1937        CFTypeRef   bsdPathAsCFString;
1938    bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
1939        if ( bsdPathAsCFString ) {
1940            size_t devPathLength;
1941            strcpy( bsdPath, _PATH_DEV );
1942            if (flags & BDRV_O_NOCACHE) {
1943                strcat(bsdPath, "r");
1944            }
1945            devPathLength = strlen( bsdPath );
1946            if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
1947                kernResult = KERN_SUCCESS;
1948            }
1949            CFRelease( bsdPathAsCFString );
1950        }
1951        IOObjectRelease( nextMedia );
1952    }
1953
1954    return kernResult;
1955}
1956
1957/* Sets up a real cdrom for use in QEMU */
1958static bool setup_cdrom(char *bsd_path, Error **errp)
1959{
1960    int index, num_of_test_partitions = 2, fd;
1961    char test_partition[MAXPATHLEN];
1962    bool partition_found = false;
1963
1964    /* look for a working partition */
1965    for (index = 0; index < num_of_test_partitions; index++) {
1966        snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
1967                 index);
1968        fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE);
1969        if (fd >= 0) {
1970            partition_found = true;
1971            qemu_close(fd);
1972            break;
1973        }
1974    }
1975
1976    /* if a working partition on the device was not found */
1977    if (partition_found == false) {
1978        error_setg(errp, "Failed to find a working partition on disc");
1979    } else {
1980        DPRINTF("Using %s as optical disc\n", test_partition);
1981        pstrcpy(bsd_path, MAXPATHLEN, test_partition);
1982    }
1983    return partition_found;
1984}
1985
/*
 * Report, via error_report(), that device @file_name has to be unmounted
 * before QEMU can use it, together with the diskutil commands for
 * unmounting and mounting it.
 */
static void print_unmounting_directions(const char *file_name)
{
    error_report("If device %s is mounted on the desktop, unmount"
                 " it first before using it in QEMU", file_name);
    error_report("Command to unmount device: diskutil unmountDisk %s",
                 file_name);
    error_report("Command to mount device: diskutil mountDisk %s", file_name);
}
1995
1996#endif /* defined(__APPLE__) && defined(__MACH__) */
1997
/*
 * Score how well this driver matches @filename.
 *
 * Returns 50 for names starting with "/dev/cdrom" (so that a dedicated
 * CD-ROM driver can win with a higher score), 100 for an existing character
 * or block device, and 0 for anything else.
 */
static int hdev_probe_device(const char *filename)
{
    struct stat st;

    /* allow a dedicated CD-ROM driver to match with a higher priority */
    if (strstart(filename, "/dev/cdrom", NULL)) {
        return 50;
    }

    if (stat(filename, &st) < 0) {
        return 0;
    }

    return (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) ? 100 : 0;
}
2013
2014static int check_hdev_writable(BDRVRawState *s)
2015{
2016#if defined(BLKROGET)
2017    /* Linux block devices can be configured "read-only" using blockdev(8).
2018     * This is independent of device node permissions and therefore open(2)
2019     * with O_RDWR succeeds.  Actual writes fail with EPERM.
2020     *
2021     * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
2022     * check for read-only block devices so that Linux block devices behave
2023     * properly.
2024     */
2025    struct stat st;
2026    int readonly = 0;
2027
2028    if (fstat(s->fd, &st)) {
2029        return -errno;
2030    }
2031
2032    if (!S_ISBLK(st.st_mode)) {
2033        return 0;
2034    }
2035
2036    if (ioctl(s->fd, BLKROGET, &readonly) < 0) {
2037        return -errno;
2038    }
2039
2040    if (readonly) {
2041        return -EACCES;
2042    }
2043#endif /* defined(BLKROGET) */
2044    return 0;
2045}
2046
2047static void hdev_parse_filename(const char *filename, QDict *options,
2048                                Error **errp)
2049{
2050    /* The prefix is optional, just as for "file". */
2051    strstart(filename, "host_device:", &filename);
2052
2053    qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename)));
2054}
2055
2056static bool hdev_is_sg(BlockDriverState *bs)
2057{
2058
2059#if defined(__linux__)
2060
2061    struct stat st;
2062    struct sg_scsi_id scsiid;
2063    int sg_version;
2064
2065    if (stat(bs->filename, &st) >= 0 && S_ISCHR(st.st_mode) &&
2066        !bdrv_ioctl(bs, SG_GET_VERSION_NUM, &sg_version) &&
2067        !bdrv_ioctl(bs, SG_GET_SCSI_ID, &scsiid)) {
2068        DPRINTF("SG device found: type=%d, version=%d\n",
2069            scsiid.scsi_type, sg_version);
2070        return true;
2071    }
2072
2073#endif
2074
2075    return false;
2076}
2077
2078static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
2079                     Error **errp)
2080{
2081    BDRVRawState *s = bs->opaque;
2082    Error *local_err = NULL;
2083    int ret;
2084
2085#if defined(__APPLE__) && defined(__MACH__)
2086    const char *filename = qdict_get_str(options, "filename");
2087    char bsd_path[MAXPATHLEN] = "";
2088    bool error_occurred = false;
2089
2090    /* If using a real cdrom */
2091    if (strcmp(filename, "/dev/cdrom") == 0) {
2092        char *mediaType = NULL;
2093        kern_return_t ret_val;
2094        io_iterator_t mediaIterator = 0;
2095
2096        mediaType = FindEjectableOpticalMedia(&mediaIterator);
2097        if (mediaType == NULL) {
2098            error_setg(errp, "Please make sure your CD/DVD is in the optical"
2099                       " drive");
2100            error_occurred = true;
2101            goto hdev_open_Mac_error;
2102        }
2103
2104        ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
2105        if (ret_val != KERN_SUCCESS) {
2106            error_setg(errp, "Could not get BSD path for optical drive");
2107            error_occurred = true;
2108            goto hdev_open_Mac_error;
2109        }
2110
2111        /* If a real optical drive was not found */
2112        if (bsd_path[0] == '\0') {
2113            error_setg(errp, "Failed to obtain bsd path for optical drive");
2114            error_occurred = true;
2115            goto hdev_open_Mac_error;
2116        }
2117
2118        /* If using a cdrom disc and finding a partition on the disc failed */
2119        if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
2120            setup_cdrom(bsd_path, errp) == false) {
2121            print_unmounting_directions(bsd_path);
2122            error_occurred = true;
2123            goto hdev_open_Mac_error;
2124        }
2125
2126        qdict_put(options, "filename", qstring_from_str(bsd_path));
2127
2128hdev_open_Mac_error:
2129        g_free(mediaType);
2130        if (mediaIterator) {
2131            IOObjectRelease(mediaIterator);
2132        }
2133        if (error_occurred) {
2134            return -ENOENT;
2135        }
2136    }
2137#endif /* defined(__APPLE__) && defined(__MACH__) */
2138
2139    s->type = FTYPE_FILE;
2140
2141    ret = raw_open_common(bs, options, flags, 0, &local_err);
2142    if (ret < 0) {
2143        error_propagate(errp, local_err);
2144#if defined(__APPLE__) && defined(__MACH__)
2145        if (*bsd_path) {
2146            filename = bsd_path;
2147        }
2148        /* if a physical device experienced an error while being opened */
2149        if (strncmp(filename, "/dev/", 5) == 0) {
2150            print_unmounting_directions(filename);
2151        }
2152#endif /* defined(__APPLE__) && defined(__MACH__) */
2153        return ret;
2154    }
2155
2156    /* Since this does ioctl the device must be already opened */
2157    bs->sg = hdev_is_sg(bs);
2158
2159    if (flags & BDRV_O_RDWR) {
2160        ret = check_hdev_writable(s);
2161        if (ret < 0) {
2162            raw_close(bs);
2163            error_setg_errno(errp, -ret, "The device is not writable");
2164            return ret;
2165        }
2166    }
2167
2168    return ret;
2169}
2170
2171#if defined(__linux__)
2172
2173static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
2174        unsigned long int req, void *buf,
2175        BlockCompletionFunc *cb, void *opaque)
2176{
2177    BDRVRawState *s = bs->opaque;
2178    RawPosixAIOData *acb;
2179    ThreadPool *pool;
2180
2181    if (fd_open(bs) < 0)
2182        return NULL;
2183
2184    acb = g_new(RawPosixAIOData, 1);
2185    acb->bs = bs;
2186    acb->aio_type = QEMU_AIO_IOCTL;
2187    acb->aio_fildes = s->fd;
2188    acb->aio_offset = 0;
2189    acb->aio_ioctl_buf = buf;
2190    acb->aio_ioctl_cmd = req;
2191    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
2192    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
2193}
2194#endif /* linux */
2195
2196static int fd_open(BlockDriverState *bs)
2197{
2198    BDRVRawState *s = bs->opaque;
2199
2200    /* this is just to ensure s->fd is sane (its called by io ops) */
2201    if (s->fd >= 0)
2202        return 0;
2203    return -EIO;
2204}
2205
2206static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs,
2207    int64_t offset, int count,
2208    BlockCompletionFunc *cb, void *opaque)
2209{
2210    BDRVRawState *s = bs->opaque;
2211
2212    if (fd_open(bs) < 0) {
2213        return NULL;
2214    }
2215    return paio_submit(bs, s->fd, offset, NULL, count,
2216                       cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
2217}
2218
2219static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
2220    int64_t offset, int count, BdrvRequestFlags flags)
2221{
2222    BDRVRawState *s = bs->opaque;
2223    int rc;
2224
2225    rc = fd_open(bs);
2226    if (rc < 0) {
2227        return rc;
2228    }
2229    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
2230        return paio_submit_co(bs, s->fd, offset, NULL, count,
2231                              QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV);
2232    } else if (s->discard_zeroes) {
2233        return paio_submit_co(bs, s->fd, offset, NULL, count,
2234                              QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
2235    }
2236    return -ENOTSUP;
2237}
2238
2239static int hdev_create(const char *filename, QemuOpts *opts,
2240                       Error **errp)
2241{
2242    int fd;
2243    int ret = 0;
2244    struct stat stat_buf;
2245    int64_t total_size = 0;
2246    bool has_prefix;
2247
2248    /* This function is used by both protocol block drivers and therefore either
2249     * of these prefixes may be given.
2250     * The return value has to be stored somewhere, otherwise this is an error
2251     * due to -Werror=unused-value. */
2252    has_prefix =
2253        strstart(filename, "host_device:", &filename) ||
2254        strstart(filename, "host_cdrom:" , &filename);
2255
2256    (void)has_prefix;
2257
2258    ret = raw_normalize_devicepath(&filename);
2259    if (ret < 0) {
2260        error_setg_errno(errp, -ret, "Could not normalize device path");
2261        return ret;
2262    }
2263
2264    /* Read out options */
2265    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2266                          BDRV_SECTOR_SIZE);
2267
2268    fd = qemu_open(filename, O_WRONLY | O_BINARY);
2269    if (fd < 0) {
2270        ret = -errno;
2271        error_setg_errno(errp, -ret, "Could not open device");
2272        return ret;
2273    }
2274
2275    if (fstat(fd, &stat_buf) < 0) {
2276        ret = -errno;
2277        error_setg_errno(errp, -ret, "Could not stat device");
2278    } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) {
2279        error_setg(errp,
2280                   "The given file is neither a block nor a character device");
2281        ret = -ENODEV;
2282    } else if (lseek(fd, 0, SEEK_END) < total_size) {
2283        error_setg(errp, "Device is too small");
2284        ret = -ENOSPC;
2285    }
2286
2287    qemu_close(fd);
2288    return ret;
2289}
2290
2291static BlockDriver bdrv_host_device = {
2292    .format_name        = "host_device",
2293    .protocol_name        = "host_device",
2294    .instance_size      = sizeof(BDRVRawState),
2295    .bdrv_needs_filename = true,
2296    .bdrv_probe_device  = hdev_probe_device,
2297    .bdrv_parse_filename = hdev_parse_filename,
2298    .bdrv_file_open     = hdev_open,
2299    .bdrv_close         = raw_close,
2300    .bdrv_reopen_prepare = raw_reopen_prepare,
2301    .bdrv_reopen_commit  = raw_reopen_commit,
2302    .bdrv_reopen_abort   = raw_reopen_abort,
2303    .bdrv_create         = hdev_create,
2304    .create_opts         = &raw_create_opts,
2305    .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
2306
2307    .bdrv_co_preadv         = raw_co_preadv,
2308    .bdrv_co_pwritev        = raw_co_pwritev,
2309    .bdrv_aio_flush     = raw_aio_flush,
2310    .bdrv_aio_pdiscard   = hdev_aio_pdiscard,
2311    .bdrv_refresh_limits = raw_refresh_limits,
2312    .bdrv_io_plug = raw_aio_plug,
2313    .bdrv_io_unplug = raw_aio_unplug,
2314
2315    .bdrv_truncate      = raw_truncate,
2316    .bdrv_getlength     = raw_getlength,
2317    .bdrv_get_info = raw_get_info,
2318    .bdrv_get_allocated_file_size
2319                        = raw_get_allocated_file_size,
2320    .bdrv_probe_blocksizes = hdev_probe_blocksizes,
2321    .bdrv_probe_geometry = hdev_probe_geometry,
2322
2323    /* generic scsi device */
2324#ifdef __linux__
2325    .bdrv_aio_ioctl     = hdev_aio_ioctl,
2326#endif
2327};
2328
2329#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2330static void cdrom_parse_filename(const char *filename, QDict *options,
2331                                 Error **errp)
2332{
2333    /* The prefix is optional, just as for "file". */
2334    strstart(filename, "host_cdrom:", &filename);
2335
2336    qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename)));
2337}
2338#endif
2339
2340#ifdef __linux__
2341static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
2342                      Error **errp)
2343{
2344    BDRVRawState *s = bs->opaque;
2345
2346    s->type = FTYPE_CD;
2347
2348    /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
2349    return raw_open_common(bs, options, flags, O_NONBLOCK, errp);
2350}
2351
2352static int cdrom_probe_device(const char *filename)
2353{
2354    int fd, ret;
2355    int prio = 0;
2356    struct stat st;
2357
2358    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
2359    if (fd < 0) {
2360        goto out;
2361    }
2362    ret = fstat(fd, &st);
2363    if (ret == -1 || !S_ISBLK(st.st_mode)) {
2364        goto outc;
2365    }
2366
2367    /* Attempt to detect via a CDROM specific ioctl */
2368    ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
2369    if (ret >= 0)
2370        prio = 100;
2371
2372outc:
2373    qemu_close(fd);
2374out:
2375    return prio;
2376}
2377
2378static bool cdrom_is_inserted(BlockDriverState *bs)
2379{
2380    BDRVRawState *s = bs->opaque;
2381    int ret;
2382
2383    ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
2384    return ret == CDS_DISC_OK;
2385}
2386
2387static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
2388{
2389    BDRVRawState *s = bs->opaque;
2390
2391    if (eject_flag) {
2392        if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
2393            perror("CDROMEJECT");
2394    } else {
2395        if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
2396            perror("CDROMEJECT");
2397    }
2398}
2399
2400static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
2401{
2402    BDRVRawState *s = bs->opaque;
2403
2404    if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
2405        /*
2406         * Note: an error can happen if the distribution automatically
2407         * mounts the CD-ROM
2408         */
2409        /* perror("CDROM_LOCKDOOR"); */
2410    }
2411}
2412
2413static BlockDriver bdrv_host_cdrom = {
2414    .format_name        = "host_cdrom",
2415    .protocol_name      = "host_cdrom",
2416    .instance_size      = sizeof(BDRVRawState),
2417    .bdrv_needs_filename = true,
2418    .bdrv_probe_device  = cdrom_probe_device,
2419    .bdrv_parse_filename = cdrom_parse_filename,
2420    .bdrv_file_open     = cdrom_open,
2421    .bdrv_close         = raw_close,
2422    .bdrv_reopen_prepare = raw_reopen_prepare,
2423    .bdrv_reopen_commit  = raw_reopen_commit,
2424    .bdrv_reopen_abort   = raw_reopen_abort,
2425    .bdrv_create         = hdev_create,
2426    .create_opts         = &raw_create_opts,
2427
2428
2429    .bdrv_co_preadv         = raw_co_preadv,
2430    .bdrv_co_pwritev        = raw_co_pwritev,
2431    .bdrv_aio_flush     = raw_aio_flush,
2432    .bdrv_refresh_limits = raw_refresh_limits,
2433    .bdrv_io_plug = raw_aio_plug,
2434    .bdrv_io_unplug = raw_aio_unplug,
2435
2436    .bdrv_truncate      = raw_truncate,
2437    .bdrv_getlength      = raw_getlength,
2438    .has_variable_length = true,
2439    .bdrv_get_allocated_file_size
2440                        = raw_get_allocated_file_size,
2441
2442    /* removable device support */
2443    .bdrv_is_inserted   = cdrom_is_inserted,
2444    .bdrv_eject         = cdrom_eject,
2445    .bdrv_lock_medium   = cdrom_lock_medium,
2446
2447    /* generic scsi device */
2448    .bdrv_aio_ioctl     = hdev_aio_ioctl,
2449};
2450#endif /* __linux__ */
2451
2452#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2453static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
2454                      Error **errp)
2455{
2456    BDRVRawState *s = bs->opaque;
2457    Error *local_err = NULL;
2458    int ret;
2459
2460    s->type = FTYPE_CD;
2461
2462    ret = raw_open_common(bs, options, flags, 0, &local_err);
2463    if (ret) {
2464        error_propagate(errp, local_err);
2465        return ret;
2466    }
2467
2468    /* make sure the door isn't locked at this time */
2469    ioctl(s->fd, CDIOCALLOW);
2470    return 0;
2471}
2472
/*
 * Score how well the FreeBSD host_cdrom driver matches @filename, judged
 * by name only: 100 for paths starting with "/dev/cd" or "/dev/acd",
 * 0 otherwise.
 */
static int cdrom_probe_device(const char *filename)
{
    static const char *const prefixes[] = { "/dev/cd", "/dev/acd" };
    size_t i;

    for (i = 0; i < sizeof(prefixes) / sizeof(prefixes[0]); i++) {
        if (strstart(filename, prefixes[i], NULL)) {
            return 100;
        }
    }
    return 0;
}
2480
2481static int cdrom_reopen(BlockDriverState *bs)
2482{
2483    BDRVRawState *s = bs->opaque;
2484    int fd;
2485
2486    /*
2487     * Force reread of possibly changed/newly loaded disc,
2488     * FreeBSD seems to not notice sometimes...
2489     */
2490    if (s->fd >= 0)
2491        qemu_close(s->fd);
2492    fd = qemu_open(bs->filename, s->open_flags, 0644);
2493    if (fd < 0) {
2494        s->fd = -1;
2495        return -EIO;
2496    }
2497    s->fd = fd;
2498
2499    /* make sure the door isn't locked at this time */
2500    ioctl(s->fd, CDIOCALLOW);
2501    return 0;
2502}
2503
2504static bool cdrom_is_inserted(BlockDriverState *bs)
2505{
2506    return raw_getlength(bs) > 0;
2507}
2508
2509static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
2510{
2511    BDRVRawState *s = bs->opaque;
2512
2513    if (s->fd < 0)
2514        return;
2515
2516    (void) ioctl(s->fd, CDIOCALLOW);
2517
2518    if (eject_flag) {
2519        if (ioctl(s->fd, CDIOCEJECT) < 0)
2520            perror("CDIOCEJECT");
2521    } else {
2522        if (ioctl(s->fd, CDIOCCLOSE) < 0)
2523            perror("CDIOCCLOSE");
2524    }
2525
2526    cdrom_reopen(bs);
2527}
2528
2529static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
2530{
2531    BDRVRawState *s = bs->opaque;
2532
2533    if (s->fd < 0)
2534        return;
2535    if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
2536        /*
2537         * Note: an error can happen if the distribution automatically
2538         * mounts the CD-ROM
2539         */
2540        /* perror("CDROM_LOCKDOOR"); */
2541    }
2542}
2543
2544static BlockDriver bdrv_host_cdrom = {
2545    .format_name        = "host_cdrom",
2546    .protocol_name      = "host_cdrom",
2547    .instance_size      = sizeof(BDRVRawState),
2548    .bdrv_needs_filename = true,
2549    .bdrv_probe_device  = cdrom_probe_device,
2550    .bdrv_parse_filename = cdrom_parse_filename,
2551    .bdrv_file_open     = cdrom_open,
2552    .bdrv_close         = raw_close,
2553    .bdrv_reopen_prepare = raw_reopen_prepare,
2554    .bdrv_reopen_commit  = raw_reopen_commit,
2555    .bdrv_reopen_abort   = raw_reopen_abort,
2556    .bdrv_create        = hdev_create,
2557    .create_opts        = &raw_create_opts,
2558
2559    .bdrv_co_preadv         = raw_co_preadv,
2560    .bdrv_co_pwritev        = raw_co_pwritev,
2561    .bdrv_aio_flush     = raw_aio_flush,
2562    .bdrv_refresh_limits = raw_refresh_limits,
2563    .bdrv_io_plug = raw_aio_plug,
2564    .bdrv_io_unplug = raw_aio_unplug,
2565
2566    .bdrv_truncate      = raw_truncate,
2567    .bdrv_getlength      = raw_getlength,
2568    .has_variable_length = true,
2569    .bdrv_get_allocated_file_size
2570                        = raw_get_allocated_file_size,
2571
2572    /* removable device support */
2573    .bdrv_is_inserted   = cdrom_is_inserted,
2574    .bdrv_eject         = cdrom_eject,
2575    .bdrv_lock_medium   = cdrom_lock_medium,
2576};
2577#endif /* __FreeBSD__ */
2578
2579static void bdrv_file_init(void)
2580{
2581    /*
2582     * Register all the drivers.  Note that order is important, the driver
2583     * registered last will get probed first.
2584     */
2585    bdrv_register(&bdrv_file);
2586    bdrv_register(&bdrv_host_device);
2587#ifdef __linux__
2588    bdrv_register(&bdrv_host_cdrom);
2589#endif
2590#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2591    bdrv_register(&bdrv_host_cdrom);
2592#endif
2593}
2594
2595block_init(bdrv_file_init);
2596